From 35995a4d815586bc968a857f7235707940a2f755 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:25 +0300 Subject: SLUB: Replace __builtin_return_address(0) with _RET_IP_. This patch replaces __builtin_return_address(0) with _RET_IP_, since a previous patch moved _RET_IP_ and _THIS_IP_ to include/linux/kernel.h and they're widely available now. This makes for shorter and easier to read code. [penberg@cs.helsinki.fi: remove _RET_IP_ casts to void pointer] Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slab.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 000da12b5cf0..c97ed28559ec 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -253,9 +253,9 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep, * request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_track_caller(size_t, gfp_t, void*); +extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, __builtin_return_address(0)) + __kmalloc_track_caller(size, flags, _RET_IP_) #else #define kmalloc_track_caller(size, flags) \ __kmalloc(size, flags) @@ -271,10 +271,10 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*); * allocation request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *); +extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ - __builtin_return_address(0)) + _RET_IP_) #else #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node(size, flags, node) -- cgit v1.2.3-71-gd317 From b9ce08c01020eb28bfbfa6faf1c740281c5f418e Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:03 +0300 Subject: kmemtrace: Core implementation. kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected data is then fed to the userspace application in order to analyse allocation hotspots, internal fragmentation and so on, making it possible to see how well an allocator performs, as well as debug and profile kernel code. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/kernel-parameters.txt | 10 ++ MAINTAINERS | 6 + include/linux/kmemtrace.h | 85 +++++++++ init/main.c | 2 + lib/Kconfig.debug | 28 +++ mm/Makefile | 1 + mm/kmemtrace.c | 335 ++++++++++++++++++++++++++++++++++++ 7 files changed, 467 insertions(+) create mode 100644 include/linux/kmemtrace.h create mode 100644 mm/kmemtrace.c (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e0f346d201ed..542c2d8843db 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -49,6 +49,7 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. + KMEMTRACE kmemtrace is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1018,6 +1019,15 @@ and is between 256 and 4096 characters. 
It is defined in the file use the HighMem zone if it exists, and the Normal zone if it does not. + kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } + Controls whether kmemtrace is enabled + at boot-time. + + kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of + subbufs kmemtrace's relay channel has. Set this + higher than default (KMEMTRACE_N_SUBBUFS in code) if + you experience buffer overruns. + movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter is similar to kernelcore except it specifies the amount of memory used for migratable allocations. diff --git a/MAINTAINERS b/MAINTAINERS index 618c1ef4a397..e2b3c8555051 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2565,6 +2565,12 @@ M: jason.wessel@windriver.com L: kgdb-bugreport@lists.sourceforge.net S: Maintained +KMEMTRACE +P: Eduard - Gabriel Munteanu +M: eduard.munteanu@linux360.ro +L: linux-kernel@vger.kernel.org +S: Maintained + KPROBES P: Ananth N Mavinakayanahalli M: ananth@in.ibm.com diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h new file mode 100644 index 000000000000..2c332010cb4e --- /dev/null +++ b/include/linux/kmemtrace.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include +#include + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", + type_id, call_site, (unsigned long) ptr, + bytes_req, bytes_alloc, (unsigned long) gfp_flags, node); +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu", + type_id, call_site, (unsigned long) ptr); +} + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/init/main.c b/init/main.c index 7e117a231af1..be1fe2242a55 100644 --- a/init/main.c +++ b/init/main.c @@ -69,6 +69,7 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -653,6 +654,7 @@ asmlinkage void __init start_kernel(void) enable_debug_pagealloc(); cpu_hotplug_init(); kmem_cache_init(); + kmemtrace_init(); debug_objects_mem_init(); idr_init_cache(); setup_per_cpu_pageset(); diff --git a/lib/Kconfig.debug 
b/lib/Kconfig.debug index b0f239e443bc..78d669b461d2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -803,6 +803,34 @@ config FIREWIRE_OHCI_REMOTE_DMA If unsure, say N. +config KMEMTRACE + bool "Kernel memory tracer (kmemtrace)" + depends on RELAY && DEBUG_FS && MARKERS + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. + +config KMEMTRACE_DEFAULT_ENABLED + bool "Enabled by default at boot" + depends on KMEMTRACE + help + Say Y here to enable kmemtrace at boot-time by default. Whatever + the choice, the behavior can be overridden by a kernel parameter, + as described in documentation. + menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/Makefile b/mm/Makefile index c06b45a1ff5f..3782eb66d4b3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c new file mode 100644 index 000000000000..83ad1cc71a92 --- /dev/null +++ b/mm/kmemtrace.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define KMEMTRACE_SUBBUF_SIZE 524288 +#define KMEMTRACE_DEF_N_SUBBUFS 20 + +static struct rchan *kmemtrace_chan; +static u32 kmemtrace_buf_overruns; + +static unsigned int kmemtrace_n_subbufs; +#ifdef CONFIG_KMEMTRACE_DEFAULT_ENABLED +static unsigned int kmemtrace_enabled = 1; +#else +static unsigned int kmemtrace_enabled = 0; +#endif + +/* + * The sequence number is used for reordering kmemtrace packets + * in userspace, since they are logged as per-CPU data. + * + * atomic_t should always be a 32-bit signed integer. Wraparound is not + * likely to occur, but userspace can deal with it by expecting a certain + * sequence number in the next packet that will be read. 
+ */ +static atomic_t kmemtrace_seq_num; + +#define KMEMTRACE_ABI_VERSION 1 + +static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION; + +enum kmemtrace_event_id { + KMEMTRACE_EVENT_ALLOC = 0, + KMEMTRACE_EVENT_FREE, +}; + +struct kmemtrace_event { + u8 event_id; + u8 type_id; + u16 event_size; + s32 seq_num; + u64 call_site; + u64 ptr; +} __attribute__ ((__packed__)); + +struct kmemtrace_stats_alloc { + u64 bytes_req; + u64 bytes_alloc; + u32 gfp_flags; + s32 numa_node; +} __attribute__ ((__packed__)); + +static void kmemtrace_probe_alloc(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + struct kmemtrace_stats_alloc *stats; + void *buf; + + local_irq_save(flags); + + buf = relay_reserve(kmemtrace_chan, + sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc)); + if (!buf) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + + ev = buf; + ev->event_id = KMEMTRACE_EVENT_ALLOC; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + + stats = buf + sizeof(struct kmemtrace_event); + stats->bytes_req = va_arg(*args, unsigned long); + stats->bytes_alloc = va_arg(*args, unsigned long); + stats->gfp_flags = va_arg(*args, unsigned long); + stats->numa_node = va_arg(*args, int); + +failed: + local_irq_restore(flags); +} + +static void kmemtrace_probe_free(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + + local_irq_save(flags); + + ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event)); + if (!ev) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + ev->event_id = KMEMTRACE_EVENT_FREE; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + +failed: + local_irq_restore(flags); +} + +static struct dentry * +kmemtrace_create_buf_file(const char *filename, struct dentry *parent, + int mode, struct rchan_buf *buf, int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static int kmemtrace_remove_buf_file(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +static int kmemtrace_subbuf_start(struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + size_t prev_padding) +{ + if (relay_buf_full(buf)) { + /* + * We know it's not SMP-safe, but neither + * debugfs_create_u32() is. 
+ */ + kmemtrace_buf_overruns++; + return 0; + } + + return 1; +} + +static struct rchan_callbacks relay_callbacks = { + .create_buf_file = kmemtrace_create_buf_file, + .remove_buf_file = kmemtrace_remove_buf_file, + .subbuf_start = kmemtrace_subbuf_start, +}; + +static struct dentry *kmemtrace_dir; +static struct dentry *kmemtrace_overruns_dentry; +static struct dentry *kmemtrace_abi_version_dentry; + +static struct dentry *kmemtrace_enabled_dentry; + +static int kmemtrace_start_probes(void) +{ + int err; + + err = marker_probe_register("kmemtrace_alloc", "type_id %d " + "call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu " + "gfp_flags %lu node %d", + kmemtrace_probe_alloc, NULL); + if (err) + return err; + err = marker_probe_register("kmemtrace_free", "type_id %d " + "call_site %lu ptr %lu", + kmemtrace_probe_free, NULL); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + marker_probe_unregister("kmemtrace_alloc", + kmemtrace_probe_alloc, NULL); + marker_probe_unregister("kmemtrace_free", + kmemtrace_probe_free, NULL); +} + +static int kmemtrace_enabled_get(void *data, u64 *val) +{ + *val = *((int *) data); + + return 0; +} + +static int kmemtrace_enabled_set(void *data, u64 val) +{ + u64 old_val = kmemtrace_enabled; + + *((int *) data) = !!val; + + if (old_val == val) + return 0; + if (val) + kmemtrace_start_probes(); + else + kmemtrace_stop_probes(); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops, + kmemtrace_enabled_get, + kmemtrace_enabled_set, "%llu\n"); + +static void kmemtrace_cleanup(void) +{ + if (kmemtrace_enabled_dentry) + debugfs_remove(kmemtrace_enabled_dentry); + + kmemtrace_stop_probes(); + + if (kmemtrace_abi_version_dentry) + debugfs_remove(kmemtrace_abi_version_dentry); + if (kmemtrace_overruns_dentry) + debugfs_remove(kmemtrace_overruns_dentry); + + relay_close(kmemtrace_chan); + kmemtrace_chan = NULL; + + if (kmemtrace_dir) + debugfs_remove(kmemtrace_dir); +} + +static int __init kmemtrace_setup_late(void) +{ + if (!kmemtrace_chan) + goto failed; + + kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL); + if (!kmemtrace_dir) + goto cleanup; + + kmemtrace_abi_version_dentry = + debugfs_create_u32("abi_version", S_IRUSR, + kmemtrace_dir, &kmemtrace_abi_version); + kmemtrace_overruns_dentry = + debugfs_create_u32("total_overruns", S_IRUSR, + kmemtrace_dir, &kmemtrace_buf_overruns); + if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry) + goto cleanup; + + kmemtrace_enabled_dentry = + debugfs_create_file("enabled", S_IRUSR | S_IWUSR, + kmemtrace_dir, &kmemtrace_enabled, + &kmemtrace_enabled_fops); + if (!kmemtrace_enabled_dentry) + goto cleanup; + + if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir)) + goto cleanup; + + printk(KERN_INFO "kmemtrace: fully up.\n"); + + return 0; + +cleanup: + kmemtrace_cleanup(); +failed: + return 1; +} +late_initcall(kmemtrace_setup_late); + +static int __init kmemtrace_set_boot_enabled(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "yes")) + kmemtrace_enabled = 1; + else if (!strcmp(str, "no")) + kmemtrace_enabled = 0; + else + return -EINVAL; + + return 0; +} +early_param("kmemtrace.enable", kmemtrace_set_boot_enabled); + +static int __init kmemtrace_set_subbufs(char *str) +{ + get_option(&str, &kmemtrace_n_subbufs); + return 0; +} +early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); + +void kmemtrace_init(void) +{ + if (!kmemtrace_enabled) + return; + + if (!kmemtrace_n_subbufs) + kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; + + kmemtrace_chan 
= relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE, + kmemtrace_n_subbufs, &relay_callbacks, + NULL); + if (unlikely(!kmemtrace_chan)) { + printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); + return; + } + + if (unlikely(kmemtrace_start_probes())) + goto probe_fail; + + printk(KERN_INFO "kmemtrace: early init successful.\n"); + + return; + +probe_fail: + printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); + kmemtrace_cleanup(); +} + -- cgit v1.2.3-71-gd317 From 36555751c6751a5bdfd6d7bdf0648343bb1ef0de Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:05 +0300 Subject: kmemtrace: SLAB hooks. This adds hooks for the SLAB allocator, to allow tracing with kmemtrace. We also convert some inline functions to __always_inline to make sure _RET_IP_, which expands to __builtin_return_address(0), always works as expected. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slab_def.h | 68 ++++++++++++++++++++++++++++++++++++++++------ mm/slab.c | 71 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 123 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 39c3a5eb8ebe..7555ce99f6d2 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,6 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include +#include /* Size description struct for general caches. */ struct cache_sizes { @@ -28,8 +29,26 @@ extern struct cache_sizes malloc_sizes[]; void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); -static inline void *kmalloc(size_t size, gfp_t flags) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags); +extern size_t slab_buffer_size(struct kmem_cache *cachep); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) { + return kmem_cache_alloc(cachep, flags); +} +static inline size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return 0; +} +#endif + +static __always_inline void *kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *cachep; + void *ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -50,10 +69,17 @@ static inline void *kmalloc(size_t size, gfp_t flags) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, - flags); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc(malloc_sizes[i].cs_cachep, flags); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_notrace(cachep, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, slab_buffer_size(cachep), flags); + + return ret; } return __kmalloc(size, flags); } @@ -62,8 +88,25 @@ found: extern void *__kmalloc_node(size_t size, gfp_t flags, int node); extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return kmem_cache_alloc_node(cachep, flags, nodeid); +} +#endif + +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + struct kmem_cache *cachep; 
+ void *ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -84,11 +127,18 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc_node(malloc_sizes[i].cs_dmacachep, - flags, node); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc_node(malloc_sizes[i].cs_cachep, - flags, node); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, + ret, size, slab_buffer_size(cachep), + flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/mm/slab.c b/mm/slab.c index a14787799014..b6d9b8cdefa9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -112,6 +112,7 @@ #include #include #include +#include #include #include @@ -568,6 +569,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#ifdef CONFIG_KMEMTRACE +size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return cachep->buffer_size; +} +EXPORT_SYMBOL(slab_buffer_size); +#endif + /* * Do not go above this order unless 0 objects fit into the slab. */ @@ -3613,10 +3622,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, flags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +{ + return __cache_alloc(cachep, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + /** * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. 
* @cachep: the cache we're checking against @@ -3661,23 +3683,47 @@ out: #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, + flags, nodeid); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; + void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node(cachep, flags, node); + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags, node); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3710,6 +3756,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller) { struct kmem_cache *cachep; + void *ret; /* If you want to save a few bytes .text space: replace * __ with kmem_. @@ -3719,11 +3766,17 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = __find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return __cache_alloc(cachep, flags, caller); + ret = __cache_alloc(cachep, flags, caller); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); @@ -3762,6 +3815,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_obj_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp); } EXPORT_SYMBOL(kmem_cache_free); @@ -3788,6 +3843,8 @@ void kfree(const void *objp) debug_check_no_obj_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3-71-gd317 From 3eae2cb24a96509e0a38cc48dc1538a2826f4e33 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:07 +0300 Subject: kmemtrace: SLOB hooks. This adds hooks for the SLOB allocator, to allow tracing with kmemtrace. We also convert some inline functions to __always_inline to make sure _RET_IP_, which expands to __builtin_return_address(0), always works as expected. 
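[Editorial note: the changelog's point about __always_inline and _RET_IP_ can be reproduced outside the kernel. The sketch below is hypothetical userspace code, not part of the patch; the wrapper names only stand in for kmalloc()/__kmalloc_node(). If the wrapper is emitted out of line, the "call site" recorded via _RET_IP_ points into the wrapper rather than at the real caller, which is why the slab wrappers are forced inline.]

#include <stdio.h>

/* Same definition the kernel uses for _RET_IP_. */
#define _RET_IP_ ((unsigned long)__builtin_return_address(0))

/* Stand-in for __kmalloc_node(): records its own return address as
 * the "call site", just like the kmemtrace hooks do. */
static __attribute__((noinline)) void trace_alloc(void)
{
        printf("call site recorded as %#lx\n", _RET_IP_);
}

/* Out-of-line wrapper: trace_alloc() returns into this function, so
 * the recorded call site points here, not at main(). */
static __attribute__((noinline)) void wrapper_out_of_line(void)
{
        trace_alloc();
}

/* Forced inline: the call to trace_alloc() lands in main()'s body, so
 * the recorded call site is the real allocation site. */
static inline __attribute__((always_inline)) void wrapper_always_inline(void)
{
        trace_alloc();
}

int main(void)
{
        wrapper_out_of_line();   /* prints an address inside wrapper_out_of_line() */
        wrapper_always_inline(); /* prints an address inside main() */
        return 0;
}

Built with gcc, the first call reports the wrapper as the call site and the second reports main(), mirroring the behaviour the patch relies on.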
Acked-by: Matt Mackall Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slob_def.h | 9 +++++---- mm/slob.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h index 59a3fa476ab9..0ec00b39d006 100644 --- a/include/linux/slob_def.h +++ b/include/linux/slob_def.h @@ -3,14 +3,15 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) { return kmem_cache_alloc_node(cachep, flags, -1); } void *__kmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc_node(size, flags, node); } @@ -23,12 +24,12 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) * kmalloc is the normal method of allocating memory * in the kernel. */ -static inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline void *kmalloc(size_t size, gfp_t flags) { return __kmalloc_node(size, flags, -1); } -static inline void *__kmalloc(size_t size, gfp_t flags) +static __always_inline void *__kmalloc(size_t size, gfp_t flags) { return kmalloc(size, flags); } diff --git a/mm/slob.c b/mm/slob.c index cb675d126791..55de44ae5d30 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,6 +65,7 @@ #include #include #include +#include #include /* @@ -463,27 +464,38 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + void *ret; if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; m = slob_alloc(size + align, gfp, align, node); + if (!m) return NULL; *m = size; - return (void *)m + align; + ret = (void *)m + align; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, size + align, gfp, node); } else { - void *ret; + unsigned int order = get_order(size); - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_page(gfp | __GFP_COMP, order, node); if (ret) { struct page *page; page = virt_to_page(ret); page->private = size; } - return ret; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << order, gfp, node); } + + return ret; } EXPORT_SYMBOL(__kmalloc_node); @@ -501,6 +513,8 @@ void kfree(const void *block) slob_free(m, *m + align); } else put_page(&sp->page); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block); } EXPORT_SYMBOL(kfree); @@ -569,10 +583,19 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; - if (c->size < PAGE_SIZE) + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - else + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); + } else { b = slob_new_page(flags, get_order(c->size), node); + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); + } if (c->ctor) c->ctor(b); @@ -608,6 +631,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } else { __kmem_cache_free(b, c->size); } + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b); } EXPORT_SYMBOL(kmem_cache_free); -- cgit v1.2.3-71-gd317 From 
5b882be4e00e53a44f47ad7eb997cac2938848bf Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:26 +0300 Subject: kmemtrace: SLUB hooks. This adds hooks for the SLUB allocator, to allow tracing with kmemtrace. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 53 ++++++++++++++++++++++++++++++++++++--- mm/slub.c | 65 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 109 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..dc28432b5b9a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,6 +10,7 @@ #include #include #include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -204,13 +205,31 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size) void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return kmem_cache_alloc(s, gfpflags); +} +#endif + static __always_inline void *kmalloc_large(size_t size, gfp_t flags) { - return (void *)__get_free_pages(flags | __GFP_COMP, get_order(size)); + unsigned int order = get_order(size); + void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, PAGE_SIZE << order, flags); + + return ret; } static __always_inline void *kmalloc(size_t size, gfp_t flags) { + void *ret; + if (__builtin_constant_p(size)) { if (size > PAGE_SIZE) return kmalloc_large(size, flags); @@ -221,7 +240,13 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc(s, flags); + ret = kmem_cache_alloc_notrace(s, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags); + + return ret; } } return __kmalloc(size, flags); @@ -231,8 +256,24 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return kmem_cache_alloc_node(s, gfpflags, node); +} +#endif + static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + void *ret; + if (__builtin_constant_p(size) && size <= PAGE_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); @@ -240,7 +281,13 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node(s, flags, node); + ret = kmem_cache_alloc_node_notrace(s, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/mm/slub.c b/mm/slub.c index 06da86654875..4c48a0146afd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * Lock order: @@ -1613,18 +1614,46 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void 
*kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return slab_alloc(s, gfpflags, node, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + /* * Slow patch handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -1732,6 +1761,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); slab_free(s, page, x, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2650,6 +2681,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, flags); @@ -2659,7 +2691,12 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, _RET_IP_); + ret = slab_alloc(s, flags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -2678,16 +2715,30 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); + if (unlikely(size > PAGE_SIZE)) { + ret = kmalloc_large_node(size, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, _RET_IP_); + ret = slab_alloc(s, flags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2745,6 +2796,8 @@ void kfree(const void *x) return; } slab_free(page->slab, page, object, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3-71-gd317 From 73cd6af0413225b0ada8b8881c3e0cfd26506dfa Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:24 +0300 Subject: kmemtrace: Better alternative to "kmemtrace: fix printk format warnings". Fix the problem "kmemtrace: fix printk format warnings" attempted to fix, but resulted in marker-probe format mismatch warnings. Instead of carrying size_t into probes, we get rid of it by casting to unsigned long, just as we did with gfp_t. 
This way, we don't need to change marker format strings and we don't have to rely on other format specifiers like "%zu", making for consistent use of more generic data types (since there are no format specifiers for gfp_t, for example). Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/kmemtrace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h index 2c332010cb4e..5bea8ead6a6b 100644 --- a/include/linux/kmemtrace.h +++ b/include/linux/kmemtrace.h @@ -33,7 +33,8 @@ static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", type_id, call_site, (unsigned long) ptr, - bytes_req, bytes_alloc, (unsigned long) gfp_flags, node); + (unsigned long) bytes_req, (unsigned long) bytes_alloc, + (unsigned long) gfp_flags, node); } static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, -- cgit v1.2.3-71-gd317 From 967fc04671feea4dbf780c9e55a0bc8fcf68a14e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:52 -0500 Subject: sched: add sched_class->needs_post_schedule() member We currently run class->post_schedule() outside of the rq->lock, which means that we need to test for the need to post_schedule outside of the lock to avoid a forced reacquistion. This is currently not a problem as we only look at rq->rt.overloaded. However, we want to enhance this going forward to look at more state to reduce the need to post_schedule to a bare minimum set. Therefore, we introduce a new member-func called needs_post_schedule() which tests for the post_schedule condtion without actually performing the work. Therefore it is safe to call this function before the rq->lock is released, because we are guaranteed not to drop the lock at an intermediate point (such as what post_schedule() may do). 
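[Editorial note: a hypothetical pthread-based sketch of the pattern this changelog describes, not the scheduler code itself. The idea is to run a cheap, lock-safe test while rq->lock is still held and defer the heavy work until after the lock has been dropped.]

#include <pthread.h>
#include <stdbool.h>

/* Illustrative stand-in for struct rq; names are assumptions. */
struct runqueue {
        pthread_mutex_t lock;
        bool overloaded;                /* state the test examines */
};

/* Analogue of ->needs_post_schedule(): only reads state and never
 * drops the lock, so it is safe to call before the lock is released. */
static bool needs_post_schedule(struct runqueue *rq)
{
        return rq->overloaded;
}

/* Analogue of ->post_schedule(): may take and release locks itself,
 * so it must not be entered while rq->lock is held. */
static void post_schedule(struct runqueue *rq)
{
        /* ... push tasks to other runqueues ... */
        (void)rq;
}

static void finish_task_switch(struct runqueue *rq)
{
        bool do_post;

        pthread_mutex_lock(&rq->lock);
        do_post = needs_post_schedule(rq);      /* decide while locked */
        pthread_mutex_unlock(&rq->lock);

        if (do_post)
                post_schedule(rq);              /* heavy work, lock dropped */
}

int main(void)
{
        struct runqueue rq = { .lock = PTHREAD_MUTEX_INITIALIZER,
                               .overloaded = true };

        finish_task_switch(&rq);
        return 0;
}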
We will use this later in the series [ rostedt: removed paranoid BUG_ON ] Signed-off-by: Gregory Haskins --- include/linux/sched.h | 1 + kernel/sched.c | 8 +++++++- kernel/sched_rt.c | 24 ++++++++++++++---------- 3 files changed, 22 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e5f928a079e8..836a86c32a65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1012,6 +1012,7 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched.c b/kernel/sched.c index 8fca364f3593..3acbad8991a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2621,6 +2621,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) { struct mm_struct *mm = rq->prev_mm; long prev_state; +#ifdef CONFIG_SMP + int post_schedule = 0; + + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif rq->prev_mm = NULL; @@ -2639,7 +2645,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP - if (current->sched_class->post_schedule) + if (post_schedule) current->sched_class->post_schedule(rq); #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8d33843cb2c4..b0b6ea4ed674 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1290,20 +1290,23 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) pull_rt_task(rq); } +/* + * assumes rq->lock is held + */ +static int needs_post_schedule_rt(struct rq *rq) +{ + return rq->rt.overloaded ? 1 : 0; +} + static void post_schedule_rt(struct rq *rq) { /* - * If we have more than one rt_task queued, then - * see if we can push the other rt_tasks off to other CPUS. - * Note we may release the rq lock, and since - * the lock was owned by prev, we need to release it - * first via finish_lock_switch and then reaquire it here. + * This is only called if needs_post_schedule_rt() indicates that + * we need to push tasks away */ - if (unlikely(rq->rt.overloaded)) { - spin_lock_irq(&rq->lock); - push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); - } + spin_lock_irq(&rq->lock); + push_rt_tasks(rq); + spin_unlock_irq(&rq->lock); } /* @@ -1557,6 +1560,7 @@ static const struct sched_class rt_sched_class = { .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, + .needs_post_schedule = needs_post_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, .switched_from = switched_from_rt, -- cgit v1.2.3-71-gd317 From 4075134e40804821f90866d7de56802e4dcecb1e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: plist: fix PLIST_NODE_INIT to work with debug enabled It seems that PLIST_NODE_INIT breaks if used and DEBUG_PI_LIST is defined. Since there are no current users of PLIST_NODE_INIT, this has gone undetected. 
This patch fixes the build issue that enables the DEBUG_PI_LIST later in the series when we use it in init_task.h Signed-off-by: Gregory Haskins --- include/linux/plist.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/plist.h b/include/linux/plist.h index 85de2f055874..45926d77d6ac 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -96,6 +96,10 @@ struct plist_node { # define PLIST_HEAD_LOCK_INIT(_lock) #endif +#define _PLIST_HEAD_INIT(head) \ + .prio_list = LIST_HEAD_INIT((head).prio_list), \ + .node_list = LIST_HEAD_INIT((head).node_list) + /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name @@ -103,8 +107,7 @@ struct plist_node { */ #define PLIST_HEAD_INIT(head, _lock) \ { \ - .prio_list = LIST_HEAD_INIT((head).prio_list), \ - .node_list = LIST_HEAD_INIT((head).node_list), \ + _PLIST_HEAD_INIT(head), \ PLIST_HEAD_LOCK_INIT(&(_lock)) \ } @@ -116,7 +119,7 @@ struct plist_node { #define PLIST_NODE_INIT(node, __prio) \ { \ .prio = (__prio), \ - .plist = PLIST_HEAD_INIT((node).plist, NULL), \ + .plist = { _PLIST_HEAD_INIT((node).plist) }, \ } /** -- cgit v1.2.3-71-gd317 From 917b627d4d981dc614519d7b34ea31a976b14e12 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: sched: create "pushable_tasks" list to limit pushing to one attempt The RT scheduler employs a "push/pull" design to actively balance tasks within the system (on a per disjoint cpuset basis). When a task is awoken, it is immediately determined if there are any lower priority cpus which should be preempted. This is opposed to the way normal SCHED_OTHER tasks behave, which will wait for a periodic rebalancing operation to occur before spreading out load. When a particular RQ has more than 1 active RT task, it is said to be in an "overloaded" state. Once this occurs, the system enters the active balancing mode, where it will try to push the task away, or persuade a different cpu to pull it over. The system will stay in this state until the system falls back below the <= 1 queued RT task per RQ. However, the current implementation suffers from a limitation in the push logic. Once overloaded, all tasks (other than current) on the RQ are analyzed on every push operation, even if it was previously unpushable (due to affinity, etc). Whats more, the operation stops at the first task that is unpushable and will not look at items lower in the queue. This causes two problems: 1) We can have the same tasks analyzed over and over again during each push, which extends out the fast path in the scheduler for no gain. Consider a RQ that has dozens of tasks that are bound to a core. Each one of those tasks will be encountered and skipped for each push operation while they are queued. 2) There may be lower-priority tasks under the unpushable task that could have been successfully pushed, but will never be considered until either the unpushable task is cleared, or a pull operation succeeds. The net result is a potential latency source for mid priority tasks. This patch aims to rectify these two conditions by introducing a new priority sorted list: "pushable_tasks". A task is added to the list each time a task is activated or preempted. It is removed from the list any time it is deactivated, made current, or fails to push. This works because a task only needs to be attempted to push once. 
After an initial failure to push, the other cpus will eventually try to pull the task when the conditions are proper. This also solves the problem that we don't completely analyze all tasks due to encountering an unpushable tasks. Now every task will have a push attempted (when appropriate). This reduces latency both by shorting the critical section of the rq->lock for certain workloads, and by making sure the algorithm considers all eligible tasks in the system. [ rostedt: added a couple more BUG_ONs ] Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt --- include/linux/init_task.h | 1 + include/linux/sched.h | 1 + kernel/sched.c | 4 ++ kernel/sched_rt.c | 119 +++++++++++++++++++++++++++++++++++++++------- 4 files changed, 107 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e5..6851225f44a7 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -140,6 +140,7 @@ extern struct group_info init_groups; .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ .real_parent = &tsk, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 836a86c32a65..440cabb2d432 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1179,6 +1179,7 @@ struct task_struct { #endif struct list_head tasks; + struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; diff --git a/kernel/sched.c b/kernel/sched.c index 3acbad8991a2..24ab80c28765 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -471,6 +471,7 @@ struct rt_rq { #ifdef CONFIG_SMP unsigned long rt_nr_migratory; int overloaded; + struct plist_head pushable_tasks; #endif int rt_throttled; u64 rt_time; @@ -2481,6 +2482,8 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. 
*/ task_thread_info(p)->preempt_count = 1; #endif + plist_node_init(&p->pushable_tasks, MAX_PRIO); + put_cpu(); } @@ -8237,6 +8240,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; + plist_head_init(&rq->rt.pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b0b6ea4ed674..fe9da6084c87 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -49,6 +49,24 @@ static void update_rt_migration(struct rq *rq) rq->rt.overloaded = 0; } } + +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + plist_node_init(&p->pushable_tasks, p->prio); + plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +#else + +#define enqueue_pushable_task(rq, p) do { } while (0) +#define dequeue_pushable_task(rq, p) do { } while (0) + #endif /* CONFIG_SMP */ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) @@ -751,6 +769,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) enqueue_rt_entity(rt_se); + if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); + inc_cpu_load(rq, p->se.load.weight); } @@ -761,6 +782,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + dequeue_pushable_task(rq, p); + dec_cpu_load(rq, p->se.load.weight); } @@ -911,7 +934,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, return next; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; @@ -933,6 +956,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) p = rt_task_of(rt_se); p->se.exec_start = rq->clock; + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = _pick_next_task_rt(rq); + + /* The running task is never eligible for pushing */ + if (p) + dequeue_pushable_task(rq, p); + return p; } @@ -940,6 +975,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); p->se.exec_start = 0; + + /* + * The previous task needs to be made eligible for pushing + * if it is still active + */ + if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); } #ifdef CONFIG_SMP @@ -1116,6 +1158,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->rt.nr_cpus_allowed <= 1); + + BUG_ON(!p->se.on_rq); + BUG_ON(!rt_task(p)); + + return p; +} + /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task @@ -1125,13 +1192,12 @@ static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; - int ret = 0; int 
paranoid = RT_MAX_TRIES; if (!rq->rt.overloaded) return 0; - next_task = pick_next_highest_task_rt(rq, -1); + next_task = pick_next_pushable_task(rq); if (!next_task) return 0; @@ -1163,12 +1229,19 @@ static int push_rt_task(struct rq *rq) * so it is possible that next_task has changed. * If it has, then try again. */ - task = pick_next_highest_task_rt(rq, -1); + task = pick_next_pushable_task(rq); if (unlikely(task != next_task) && task && paranoid--) { put_task_struct(next_task); next_task = task; goto retry; } + + /* + * Once we have failed to push this task, we will not + * try again, since the other cpus will pull from us + * when they are ready + */ + dequeue_pushable_task(rq, next_task); goto out; } @@ -1180,23 +1253,12 @@ static int push_rt_task(struct rq *rq) double_unlock_balance(rq, lowest_rq); - ret = 1; out: put_task_struct(next_task); - return ret; + return 1; } -/* - * TODO: Currently we just use the second highest prio task on - * the queue, and stop when it can't migrate (or there's - * no more RT tasks). There may be a case where a lower - * priority RT task has a different affinity than the - * higher RT task. In this case the lower RT task could - * possibly be able to migrate where as the higher priority - * RT task could not. We currently ignore this issue. - * Enhancements are welcome! - */ static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ @@ -1295,7 +1357,7 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) */ static int needs_post_schedule_rt(struct rq *rq) { - return rq->rt.overloaded ? 1 : 0; + return has_pushable_tasks(rq); } static void post_schedule_rt(struct rq *rq) @@ -1317,7 +1379,7 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - rq->rt.overloaded && + has_pushable_tasks(rq) && p->rt.nr_cpus_allowed > 1) push_rt_tasks(rq); } @@ -1354,6 +1416,24 @@ static void set_cpus_allowed_rt(struct task_struct *p, if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); + if (!task_current(rq, p)) { + /* + * Make sure we dequeue this task from the pushable list + * before going further. It will either remain off of + * the list because we are no longer pushable, or it + * will be requeued. + */ + if (p->rt.nr_cpus_allowed > 1) + dequeue_pushable_task(rq, p); + + /* + * Requeue if our weight is changing and still > 1 + */ + if (weight > 1) + enqueue_pushable_task(rq, p); + + } + if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { rq->rt.rt_nr_migratory++; } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { @@ -1538,6 +1618,9 @@ static void set_curr_task_rt(struct rq *rq) struct task_struct *p = rq->curr; p->se.exec_start = rq->clock; + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); } static const struct sched_class rt_sched_class = { -- cgit v1.2.3-71-gd317 From 36994e58a48fb8f9651c7dc845a6de298aba5bfc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Dec 2008 13:42:23 -0800 Subject: tracing/kmemtrace: normalize the raw tracer event to the unified tracing API Impact: new tracer plugin This patch adapts kmemtrace raw events tracing to the unified tracing API. 
To enable and use this tracer, just do the following: echo kmemtrace > /debugfs/tracing/current_tracer cat /debugfs/tracing/trace You will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | type_id 1 call_site 18446744071565527833 ptr 18446612134395152256 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164672 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164912 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345165152 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 0 call_site 18446744071566144042 ptr 18446612134346191680 bytes_req 1304 bytes_alloc 1312 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 That was to stay backward compatible with the format output produced in inux/tracepoint.h. This is the default ouput, but note that I tried something else. If you change an option: echo kmem_minimalistic > /debugfs/trace_options and then cat /debugfs/trace, you will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | - C 0xffff88007c088780 file_free_rcu + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc780 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc870 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc960 -1 d_alloc + K 1304 1312 000000d0 0xffff8800791d7340 -1 reiserfs_alloc_inode - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 992 1000 000000d0 0xffff880079045b58 -1 alloc_inode + K 768 1024 000080d0 0xffff88007c096400 -1 alloc_pipe_info + K 240 240 000000d0 0xffff8800790dca50 -1 d_alloc + K 272 320 000080d0 0xffff88007c088780 -1 get_empty_filp + K 272 320 000080d0 0xffff88007c088000 -1 get_empty_filp Yeah I shall confess kmem_minimalistic should be: kmem_alternative. Whatever, I find it more readable but this a personal opinion of course. We can drop it if you want. On the ALLOC/FREE column, + means an allocation and - a free. On the type column, you have K = kmalloc, C = cache, P = page I would like the flags to be GFP_* strings but that would not be easy to not break the column with strings.... About the node...it seems to always be -1. 
I don't know why but that shouldn't be difficult to find. I moved linux/tracepoint.h to trace/tracepoint.h as well. I think that would be more easy to find the tracer headers if they are all in their common directory. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/kmemtrace.h | 86 ------------ include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/trace/kmemtrace.h | 75 ++++++++++ init/main.c | 2 +- kernel/trace/Kconfig | 22 +++ kernel/trace/Makefile | 1 + kernel/trace/kmemtrace.c | 343 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 25 ++++ lib/Kconfig.debug | 20 --- mm/kmemtrace.c | 2 +- mm/slob.c | 2 +- mm/slub.c | 2 +- 13 files changed, 472 insertions(+), 112 deletions(-) delete mode 100644 include/linux/kmemtrace.h create mode 100644 include/trace/kmemtrace.h create mode 100644 kernel/trace/kmemtrace.c (limited to 'include/linux') diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h deleted file mode 100644 index 5bea8ead6a6b..000000000000 --- a/include/linux/kmemtrace.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ - -#include -#include - -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; - -#ifdef CONFIG_KMEMTRACE - -extern void kmemtrace_init(void); - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", - type_id, call_site, (unsigned long) ptr, - (unsigned long) bytes_req, (unsigned long) bytes_alloc, - (unsigned long) gfp_flags, node); -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu", - type_id, call_site, (unsigned long) ptr); -} - -#else /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_init(void) -{ -} - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ -} - -#endif /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_mark_alloc_node(type_id, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 7555ce99f6d2..455f9affea9a 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include +#include /* Size description struct for general caches. 
*/ struct cache_sizes { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dc28432b5b9a..6b657f7dcb2b 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h new file mode 100644 index 000000000000..ad8b7857855a --- /dev/null +++ b/include/trace/kmemtrace.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include +#include + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node); + +extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr); + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/init/main.c b/init/main.c index 9711586aa7c9..beca7aaddb22 100644 --- a/init/main.c +++ b/init/main.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e2a4ff6fc3a6..27fb74b06b3c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -264,6 +264,28 @@ config HW_BRANCH_TRACER This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. +config KMEMTRACE + bool "Trace SLAB allocations" + select TRACING + depends on RELAY + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. 
+ + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 349d5a93653f..513dc86b5dfa 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -33,5 +33,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c new file mode 100644 index 000000000000..d69cbe3c2a4b --- /dev/null +++ b/kernel/trace/kmemtrace.c @@ -0,0 +1,343 @@ +/* + * Memory allocator tracing + * + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * Copyright (C) 2008 Pekka Enberg + * Copyright (C) 2008 Frederic Weisbecker + */ + +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_KMEM_OPT_MINIMAL 0x1 + +static struct tracer_opt kmem_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, + { } +}; + +static struct tracer_flags kmem_tracer_flags = { + .val = 0, + .opts = kmem_opts +}; + + +static bool kmem_tracing_enabled __read_mostly; +static struct trace_array *kmemtrace_array; + +static int kmem_trace_init(struct trace_array *tr) +{ + int cpu; + kmemtrace_array = tr; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); + + kmem_tracing_enabled = true; + + return 0; +} + +static void kmem_trace_reset(struct trace_array *tr) +{ + kmem_tracing_enabled = false; +} + +static void kmemtrace_headers(struct seq_file *s) +{ + /* Don't need headers for the original kmemtrace output */ + if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) + return; + + seq_printf(s, "#\n"); + seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " + " POINTER NODE CALLER\n"); + seq_printf(s, "# FREE | | | | " + " | | | |\n"); + seq_printf(s, "# |\n\n"); +} + +/* + * The two following functions give the original output from kmemtrace, + * or something close to....perhaps they need some missing things + */ +static enum print_line_t +kmemtrace_print_alloc_original(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr, + (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, + (unsigned long) entry->gfp_flags, entry->node); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_original(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + +/* The two other following provide a more minimalistic output */ +static enum print_line_t +kmemtrace_print_alloc_compress(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + 
struct trace_seq *s = &iter->seq; + int ret; + + /* Alloc entry */ + ret = trace_seq_printf(s, " + "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? "); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Requested */ + ret = trace_seq_printf(s, "%4d ", entry->bytes_req); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Allocated */ + ret = trace_seq_printf(s, "%4d ", entry->bytes_alloc); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Flags + * TODO: would be better to see the name of the GFP flag names + */ + ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Node */ + ret = trace_seq_printf(s, "%4d ", entry->node); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_compress(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Free entry */ + ret = trace_seq_printf(s, " - "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? 
"); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip requested/allocated/flags */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip node */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_KMEM_ALLOC: { + struct kmemtrace_alloc_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_alloc_compress(iter, field); + else + return kmemtrace_print_alloc_original(iter, field); + } + + case TRACE_KMEM_FREE: { + struct kmemtrace_free_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_free_compress(iter, field); + else + return kmemtrace_print_free_original(iter, field); + } + + default: + return TRACE_TYPE_UNHANDLED; + } +} + +/* Trace allocations */ +void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + struct ring_buffer_event *event; + struct kmemtrace_alloc_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_ALLOC; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + +void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + struct ring_buffer_event *event; + struct kmemtrace_free_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + +static struct tracer kmem_tracer __read_mostly = { + .name = "kmemtrace", + .init = kmem_trace_init, + .reset = kmem_trace_reset, + .print_line = kmemtrace_print_line, + .print_header = kmemtrace_headers, + .flags = &kmem_tracer_flags +}; + +static int __init init_kmem_tracer(void) +{ + return register_tracer(&kmem_tracer); +} + +device_initcall(init_kmem_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f864036..534505bb39b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,6 +9,7 @@ #include #include #include +#include enum 
trace_type { __TRACE_FIRST_TYPE = 0, @@ -29,6 +30,8 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_KMEM_ALLOC, + TRACE_KMEM_FREE, TRACE_POWER, __TRACE_LAST_TYPE @@ -170,6 +173,24 @@ struct trace_power { struct power_trace state_data; }; +struct kmemtrace_alloc_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; + size_t bytes_req; + size_t bytes_alloc; + gfp_t gfp_flags; + int node; +}; + +struct kmemtrace_free_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -280,6 +301,10 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ + IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ + TRACE_KMEM_ALLOC); \ + IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ + TRACE_KMEM_FREE); \ __ftrace_bad_type(); \ } while (0) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b5417e23ba94..b0f239e443bc 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -803,26 +803,6 @@ config FIREWIRE_OHCI_REMOTE_DMA If unsure, say N. -config KMEMTRACE - bool "Kernel memory tracer (kmemtrace)" - depends on RELAY && DEBUG_FS && MARKERS - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/vm/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c index 2a70a805027c..0573b5080cc4 100644 --- a/mm/kmemtrace.c +++ b/mm/kmemtrace.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #define KMEMTRACE_SUBBUF_SIZE 524288 #define KMEMTRACE_DEF_N_SUBBUFS 20 diff --git a/mm/slob.c b/mm/slob.c index 0f1a49f40690..4d1c0fc33b6b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include /* diff --git a/mm/slub.c b/mm/slub.c index cc4001fee7ac..7bf8cf8ec082 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-71-gd317 From bc22c17e12c130dc929218a95aa347e0f3fd05dc Mon Sep 17 00:00:00 2001 From: Alain Knaff Date: Sun, 4 Jan 2009 22:46:16 +0100 Subject: bzip2/lzma: library support for gzip, bzip2 and lzma decompression Impact: Replaces inflate.c with a wrapper around zlib_inflate; new library code This is the first part of the bzip2/lzma patch The bzip patch is based on an idea by Christian Ludwig, includes support for compressing the kernel with bzip2 or lzma rather than gzip. Both compressors give smaller sizes than gzip. Lzma's decompresses faster than bzip2. It also supports ramdisks and initramfs' compressed using these two compressors. 
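(For illustration only: a compressed ramdisk or initramfs of this kind can be produced with the ordinary userspace tools, for example

	# compress an existing cpio archive
	bzip2 -9 < initramfs.cpio > initramfs.cpio.bz2
	lzma  -9 < initramfs.cpio > initramfs.cpio.lzma

	# or build one straight from a directory tree
	find rootfs/ | cpio -o -H newc | lzma -9 > initramfs.cpio.lzma

The exact flags and the cpio invocation are assumptions for the sake of the example, and, as the list below notes, the bzip2 and lzma decompressors are not yet wired up to the boot code at this stage of the series.)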
The functionality has been successfully used for a couple of years by the udpcast project This version applies to "tip" kernel 2.6.28 This part contains: - changed inflate.c to accomodate rest of patch - implementation of bzip2 compression (not used at this stage yet) - implementation of lzma compression (not used at this stage yet) - Makefile routines to support bzip2 and lzma kernel compression Signed-off-by: Alain Knaff Signed-off-by: H. Peter Anvin --- include/linux/decompress/bunzip2.h | 10 + include/linux/decompress/generic.h | 30 ++ include/linux/decompress/inflate.h | 13 + include/linux/decompress/mm.h | 87 +++++ include/linux/decompress/unlzma.h | 12 + lib/decompress_bunzip2.c | 735 +++++++++++++++++++++++++++++++++++++ lib/decompress_inflate.c | 167 +++++++++ lib/decompress_unlzma.c | 647 ++++++++++++++++++++++++++++++++ lib/zlib_inflate/inflate.h | 4 + lib/zlib_inflate/inftrees.h | 4 + scripts/Makefile.lib | 14 + scripts/bin_size | 10 + 12 files changed, 1733 insertions(+) create mode 100644 include/linux/decompress/bunzip2.h create mode 100644 include/linux/decompress/generic.h create mode 100644 include/linux/decompress/inflate.h create mode 100644 include/linux/decompress/mm.h create mode 100644 include/linux/decompress/unlzma.h create mode 100644 lib/decompress_bunzip2.c create mode 100644 lib/decompress_inflate.c create mode 100644 lib/decompress_unlzma.c create mode 100644 scripts/bin_size (limited to 'include/linux') diff --git a/include/linux/decompress/bunzip2.h b/include/linux/decompress/bunzip2.h new file mode 100644 index 000000000000..115272137a9c --- /dev/null +++ b/include/linux/decompress/bunzip2.h @@ -0,0 +1,10 @@ +#ifndef DECOMPRESS_BUNZIP2_H +#define DECOMPRESS_BUNZIP2_H + +int bunzip2(unsigned char *inbuf, int len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *pos, + void(*error)(char *x)); +#endif diff --git a/include/linux/decompress/generic.h b/include/linux/decompress/generic.h new file mode 100644 index 000000000000..f847f514f78e --- /dev/null +++ b/include/linux/decompress/generic.h @@ -0,0 +1,30 @@ +#ifndef DECOMPRESS_GENERIC_H +#define DECOMPRESS_GENERIC_H + +/* Minimal chunksize to be read. + *Bzip2 prefers at least 4096 + *Lzma prefers 0x10000 */ +#define COMPR_IOBUF_SIZE 4096 + +typedef int (*decompress_fn) (unsigned char *inbuf, int len, + int(*fill)(void*, unsigned int), + int(*writebb)(void*, unsigned int), + unsigned char *output, + int *posp, + void(*error)(char *x)); + +/* inbuf - input buffer + *len - len of pre-read data in inbuf + *fill - function to fill inbuf if empty + *writebb - function to write out outbug + *posp - if non-null, input position (number of bytes read) will be + * returned here + * + *If len != 0, the inbuf is initialized (with as much data), and fill + *should not be called + *If len = 0, the inbuf is allocated, but empty. Its size is IOBUF_SIZE + *fill should be called (repeatedly...) 
to read data, at most IOBUF_SIZE + */ + + +#endif diff --git a/include/linux/decompress/inflate.h b/include/linux/decompress/inflate.h new file mode 100644 index 000000000000..f9b06ccc3e5c --- /dev/null +++ b/include/linux/decompress/inflate.h @@ -0,0 +1,13 @@ +#ifndef INFLATE_H +#define INFLATE_H + +/* Other housekeeping constants */ +#define INBUFSIZ 4096 + +int gunzip(unsigned char *inbuf, int len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *pos, + void(*error_fn)(char *x)); +#endif diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h new file mode 100644 index 000000000000..12ff8c3f1d05 --- /dev/null +++ b/include/linux/decompress/mm.h @@ -0,0 +1,87 @@ +/* + * linux/compr_mm.h + * + * Memory management for pre-boot and ramdisk uncompressors + * + * Authors: Alain Knaff + * + */ + +#ifndef DECOMPR_MM_H +#define DECOMPR_MM_H + +#ifdef STATIC + +/* Code active when included from pre-boot environment: */ + +/* A trivial malloc implementation, adapted from + * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 + */ +static unsigned long malloc_ptr; +static int malloc_count; + +static void *malloc(int size) +{ + void *p; + + if (size < 0) + error("Malloc error"); + if (!malloc_ptr) + malloc_ptr = free_mem_ptr; + + malloc_ptr = (malloc_ptr + 3) & ~3; /* Align */ + + p = (void *)malloc_ptr; + malloc_ptr += size; + + if (free_mem_end_ptr && malloc_ptr >= free_mem_end_ptr) + error("Out of memory"); + + malloc_count++; + return p; +} + +static void free(void *where) +{ + malloc_count--; + if (!malloc_count) + malloc_ptr = free_mem_ptr; +} + +#define large_malloc(a) malloc(a) +#define large_free(a) free(a) + +#define set_error_fn(x) + +#define INIT + +#else /* STATIC */ + +/* Code active when compiled standalone for use when loading ramdisk: */ + +#include +#include +#include +#include + +/* Use defines rather than static inline in order to avoid spurious + * warnings when not needed (indeed large_malloc / large_free are not + * needed by inflate */ + +#define malloc(a) kmalloc(a, GFP_KERNEL) +#define free(a) kfree(a) + +#define large_malloc(a) vmalloc(a) +#define large_free(a) vfree(a) + +static void(*error)(char *m); +#define set_error_fn(x) error = x; + +#define INIT __init +#define STATIC + +#include + +#endif /* STATIC */ + +#endif /* DECOMPR_MM_H */ diff --git a/include/linux/decompress/unlzma.h b/include/linux/decompress/unlzma.h new file mode 100644 index 000000000000..7796538f1bf4 --- /dev/null +++ b/include/linux/decompress/unlzma.h @@ -0,0 +1,12 @@ +#ifndef DECOMPRESS_UNLZMA_H +#define DECOMPRESS_UNLZMA_H + +int unlzma(unsigned char *, int, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *posp, + void(*error)(char *x) + ); + +#endif diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c new file mode 100644 index 000000000000..5d3ddb5fcfd9 --- /dev/null +++ b/lib/decompress_bunzip2.c @@ -0,0 +1,735 @@ +/* vi: set sw = 4 ts = 4: */ +/* Small bzip2 deflate implementation, by Rob Landley (rob@landley.net). + + Based on bzip2 decompression code by Julian R Seward (jseward@acm.org), + which also acknowledges contributions by Mike Burrows, David Wheeler, + Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten, + Robert Sedgewick, and Jon L. Bentley. + + This code is licensed under the LGPLv2: + LGPL (http://www.gnu.org/copyleft/lgpl.html +*/ + +/* + Size and speed optimizations by Manuel Novoa III (mjn3@codepoet.org). 
+ + More efficient reading of Huffman codes, a streamlined read_bunzip() + function, and various other tweaks. In (limited) tests, approximately + 20% faster than bzcat on x86 and about 10% faster on arm. + + Note that about 2/3 of the time is spent in read_unzip() reversing + the Burrows-Wheeler transformation. Much of that time is delay + resulting from cache misses. + + I would ask that anyone benefiting from this work, especially those + using it in commercial products, consider making a donation to my local + non-profit hospice organization in the name of the woman I loved, who + passed away Feb. 12, 2003. + + In memory of Toni W. Hagan + + Hospice of Acadiana, Inc. + 2600 Johnston St., Suite 200 + Lafayette, LA 70503-3240 + + Phone (337) 232-1234 or 1-800-738-2226 + Fax (337) 232-1297 + + http://www.hospiceacadiana.com/ + + Manuel + */ + +/* + Made it fit for running in Linux Kernel by Alain Knaff (alain@knaff.lu) +*/ + + +#ifndef STATIC +#include +#endif /* !STATIC */ + +#include + +#ifndef INT_MAX +#define INT_MAX 0x7fffffff +#endif + +/* Constants for Huffman coding */ +#define MAX_GROUPS 6 +#define GROUP_SIZE 50 /* 64 would have been more efficient */ +#define MAX_HUFCODE_BITS 20 /* Longest Huffman code allowed */ +#define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */ +#define SYMBOL_RUNA 0 +#define SYMBOL_RUNB 1 + +/* Status return values */ +#define RETVAL_OK 0 +#define RETVAL_LAST_BLOCK (-1) +#define RETVAL_NOT_BZIP_DATA (-2) +#define RETVAL_UNEXPECTED_INPUT_EOF (-3) +#define RETVAL_UNEXPECTED_OUTPUT_EOF (-4) +#define RETVAL_DATA_ERROR (-5) +#define RETVAL_OUT_OF_MEMORY (-6) +#define RETVAL_OBSOLETE_INPUT (-7) + +/* Other housekeeping constants */ +#define BZIP2_IOBUF_SIZE 4096 + +/* This is what we know about each Huffman coding group */ +struct group_data { + /* We have an extra slot at the end of limit[] for a sentinal value. */ + int limit[MAX_HUFCODE_BITS+1]; + int base[MAX_HUFCODE_BITS]; + int permute[MAX_SYMBOLS]; + int minLen, maxLen; +}; + +/* Structure holding all the housekeeping data, including IO buffers and + memory that persists between calls to bunzip */ +struct bunzip_data { + /* State for interrupting output loop */ + int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent; + /* I/O tracking data (file handles, buffers, positions, etc.) */ + int (*fill)(void*, unsigned int); + int inbufCount, inbufPos /*, outbufPos*/; + unsigned char *inbuf /*,*outbuf*/; + unsigned int inbufBitCount, inbufBits; + /* The CRC values stored in the block header and calculated from the + data */ + unsigned int crc32Table[256], headerCRC, totalCRC, writeCRC; + /* Intermediate buffer and its size (in bytes) */ + unsigned int *dbuf, dbufSize; + /* These things are a bit too big to go on the stack */ + unsigned char selectors[32768]; /* nSelectors = 15 bits */ + struct group_data groups[MAX_GROUPS]; /* Huffman coding tables */ + int io_error; /* non-zero if we have IO error */ +}; + + +/* Return the next nnn bits of input. All reads from the compressed input + are done through this function. All reads are big endian */ +static unsigned int INIT get_bits(struct bunzip_data *bd, char bits_wanted) +{ + unsigned int bits = 0; + + /* If we need to get more data from the byte buffer, do so. + (Loop getting one byte at a time to enforce endianness and avoid + unaligned access.) 
*/ + while (bd->inbufBitCount < bits_wanted) { + /* If we need to read more data from file into byte buffer, do + so */ + if (bd->inbufPos == bd->inbufCount) { + if (bd->io_error) + return 0; + bd->inbufCount = bd->fill(bd->inbuf, BZIP2_IOBUF_SIZE); + if (bd->inbufCount <= 0) { + bd->io_error = RETVAL_UNEXPECTED_INPUT_EOF; + return 0; + } + bd->inbufPos = 0; + } + /* Avoid 32-bit overflow (dump bit buffer to top of output) */ + if (bd->inbufBitCount >= 24) { + bits = bd->inbufBits&((1 << bd->inbufBitCount)-1); + bits_wanted -= bd->inbufBitCount; + bits <<= bits_wanted; + bd->inbufBitCount = 0; + } + /* Grab next 8 bits of input from buffer. */ + bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++]; + bd->inbufBitCount += 8; + } + /* Calculate result */ + bd->inbufBitCount -= bits_wanted; + bits |= (bd->inbufBits >> bd->inbufBitCount)&((1 << bits_wanted)-1); + + return bits; +} + +/* Unpacks the next block and sets up for the inverse burrows-wheeler step. */ + +static int INIT get_next_block(struct bunzip_data *bd) +{ + struct group_data *hufGroup = NULL; + int *base = NULL; + int *limit = NULL; + int dbufCount, nextSym, dbufSize, groupCount, selector, + i, j, k, t, runPos, symCount, symTotal, nSelectors, + byteCount[256]; + unsigned char uc, symToByte[256], mtfSymbol[256], *selectors; + unsigned int *dbuf, origPtr; + + dbuf = bd->dbuf; + dbufSize = bd->dbufSize; + selectors = bd->selectors; + + /* Read in header signature and CRC, then validate signature. + (last block signature means CRC is for whole file, return now) */ + i = get_bits(bd, 24); + j = get_bits(bd, 24); + bd->headerCRC = get_bits(bd, 32); + if ((i == 0x177245) && (j == 0x385090)) + return RETVAL_LAST_BLOCK; + if ((i != 0x314159) || (j != 0x265359)) + return RETVAL_NOT_BZIP_DATA; + /* We can add support for blockRandomised if anybody complains. + There was some code for this in busybox 1.0.0-pre3, but nobody ever + noticed that it didn't actually work. */ + if (get_bits(bd, 1)) + return RETVAL_OBSOLETE_INPUT; + origPtr = get_bits(bd, 24); + if (origPtr > dbufSize) + return RETVAL_DATA_ERROR; + /* mapping table: if some byte values are never used (encoding things + like ascii text), the compression code removes the gaps to have fewer + symbols to deal with, and writes a sparse bitfield indicating which + values were present. We make a translation table to convert the + symbols back to the corresponding bytes. */ + t = get_bits(bd, 16); + symTotal = 0; + for (i = 0; i < 16; i++) { + if (t&(1 << (15-i))) { + k = get_bits(bd, 16); + for (j = 0; j < 16; j++) + if (k&(1 << (15-j))) + symToByte[symTotal++] = (16*i)+j; + } + } + /* How many different Huffman coding groups does this block use? */ + groupCount = get_bits(bd, 3); + if (groupCount < 2 || groupCount > MAX_GROUPS) + return RETVAL_DATA_ERROR; + /* nSelectors: Every GROUP_SIZE many symbols we select a new + Huffman coding group. Read in the group selector list, + which is stored as MTF encoded bit runs. (MTF = Move To + Front, as each value is used it's moved to the start of the + list.) 
*/ + nSelectors = get_bits(bd, 15); + if (!nSelectors) + return RETVAL_DATA_ERROR; + for (i = 0; i < groupCount; i++) + mtfSymbol[i] = i; + for (i = 0; i < nSelectors; i++) { + /* Get next value */ + for (j = 0; get_bits(bd, 1); j++) + if (j >= groupCount) + return RETVAL_DATA_ERROR; + /* Decode MTF to get the next selector */ + uc = mtfSymbol[j]; + for (; j; j--) + mtfSymbol[j] = mtfSymbol[j-1]; + mtfSymbol[0] = selectors[i] = uc; + } + /* Read the Huffman coding tables for each group, which code + for symTotal literal symbols, plus two run symbols (RUNA, + RUNB) */ + symCount = symTotal+2; + for (j = 0; j < groupCount; j++) { + unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1]; + int minLen, maxLen, pp; + /* Read Huffman code lengths for each symbol. They're + stored in a way similar to mtf; record a starting + value for the first symbol, and an offset from the + previous value for everys symbol after that. + (Subtracting 1 before the loop and then adding it + back at the end is an optimization that makes the + test inside the loop simpler: symbol length 0 + becomes negative, so an unsigned inequality catches + it.) */ + t = get_bits(bd, 5)-1; + for (i = 0; i < symCount; i++) { + for (;;) { + if (((unsigned)t) > (MAX_HUFCODE_BITS-1)) + return RETVAL_DATA_ERROR; + + /* If first bit is 0, stop. Else + second bit indicates whether to + increment or decrement the value. + Optimization: grab 2 bits and unget + the second if the first was 0. */ + + k = get_bits(bd, 2); + if (k < 2) { + bd->inbufBitCount++; + break; + } + /* Add one if second bit 1, else + * subtract 1. Avoids if/else */ + t += (((k+1)&2)-1); + } + /* Correct for the initial -1, to get the + * final symbol length */ + length[i] = t+1; + } + /* Find largest and smallest lengths in this group */ + minLen = maxLen = length[0]; + + for (i = 1; i < symCount; i++) { + if (length[i] > maxLen) + maxLen = length[i]; + else if (length[i] < minLen) + minLen = length[i]; + } + + /* Calculate permute[], base[], and limit[] tables from + * length[]. + * + * permute[] is the lookup table for converting + * Huffman coded symbols into decoded symbols. base[] + * is the amount to subtract from the value of a + * Huffman symbol of a given length when using + * permute[]. + * + * limit[] indicates the largest numerical value a + * symbol with a given number of bits can have. This + * is how the Huffman codes can vary in length: each + * code with a value > limit[length] needs another + * bit. + */ + hufGroup = bd->groups+j; + hufGroup->minLen = minLen; + hufGroup->maxLen = maxLen; + /* Note that minLen can't be smaller than 1, so we + adjust the base and limit array pointers so we're + not always wasting the first entry. We do this + again when using them (during symbol decoding).*/ + base = hufGroup->base-1; + limit = hufGroup->limit-1; + /* Calculate permute[]. Concurently, initialize + * temp[] and limit[]. */ + pp = 0; + for (i = minLen; i <= maxLen; i++) { + temp[i] = limit[i] = 0; + for (t = 0; t < symCount; t++) + if (length[t] == i) + hufGroup->permute[pp++] = t; + } + /* Count symbols coded for at each bit length */ + for (i = 0; i < symCount; i++) + temp[length[i]]++; + /* Calculate limit[] (the largest symbol-coding value + *at each bit length, which is (previous limit << + *1)+symbols at this level), and base[] (number of + *symbols to ignore at each bit length, which is limit + *minus the cumulative count of symbols coded for + *already). 
*/ + pp = t = 0; + for (i = minLen; i < maxLen; i++) { + pp += temp[i]; + /* We read the largest possible symbol size + and then unget bits after determining how + many we need, and those extra bits could be + set to anything. (They're noise from + future symbols.) At each level we're + really only interested in the first few + bits, so here we set all the trailing + to-be-ignored bits to 1 so they don't + affect the value > limit[length] + comparison. */ + limit[i] = (pp << (maxLen - i)) - 1; + pp <<= 1; + base[i+1] = pp-(t += temp[i]); + } + limit[maxLen+1] = INT_MAX; /* Sentinal value for + * reading next sym. */ + limit[maxLen] = pp+temp[maxLen]-1; + base[minLen] = 0; + } + /* We've finished reading and digesting the block header. Now + read this block's Huffman coded symbols from the file and + undo the Huffman coding and run length encoding, saving the + result into dbuf[dbufCount++] = uc */ + + /* Initialize symbol occurrence counters and symbol Move To + * Front table */ + for (i = 0; i < 256; i++) { + byteCount[i] = 0; + mtfSymbol[i] = (unsigned char)i; + } + /* Loop through compressed symbols. */ + runPos = dbufCount = symCount = selector = 0; + for (;;) { + /* Determine which Huffman coding group to use. */ + if (!(symCount--)) { + symCount = GROUP_SIZE-1; + if (selector >= nSelectors) + return RETVAL_DATA_ERROR; + hufGroup = bd->groups+selectors[selector++]; + base = hufGroup->base-1; + limit = hufGroup->limit-1; + } + /* Read next Huffman-coded symbol. */ + /* Note: It is far cheaper to read maxLen bits and + back up than it is to read minLen bits and then an + additional bit at a time, testing as we go. + Because there is a trailing last block (with file + CRC), there is no danger of the overread causing an + unexpected EOF for a valid compressed file. As a + further optimization, we do the read inline + (falling back to a call to get_bits if the buffer + runs dry). The following (up to got_huff_bits:) is + equivalent to j = get_bits(bd, hufGroup->maxLen); + */ + while (bd->inbufBitCount < hufGroup->maxLen) { + if (bd->inbufPos == bd->inbufCount) { + j = get_bits(bd, hufGroup->maxLen); + goto got_huff_bits; + } + bd->inbufBits = + (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++]; + bd->inbufBitCount += 8; + }; + bd->inbufBitCount -= hufGroup->maxLen; + j = (bd->inbufBits >> bd->inbufBitCount)& + ((1 << hufGroup->maxLen)-1); +got_huff_bits: + /* Figure how how many bits are in next symbol and + * unget extras */ + i = hufGroup->minLen; + while (j > limit[i]) + ++i; + bd->inbufBitCount += (hufGroup->maxLen - i); + /* Huffman decode value to get nextSym (with bounds checking) */ + if ((i > hufGroup->maxLen) + || (((unsigned)(j = (j>>(hufGroup->maxLen-i))-base[i])) + >= MAX_SYMBOLS)) + return RETVAL_DATA_ERROR; + nextSym = hufGroup->permute[j]; + /* We have now decoded the symbol, which indicates + either a new literal byte, or a repeated run of the + most recent literal byte. First, check if nextSym + indicates a repeated run, and if so loop collecting + how many times to repeat the last literal. */ + if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */ + /* If this is the start of a new run, zero out + * counter */ + if (!runPos) { + runPos = 1; + t = 0; + } + /* Neat trick that saves 1 symbol: instead of + or-ing 0 or 1 at each bit position, add 1 + or 2 instead. For example, 1011 is 1 << 0 + + 1 << 1 + 2 << 2. 1010 is 2 << 0 + 2 << 1 + + 1 << 2. 
You can make any bit pattern + that way using 1 less symbol than the basic + or 0/1 method (except all bits 0, which + would use no symbols, but a run of length 0 + doesn't mean anything in this context). + Thus space is saved. */ + t += (runPos << nextSym); + /* +runPos if RUNA; +2*runPos if RUNB */ + + runPos <<= 1; + continue; + } + /* When we hit the first non-run symbol after a run, + we now know how many times to repeat the last + literal, so append that many copies to our buffer + of decoded symbols (dbuf) now. (The last literal + used is the one at the head of the mtfSymbol + array.) */ + if (runPos) { + runPos = 0; + if (dbufCount+t >= dbufSize) + return RETVAL_DATA_ERROR; + + uc = symToByte[mtfSymbol[0]]; + byteCount[uc] += t; + while (t--) + dbuf[dbufCount++] = uc; + } + /* Is this the terminating symbol? */ + if (nextSym > symTotal) + break; + /* At this point, nextSym indicates a new literal + character. Subtract one to get the position in the + MTF array at which this literal is currently to be + found. (Note that the result can't be -1 or 0, + because 0 and 1 are RUNA and RUNB. But another + instance of the first symbol in the mtf array, + position 0, would have been handled as part of a + run above. Therefore 1 unused mtf position minus 2 + non-literal nextSym values equals -1.) */ + if (dbufCount >= dbufSize) + return RETVAL_DATA_ERROR; + i = nextSym - 1; + uc = mtfSymbol[i]; + /* Adjust the MTF array. Since we typically expect to + *move only a small number of symbols, and are bound + *by 256 in any case, using memmove here would + *typically be bigger and slower due to function call + *overhead and other assorted setup costs. */ + do { + mtfSymbol[i] = mtfSymbol[i-1]; + } while (--i); + mtfSymbol[0] = uc; + uc = symToByte[uc]; + /* We have our literal byte. Save it into dbuf. */ + byteCount[uc]++; + dbuf[dbufCount++] = (unsigned int)uc; + } + /* At this point, we've read all the Huffman-coded symbols + (and repeated runs) for this block from the input stream, + and decoded them into the intermediate buffer. There are + dbufCount many decoded bytes in dbuf[]. Now undo the + Burrows-Wheeler transform on dbuf. See + http://dogma.net/markn/articles/bwt/bwt.htm + */ + /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */ + j = 0; + for (i = 0; i < 256; i++) { + k = j+byteCount[i]; + byteCount[i] = j; + j = k; + } + /* Figure out what order dbuf would be in if we sorted it. */ + for (i = 0; i < dbufCount; i++) { + uc = (unsigned char)(dbuf[i] & 0xff); + dbuf[byteCount[uc]] |= (i << 8); + byteCount[uc]++; + } + /* Decode first byte by hand to initialize "previous" byte. + Note that it doesn't get output, and if the first three + characters are identical it doesn't qualify as a run (hence + writeRunCountdown = 5). */ + if (dbufCount) { + if (origPtr >= dbufCount) + return RETVAL_DATA_ERROR; + bd->writePos = dbuf[origPtr]; + bd->writeCurrent = (unsigned char)(bd->writePos&0xff); + bd->writePos >>= 8; + bd->writeRunCountdown = 5; + } + bd->writeCount = dbufCount; + + return RETVAL_OK; +} + +/* Undo burrows-wheeler transform on intermediate buffer to produce output. + If start_bunzip was initialized with out_fd =-1, then up to len bytes of + data are written to outbuf. Return value is number of bytes written or + error (all errors are negative numbers). If out_fd!=-1, outbuf and len + are ignored, data is written to out_fd and return is RETVAL_OK or error. 
+*/ + +static int INIT read_bunzip(struct bunzip_data *bd, char *outbuf, int len) +{ + const unsigned int *dbuf; + int pos, xcurrent, previous, gotcount; + + /* If last read was short due to end of file, return last block now */ + if (bd->writeCount < 0) + return bd->writeCount; + + gotcount = 0; + dbuf = bd->dbuf; + pos = bd->writePos; + xcurrent = bd->writeCurrent; + + /* We will always have pending decoded data to write into the output + buffer unless this is the very first call (in which case we haven't + Huffman-decoded a block into the intermediate buffer yet). */ + + if (bd->writeCopies) { + /* Inside the loop, writeCopies means extra copies (beyond 1) */ + --bd->writeCopies; + /* Loop outputting bytes */ + for (;;) { + /* If the output buffer is full, snapshot + * state and return */ + if (gotcount >= len) { + bd->writePos = pos; + bd->writeCurrent = xcurrent; + bd->writeCopies++; + return len; + } + /* Write next byte into output buffer, updating CRC */ + outbuf[gotcount++] = xcurrent; + bd->writeCRC = (((bd->writeCRC) << 8) + ^bd->crc32Table[((bd->writeCRC) >> 24) + ^xcurrent]); + /* Loop now if we're outputting multiple + * copies of this byte */ + if (bd->writeCopies) { + --bd->writeCopies; + continue; + } +decode_next_byte: + if (!bd->writeCount--) + break; + /* Follow sequence vector to undo + * Burrows-Wheeler transform */ + previous = xcurrent; + pos = dbuf[pos]; + xcurrent = pos&0xff; + pos >>= 8; + /* After 3 consecutive copies of the same + byte, the 4th is a repeat count. We count + down from 4 instead *of counting up because + testing for non-zero is faster */ + if (--bd->writeRunCountdown) { + if (xcurrent != previous) + bd->writeRunCountdown = 4; + } else { + /* We have a repeated run, this byte + * indicates the count */ + bd->writeCopies = xcurrent; + xcurrent = previous; + bd->writeRunCountdown = 5; + /* Sometimes there are just 3 bytes + * (run length 0) */ + if (!bd->writeCopies) + goto decode_next_byte; + /* Subtract the 1 copy we'd output + * anyway to get extras */ + --bd->writeCopies; + } + } + /* Decompression of this block completed successfully */ + bd->writeCRC = ~bd->writeCRC; + bd->totalCRC = ((bd->totalCRC << 1) | + (bd->totalCRC >> 31)) ^ bd->writeCRC; + /* If this block had a CRC error, force file level CRC error. */ + if (bd->writeCRC != bd->headerCRC) { + bd->totalCRC = bd->headerCRC+1; + return RETVAL_LAST_BLOCK; + } + } + + /* Refill the intermediate buffer by Huffman-decoding next + * block of input */ + /* (previous is just a convenient unused temp variable here) */ + previous = get_next_block(bd); + if (previous) { + bd->writeCount = previous; + return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount; + } + bd->writeCRC = 0xffffffffUL; + pos = bd->writePos; + xcurrent = bd->writeCurrent; + goto decode_next_byte; +} + +static int INIT nofill(void *buf, unsigned int len) +{ + return -1; +} + +/* Allocate the structure, read file header. If in_fd ==-1, inbuf must contain + a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are + ignored, and data is read from file handle into temporary buffer. */ +static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len, + int (*fill)(void*, unsigned int)) +{ + struct bunzip_data *bd; + unsigned int i, j, c; + const unsigned int BZh0 = + (((unsigned int)'B') << 24)+(((unsigned int)'Z') << 16) + +(((unsigned int)'h') << 8)+(unsigned int)'0'; + + /* Figure out how much data to allocate */ + i = sizeof(struct bunzip_data); + + /* Allocate bunzip_data. 
Most fields initialize to zero. */ + bd = *bdp = malloc(i); + memset(bd, 0, sizeof(struct bunzip_data)); + /* Setup input buffer */ + bd->inbuf = inbuf; + bd->inbufCount = len; + if (fill != NULL) + bd->fill = fill; + else + bd->fill = nofill; + + /* Init the CRC32 table (big endian) */ + for (i = 0; i < 256; i++) { + c = i << 24; + for (j = 8; j; j--) + c = c&0x80000000 ? (c << 1)^0x04c11db7 : (c << 1); + bd->crc32Table[i] = c; + } + + /* Ensure that file starts with "BZh['1'-'9']." */ + i = get_bits(bd, 32); + if (((unsigned int)(i-BZh0-1)) >= 9) + return RETVAL_NOT_BZIP_DATA; + + /* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of + uncompressed data. Allocate intermediate buffer for block. */ + bd->dbufSize = 100000*(i-BZh0); + + bd->dbuf = large_malloc(bd->dbufSize * sizeof(int)); + return RETVAL_OK; +} + +/* Example usage: decompress src_fd to dst_fd. (Stops at end of bzip2 data, + not end of file.) */ +STATIC int INIT bunzip2(unsigned char *buf, int len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *outbuf, + int *pos, + void(*error_fn)(char *x)) +{ + struct bunzip_data *bd; + int i = -1; + unsigned char *inbuf; + + set_error_fn(error_fn); + if (flush) + outbuf = malloc(BZIP2_IOBUF_SIZE); + else + len -= 4; /* Uncompressed size hack active in pre-boot + environment */ + if (!outbuf) { + error("Could not allocate output bufer"); + return -1; + } + if (buf) + inbuf = buf; + else + inbuf = malloc(BZIP2_IOBUF_SIZE); + if (!inbuf) { + error("Could not allocate input bufer"); + goto exit_0; + } + i = start_bunzip(&bd, inbuf, len, fill); + if (!i) { + for (;;) { + i = read_bunzip(bd, outbuf, BZIP2_IOBUF_SIZE); + if (i <= 0) + break; + if (!flush) + outbuf += i; + else + if (i != flush(outbuf, i)) { + i = RETVAL_UNEXPECTED_OUTPUT_EOF; + break; + } + } + } + /* Check CRC and release memory */ + if (i == RETVAL_LAST_BLOCK) { + if (bd->headerCRC != bd->totalCRC) + error("Data integrity error when decompressing."); + else + i = RETVAL_OK; + } else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) { + error("Compressed file ends unexpectedly"); + } + if (bd->dbuf) + large_free(bd->dbuf); + if (pos) + *pos = bd->inbufPos; + free(bd); + if (!buf) + free(inbuf); +exit_0: + if (flush) + free(outbuf); + return i; +} + +#define decompress bunzip2 diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c new file mode 100644 index 000000000000..163e66aea5f6 --- /dev/null +++ b/lib/decompress_inflate.c @@ -0,0 +1,167 @@ +#ifdef STATIC +/* Pre-boot environment: included */ + +/* prevent inclusion of _LINUX_KERNEL_H in pre-boot environment: lots + * errors about console_printk etc... 
on ARM */ +#define _LINUX_KERNEL_H + +#include "zlib_inflate/inftrees.c" +#include "zlib_inflate/inffast.c" +#include "zlib_inflate/inflate.c" + +#else /* STATIC */ +/* initramfs et al: linked */ + +#include + +#include "zlib_inflate/inftrees.h" +#include "zlib_inflate/inffast.h" +#include "zlib_inflate/inflate.h" + +#include "zlib_inflate/infutil.h" + +#endif /* STATIC */ + +#include + +#define INBUF_LEN (16*1024) + +/* Included from initramfs et al code */ +STATIC int INIT gunzip(unsigned char *buf, int len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *out_buf, + int *pos, + void(*error_fn)(char *x)) { + u8 *zbuf; + struct z_stream_s *strm; + int rc; + size_t out_len; + + set_error_fn(error_fn); + rc = -1; + if (flush) { + out_len = 0x8100; /* 32 K */ + out_buf = malloc(out_len); + } else { + out_len = 0x7fffffff; /* no limit */ + } + if (!out_buf) { + error("Out of memory while allocating output buffer"); + goto gunzip_nomem1; + } + + if (buf) + zbuf = buf; + else { + zbuf = malloc(INBUF_LEN); + len = 0; + } + if (!zbuf) { + error("Out of memory while allocating input buffer"); + goto gunzip_nomem2; + } + + strm = malloc(sizeof(*strm)); + if (strm == NULL) { + error("Out of memory while allocating z_stream"); + goto gunzip_nomem3; + } + + strm->workspace = malloc(flush ? zlib_inflate_workspacesize() : + sizeof(struct inflate_state)); + if (strm->workspace == NULL) { + error("Out of memory while allocating workspace"); + goto gunzip_nomem4; + } + + if (len == 0) + len = fill(zbuf, INBUF_LEN); + + /* verify the gzip header */ + if (len < 10 || + zbuf[0] != 0x1f || zbuf[1] != 0x8b || zbuf[2] != 0x08) { + if (pos) + *pos = 0; + error("Not a gzip file"); + goto gunzip_5; + } + + /* skip over gzip header (1f,8b,08... 10 bytes total + + * possible asciz filename) + */ + strm->next_in = zbuf + 10; + /* skip over asciz filename */ + if (zbuf[3] & 0x8) { + while (strm->next_in[0]) + strm->next_in++; + strm->next_in++; + } + strm->avail_in = len - 10; + + strm->next_out = out_buf; + strm->avail_out = out_len; + + rc = zlib_inflateInit2(strm, -MAX_WBITS); + + if (!flush) { + WS(strm)->inflate_state.wsize = 0; + WS(strm)->inflate_state.window = NULL; + } + + while (rc == Z_OK) { + if (strm->avail_in == 0) { + /* TODO: handle case where both pos and fill are set */ + len = fill(zbuf, INBUF_LEN); + if (len < 0) { + rc = -1; + error("read error"); + break; + } + strm->next_in = zbuf; + strm->avail_in = len; + } + rc = zlib_inflate(strm, 0); + + /* Write any data generated */ + if (flush && strm->next_out > out_buf) { + int l = strm->next_out - out_buf; + if (l != flush(out_buf, l)) { + rc = -1; + error("write error"); + break; + } + strm->next_out = out_buf; + strm->avail_out = out_len; + } + + /* after Z_FINISH, only Z_STREAM_END is "we unpacked it all" */ + if (rc == Z_STREAM_END) { + rc = 0; + break; + } else if (rc != Z_OK) { + error("uncompression error"); + rc = -1; + } + } + + zlib_inflateEnd(strm); + if (pos) + /* add + 8 to skip over trailer */ + *pos = strm->next_in - zbuf+8; + +gunzip_5: + free(strm->workspace); +gunzip_nomem4: + free(strm); +gunzip_nomem3: + if (!buf) + free(zbuf); +gunzip_nomem2: + if (flush) + free(out_buf); +gunzip_nomem1: + return rc; /* returns Z_OK (0) if successful */ +} + +#define decompress gunzip diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c new file mode 100644 index 000000000000..546f2f4c157e --- /dev/null +++ b/lib/decompress_unlzma.c @@ -0,0 +1,647 @@ +/* Lzma decompressor for Linux kernel. 
Shamelessly snarfed + *from busybox 1.1.1 + * + *Linux kernel adaptation + *Copyright (C) 2006 Alain < alain@knaff.lu > + * + *Based on small lzma deflate implementation/Small range coder + *implementation for lzma. + *Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org > + * + *Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/) + *Copyright (C) 1999-2005 Igor Pavlov + * + *Copyrights of the parts, see headers below. + * + * + *This program is free software; you can redistribute it and/or + *modify it under the terms of the GNU Lesser General Public + *License as published by the Free Software Foundation; either + *version 2.1 of the License, or (at your option) any later version. + * + *This program is distributed in the hope that it will be useful, + *but WITHOUT ANY WARRANTY; without even the implied warranty of + *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + *Lesser General Public License for more details. + * + *You should have received a copy of the GNU Lesser General Public + *License along with this library; if not, write to the Free Software + *Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef STATIC +#include +#endif /* STATIC */ + +#include + +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) + +static long long INIT read_int(unsigned char *ptr, int size) +{ + int i; + long long ret = 0; + + for (i = 0; i < size; i++) + ret = (ret << 8) | ptr[size-i-1]; + return ret; +} + +#define ENDIAN_CONVERT(x) \ + x = (typeof(x))read_int((unsigned char *)&x, sizeof(x)) + + +/* Small range coder implementation for lzma. + *Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org > + * + *Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/) + *Copyright (c) 1999-2005 Igor Pavlov + */ + +#include + +#define LZMA_IOBUF_SIZE 0x10000 + +struct rc { + int (*fill)(void*, unsigned int); + uint8_t *ptr; + uint8_t *buffer; + uint8_t *buffer_end; + int buffer_size; + uint32_t code; + uint32_t range; + uint32_t bound; +}; + + +#define RC_TOP_BITS 24 +#define RC_MOVE_BITS 5 +#define RC_MODEL_TOTAL_BITS 11 + + +/* Called twice: once at startup and once in rc_normalize() */ +static void INIT rc_read(struct rc *rc) +{ + rc->buffer_size = rc->fill((char *)rc->buffer, LZMA_IOBUF_SIZE); + if (rc->buffer_size <= 0) + error("unexpected EOF"); + rc->ptr = rc->buffer; + rc->buffer_end = rc->buffer + rc->buffer_size; +} + +/* Called once */ +static inline void INIT rc_init(struct rc *rc, + int (*fill)(void*, unsigned int), + char *buffer, int buffer_size) +{ + rc->fill = fill; + rc->buffer = (uint8_t *)buffer; + rc->buffer_size = buffer_size; + rc->buffer_end = rc->buffer + rc->buffer_size; + rc->ptr = rc->buffer; + + rc->code = 0; + rc->range = 0xFFFFFFFF; +} + +static inline void INIT rc_init_code(struct rc *rc) +{ + int i; + + for (i = 0; i < 5; i++) { + if (rc->ptr >= rc->buffer_end) + rc_read(rc); + rc->code = (rc->code << 8) | *rc->ptr++; + } +} + + +/* Called once. TODO: bb_maybe_free() */ +static inline void INIT rc_free(struct rc *rc) +{ + free(rc->buffer); +} + +/* Called twice, but one callsite is in inline'd rc_is_bit_0_helper() */ +static void INIT rc_do_normalize(struct rc *rc) +{ + if (rc->ptr >= rc->buffer_end) + rc_read(rc); + rc->range <<= 8; + rc->code = (rc->code << 8) | *rc->ptr++; +} +static inline void INIT rc_normalize(struct rc *rc) +{ + if (rc->range < (1 << RC_TOP_BITS)) + rc_do_normalize(rc); +} + +/* Called 9 times */ +/* Why rc_is_bit_0_helper exists? 
+ *Because we want to always expose (rc->code < rc->bound) to optimizer + */ +static inline uint32_t INIT rc_is_bit_0_helper(struct rc *rc, uint16_t *p) +{ + rc_normalize(rc); + rc->bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS); + return rc->bound; +} +static inline int INIT rc_is_bit_0(struct rc *rc, uint16_t *p) +{ + uint32_t t = rc_is_bit_0_helper(rc, p); + return rc->code < t; +} + +/* Called ~10 times, but very small, thus inlined */ +static inline void INIT rc_update_bit_0(struct rc *rc, uint16_t *p) +{ + rc->range = rc->bound; + *p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS; +} +static inline void rc_update_bit_1(struct rc *rc, uint16_t *p) +{ + rc->range -= rc->bound; + rc->code -= rc->bound; + *p -= *p >> RC_MOVE_BITS; +} + +/* Called 4 times in unlzma loop */ +static int INIT rc_get_bit(struct rc *rc, uint16_t *p, int *symbol) +{ + if (rc_is_bit_0(rc, p)) { + rc_update_bit_0(rc, p); + *symbol *= 2; + return 0; + } else { + rc_update_bit_1(rc, p); + *symbol = *symbol * 2 + 1; + return 1; + } +} + +/* Called once */ +static inline int INIT rc_direct_bit(struct rc *rc) +{ + rc_normalize(rc); + rc->range >>= 1; + if (rc->code >= rc->range) { + rc->code -= rc->range; + return 1; + } + return 0; +} + +/* Called twice */ +static inline void INIT +rc_bit_tree_decode(struct rc *rc, uint16_t *p, int num_levels, int *symbol) +{ + int i = num_levels; + + *symbol = 1; + while (i--) + rc_get_bit(rc, p + *symbol, symbol); + *symbol -= 1 << num_levels; +} + + +/* + * Small lzma deflate implementation. + * Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org > + * + * Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/) + * Copyright (C) 1999-2005 Igor Pavlov + */ + + +struct lzma_header { + uint8_t pos; + uint32_t dict_size; + uint64_t dst_size; +} __attribute__ ((packed)) ; + + +#define LZMA_BASE_SIZE 1846 +#define LZMA_LIT_SIZE 768 + +#define LZMA_NUM_POS_BITS_MAX 4 + +#define LZMA_LEN_NUM_LOW_BITS 3 +#define LZMA_LEN_NUM_MID_BITS 3 +#define LZMA_LEN_NUM_HIGH_BITS 8 + +#define LZMA_LEN_CHOICE 0 +#define LZMA_LEN_CHOICE_2 (LZMA_LEN_CHOICE + 1) +#define LZMA_LEN_LOW (LZMA_LEN_CHOICE_2 + 1) +#define LZMA_LEN_MID (LZMA_LEN_LOW \ + + (1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_LOW_BITS))) +#define LZMA_LEN_HIGH (LZMA_LEN_MID \ + +(1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_MID_BITS))) +#define LZMA_NUM_LEN_PROBS (LZMA_LEN_HIGH + (1 << LZMA_LEN_NUM_HIGH_BITS)) + +#define LZMA_NUM_STATES 12 +#define LZMA_NUM_LIT_STATES 7 + +#define LZMA_START_POS_MODEL_INDEX 4 +#define LZMA_END_POS_MODEL_INDEX 14 +#define LZMA_NUM_FULL_DISTANCES (1 << (LZMA_END_POS_MODEL_INDEX >> 1)) + +#define LZMA_NUM_POS_SLOT_BITS 6 +#define LZMA_NUM_LEN_TO_POS_STATES 4 + +#define LZMA_NUM_ALIGN_BITS 4 + +#define LZMA_MATCH_MIN_LEN 2 + +#define LZMA_IS_MATCH 0 +#define LZMA_IS_REP (LZMA_IS_MATCH + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX)) +#define LZMA_IS_REP_G0 (LZMA_IS_REP + LZMA_NUM_STATES) +#define LZMA_IS_REP_G1 (LZMA_IS_REP_G0 + LZMA_NUM_STATES) +#define LZMA_IS_REP_G2 (LZMA_IS_REP_G1 + LZMA_NUM_STATES) +#define LZMA_IS_REP_0_LONG (LZMA_IS_REP_G2 + LZMA_NUM_STATES) +#define LZMA_POS_SLOT (LZMA_IS_REP_0_LONG \ + + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX)) +#define LZMA_SPEC_POS (LZMA_POS_SLOT \ + +(LZMA_NUM_LEN_TO_POS_STATES << LZMA_NUM_POS_SLOT_BITS)) +#define LZMA_ALIGN (LZMA_SPEC_POS \ + + LZMA_NUM_FULL_DISTANCES - LZMA_END_POS_MODEL_INDEX) +#define LZMA_LEN_CODER (LZMA_ALIGN + (1 << LZMA_NUM_ALIGN_BITS)) +#define LZMA_REP_LEN_CODER (LZMA_LEN_CODER + LZMA_NUM_LEN_PROBS) +#define 
LZMA_LITERAL (LZMA_REP_LEN_CODER + LZMA_NUM_LEN_PROBS) + + +struct writer { + uint8_t *buffer; + uint8_t previous_byte; + size_t buffer_pos; + int bufsize; + size_t global_pos; + int(*flush)(void*, unsigned int); + struct lzma_header *header; +}; + +struct cstate { + int state; + uint32_t rep0, rep1, rep2, rep3; +}; + +static inline size_t INIT get_pos(struct writer *wr) +{ + return + wr->global_pos + wr->buffer_pos; +} + +static inline uint8_t INIT peek_old_byte(struct writer *wr, + uint32_t offs) +{ + if (!wr->flush) { + int32_t pos; + while (offs > wr->header->dict_size) + offs -= wr->header->dict_size; + pos = wr->buffer_pos - offs; + return wr->buffer[pos]; + } else { + uint32_t pos = wr->buffer_pos - offs; + while (pos >= wr->header->dict_size) + pos += wr->header->dict_size; + return wr->buffer[pos]; + } + +} + +static inline void INIT write_byte(struct writer *wr, uint8_t byte) +{ + wr->buffer[wr->buffer_pos++] = wr->previous_byte = byte; + if (wr->flush && wr->buffer_pos == wr->header->dict_size) { + wr->buffer_pos = 0; + wr->global_pos += wr->header->dict_size; + wr->flush((char *)wr->buffer, wr->header->dict_size); + } +} + + +static inline void INIT copy_byte(struct writer *wr, uint32_t offs) +{ + write_byte(wr, peek_old_byte(wr, offs)); +} + +static inline void INIT copy_bytes(struct writer *wr, + uint32_t rep0, int len) +{ + do { + copy_byte(wr, rep0); + len--; + } while (len != 0 && wr->buffer_pos < wr->header->dst_size); +} + +static inline void INIT process_bit0(struct writer *wr, struct rc *rc, + struct cstate *cst, uint16_t *p, + int pos_state, uint16_t *prob, + int lc, uint32_t literal_pos_mask) { + int mi = 1; + rc_update_bit_0(rc, prob); + prob = (p + LZMA_LITERAL + + (LZMA_LIT_SIZE + * (((get_pos(wr) & literal_pos_mask) << lc) + + (wr->previous_byte >> (8 - lc)))) + ); + + if (cst->state >= LZMA_NUM_LIT_STATES) { + int match_byte = peek_old_byte(wr, cst->rep0); + do { + int bit; + uint16_t *prob_lit; + + match_byte <<= 1; + bit = match_byte & 0x100; + prob_lit = prob + 0x100 + bit + mi; + if (rc_get_bit(rc, prob_lit, &mi)) { + if (!bit) + break; + } else { + if (bit) + break; + } + } while (mi < 0x100); + } + while (mi < 0x100) { + uint16_t *prob_lit = prob + mi; + rc_get_bit(rc, prob_lit, &mi); + } + write_byte(wr, mi); + if (cst->state < 4) + cst->state = 0; + else if (cst->state < 10) + cst->state -= 3; + else + cst->state -= 6; +} + +static inline void INIT process_bit1(struct writer *wr, struct rc *rc, + struct cstate *cst, uint16_t *p, + int pos_state, uint16_t *prob) { + int offset; + uint16_t *prob_len; + int num_bits; + int len; + + rc_update_bit_1(rc, prob); + prob = p + LZMA_IS_REP + cst->state; + if (rc_is_bit_0(rc, prob)) { + rc_update_bit_0(rc, prob); + cst->rep3 = cst->rep2; + cst->rep2 = cst->rep1; + cst->rep1 = cst->rep0; + cst->state = cst->state < LZMA_NUM_LIT_STATES ? 0 : 3; + prob = p + LZMA_LEN_CODER; + } else { + rc_update_bit_1(rc, prob); + prob = p + LZMA_IS_REP_G0 + cst->state; + if (rc_is_bit_0(rc, prob)) { + rc_update_bit_0(rc, prob); + prob = (p + LZMA_IS_REP_0_LONG + + (cst->state << + LZMA_NUM_POS_BITS_MAX) + + pos_state); + if (rc_is_bit_0(rc, prob)) { + rc_update_bit_0(rc, prob); + + cst->state = cst->state < LZMA_NUM_LIT_STATES ? 
+ 9 : 11; + copy_byte(wr, cst->rep0); + return; + } else { + rc_update_bit_1(rc, prob); + } + } else { + uint32_t distance; + + rc_update_bit_1(rc, prob); + prob = p + LZMA_IS_REP_G1 + cst->state; + if (rc_is_bit_0(rc, prob)) { + rc_update_bit_0(rc, prob); + distance = cst->rep1; + } else { + rc_update_bit_1(rc, prob); + prob = p + LZMA_IS_REP_G2 + cst->state; + if (rc_is_bit_0(rc, prob)) { + rc_update_bit_0(rc, prob); + distance = cst->rep2; + } else { + rc_update_bit_1(rc, prob); + distance = cst->rep3; + cst->rep3 = cst->rep2; + } + cst->rep2 = cst->rep1; + } + cst->rep1 = cst->rep0; + cst->rep0 = distance; + } + cst->state = cst->state < LZMA_NUM_LIT_STATES ? 8 : 11; + prob = p + LZMA_REP_LEN_CODER; + } + + prob_len = prob + LZMA_LEN_CHOICE; + if (rc_is_bit_0(rc, prob_len)) { + rc_update_bit_0(rc, prob_len); + prob_len = (prob + LZMA_LEN_LOW + + (pos_state << + LZMA_LEN_NUM_LOW_BITS)); + offset = 0; + num_bits = LZMA_LEN_NUM_LOW_BITS; + } else { + rc_update_bit_1(rc, prob_len); + prob_len = prob + LZMA_LEN_CHOICE_2; + if (rc_is_bit_0(rc, prob_len)) { + rc_update_bit_0(rc, prob_len); + prob_len = (prob + LZMA_LEN_MID + + (pos_state << + LZMA_LEN_NUM_MID_BITS)); + offset = 1 << LZMA_LEN_NUM_LOW_BITS; + num_bits = LZMA_LEN_NUM_MID_BITS; + } else { + rc_update_bit_1(rc, prob_len); + prob_len = prob + LZMA_LEN_HIGH; + offset = ((1 << LZMA_LEN_NUM_LOW_BITS) + + (1 << LZMA_LEN_NUM_MID_BITS)); + num_bits = LZMA_LEN_NUM_HIGH_BITS; + } + } + + rc_bit_tree_decode(rc, prob_len, num_bits, &len); + len += offset; + + if (cst->state < 4) { + int pos_slot; + + cst->state += LZMA_NUM_LIT_STATES; + prob = + p + LZMA_POS_SLOT + + ((len < + LZMA_NUM_LEN_TO_POS_STATES ? len : + LZMA_NUM_LEN_TO_POS_STATES - 1) + << LZMA_NUM_POS_SLOT_BITS); + rc_bit_tree_decode(rc, prob, + LZMA_NUM_POS_SLOT_BITS, + &pos_slot); + if (pos_slot >= LZMA_START_POS_MODEL_INDEX) { + int i, mi; + num_bits = (pos_slot >> 1) - 1; + cst->rep0 = 2 | (pos_slot & 1); + if (pos_slot < LZMA_END_POS_MODEL_INDEX) { + cst->rep0 <<= num_bits; + prob = p + LZMA_SPEC_POS + + cst->rep0 - pos_slot - 1; + } else { + num_bits -= LZMA_NUM_ALIGN_BITS; + while (num_bits--) + cst->rep0 = (cst->rep0 << 1) | + rc_direct_bit(rc); + prob = p + LZMA_ALIGN; + cst->rep0 <<= LZMA_NUM_ALIGN_BITS; + num_bits = LZMA_NUM_ALIGN_BITS; + } + i = 1; + mi = 1; + while (num_bits--) { + if (rc_get_bit(rc, prob + mi, &mi)) + cst->rep0 |= i; + i <<= 1; + } + } else + cst->rep0 = pos_slot; + if (++(cst->rep0) == 0) + return; + } + + len += LZMA_MATCH_MIN_LEN; + + copy_bytes(wr, cst->rep0, len); +} + + + +STATIC inline int INIT unlzma(unsigned char *buf, int in_len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *posp, + void(*error_fn)(char *x) + ) +{ + struct lzma_header header; + int lc, pb, lp; + uint32_t pos_state_mask; + uint32_t literal_pos_mask; + uint16_t *p; + int num_probs; + struct rc rc; + int i, mi; + struct writer wr; + struct cstate cst; + unsigned char *inbuf; + int ret = -1; + + set_error_fn(error_fn); + if (!flush) + in_len -= 4; /* Uncompressed size hack active in pre-boot + environment */ + if (buf) + inbuf = buf; + else + inbuf = malloc(LZMA_IOBUF_SIZE); + if (!inbuf) { + error("Could not allocate input bufer"); + goto exit_0; + } + + cst.state = 0; + cst.rep0 = cst.rep1 = cst.rep2 = cst.rep3 = 1; + + wr.header = &header; + wr.flush = flush; + wr.global_pos = 0; + wr.previous_byte = 0; + wr.buffer_pos = 0; + + rc_init(&rc, fill, inbuf, in_len); + + for (i = 0; i < sizeof(header); i++) { + if (rc.ptr 
>= rc.buffer_end) + rc_read(&rc); + ((unsigned char *)&header)[i] = *rc.ptr++; + } + + if (header.pos >= (9 * 5 * 5)) + error("bad header"); + + mi = 0; + lc = header.pos; + while (lc >= 9) { + mi++; + lc -= 9; + } + pb = 0; + lp = mi; + while (lp >= 5) { + pb++; + lp -= 5; + } + pos_state_mask = (1 << pb) - 1; + literal_pos_mask = (1 << lp) - 1; + + ENDIAN_CONVERT(header.dict_size); + ENDIAN_CONVERT(header.dst_size); + + if (header.dict_size == 0) + header.dict_size = 1; + + if (output) + wr.buffer = output; + else { + wr.bufsize = MIN(header.dst_size, header.dict_size); + wr.buffer = large_malloc(wr.bufsize); + } + if (wr.buffer == NULL) + goto exit_1; + + num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp)); + p = (uint16_t *) large_malloc(num_probs * sizeof(*p)); + if (p == 0) + goto exit_2; + num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp)); + for (i = 0; i < num_probs; i++) + p[i] = (1 << RC_MODEL_TOTAL_BITS) >> 1; + + rc_init_code(&rc); + + while (get_pos(&wr) < header.dst_size) { + int pos_state = get_pos(&wr) & pos_state_mask; + uint16_t *prob = p + LZMA_IS_MATCH + + (cst.state << LZMA_NUM_POS_BITS_MAX) + pos_state; + if (rc_is_bit_0(&rc, prob)) + process_bit0(&wr, &rc, &cst, p, pos_state, prob, + lc, literal_pos_mask); + else { + process_bit1(&wr, &rc, &cst, p, pos_state, prob); + if (cst.rep0 == 0) + break; + } + } + + if (posp) + *posp = rc.ptr-rc.buffer; + if (wr.flush) + wr.flush(wr.buffer, wr.buffer_pos); + ret = 0; + large_free(p); +exit_2: + if (!output) + large_free(wr.buffer); +exit_1: + if (!buf) + free(inbuf); +exit_0: + return ret; +} + +#define decompress unlzma diff --git a/lib/zlib_inflate/inflate.h b/lib/zlib_inflate/inflate.h index df8a6c92052d..3d17b3d1b21f 100644 --- a/lib/zlib_inflate/inflate.h +++ b/lib/zlib_inflate/inflate.h @@ -1,3 +1,6 @@ +#ifndef INFLATE_H +#define INFLATE_H + /* inflate.h -- internal inflate state definition * Copyright (C) 1995-2004 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h @@ -105,3 +108,4 @@ struct inflate_state { unsigned short work[288]; /* work area for code table building */ code codes[ENOUGH]; /* space for code tables */ }; +#endif diff --git a/lib/zlib_inflate/inftrees.h b/lib/zlib_inflate/inftrees.h index 5f5219b1240e..b70b4731ac7a 100644 --- a/lib/zlib_inflate/inftrees.h +++ b/lib/zlib_inflate/inftrees.h @@ -1,3 +1,6 @@ +#ifndef INFTREES_H +#define INFTREES_H + /* inftrees.h -- header to use inftrees.c * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h @@ -53,3 +56,4 @@ typedef enum { extern int zlib_inflate_table (codetype type, unsigned short *lens, unsigned codes, code **table, unsigned *bits, unsigned short *work); +#endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index e06365775bdf..70b4676e3b99 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -186,3 +186,17 @@ quiet_cmd_gzip = GZIP $@ cmd_gzip = gzip -f -9 < $< > $@ +# Bzip2 +# --------------------------------------------------------------------------- + +# Bzip2 does not include size in file... 
so we have to fake that +size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size + +quiet_cmd_bzip2 = BZIP2 $@ +cmd_bzip2 = (bzip2 -9 < $< ; $(size_append) $<) > $@ || (rm -f $@ ; false) + +# Lzma +# --------------------------------------------------------------------------- + +quiet_cmd_lzma = LZMA $@ +cmd_lzma = (lzma -9 -c $< ; $(size_append) $<) >$@ || (rm -f $@ ; false) diff --git a/scripts/bin_size b/scripts/bin_size new file mode 100644 index 000000000000..43e1b360cee6 --- /dev/null +++ b/scripts/bin_size @@ -0,0 +1,10 @@ +#!/bin/sh + +if [ $# = 0 ] ; then + echo Usage: $0 file +fi + +size_dec=`stat -c "%s" $1` +size_hex_echo_string=`printf "%08x" $size_dec | + sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'` +/bin/echo -ne $size_hex_echo_string -- cgit v1.2.3-71-gd317 From f0402a262e1a4c03fc66b83659823bdcaac3c41a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 5 Jan 2009 23:59:01 +0900 Subject: generic: add common struct for dma map operations This adds struct dma_map_ops include/linux/dma-mapping.h, which, is used to handle multiple sets of dma mapping API. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- include/linux/dma-mapping.h | 48 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index ba9114ec5d3a..d7d090d21031 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -3,6 +3,8 @@ #include #include +#include +#include /* These definitions mirror those in pci.h, so they can be used * interchangeably with their PCI_ counterparts */ @@ -13,6 +15,52 @@ enum dma_data_direction { DMA_NONE = 3, }; +struct dma_map_ops { + void* (*alloc_coherent)(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp); + void (*free_coherent)(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle); + dma_addr_t (*map_page)(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs); + void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs); + int (*map_sg)(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs); + void (*unmap_sg)(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction dir, + struct dma_attrs *attrs); + void (*sync_single_for_cpu)(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction dir); + void (*sync_single_for_device)(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction dir); + void (*sync_single_range_for_cpu)(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, + enum dma_data_direction dir); + void (*sync_single_range_for_device)(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, + enum dma_data_direction dir); + void (*sync_sg_for_cpu)(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction dir); + void (*sync_sg_for_device)(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction dir); + int (*mapping_error)(struct device *dev, dma_addr_t dma_addr); + int (*dma_supported)(struct device *dev, u64 mask); + int is_phys; +}; + #define DMA_BIT_MASK(n) (((n) == 64) ? 
~0ULL : ((1ULL<<(n))-1)) /* -- cgit v1.2.3-71-gd317 From 160c1d8e40866edfeae7d68816b7005d70acf391 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 5 Jan 2009 23:59:02 +0900 Subject: x86, ia64: convert to use generic dma_map_ops struct This converts X86 and IA64 to use include/linux/dma-mapping.h. It's a bit large but pretty boring. The major change for X86 is converting 'int dir' to 'enum dma_data_direction dir' in DMA mapping operations. The major changes for IA64 is using map_page and unmap_page instead of map_single and unmap_single. Signed-off-by: FUJITA Tomonori Acked-by: Tony Luck Signed-off-by: Ingo Molnar --- arch/ia64/dig/Makefile | 4 +- arch/ia64/dig/dig_vtd_iommu.c | 77 ------------------- arch/ia64/hp/common/hwsw_iommu.c | 6 +- arch/ia64/hp/common/sba_iommu.c | 46 ++++++++---- arch/ia64/include/asm/dma-mapping.h | 107 ++++++++------------------ arch/ia64/include/asm/machvec.h | 12 +-- arch/ia64/kernel/dma-mapping.c | 4 +- arch/ia64/kernel/machvec.c | 8 +- arch/ia64/kernel/pci-dma.c | 49 +++++++----- arch/ia64/kernel/pci-swiotlb.c | 32 +++++--- arch/ia64/sn/pci/pci_dma.c | 58 +++++++------- arch/x86/include/asm/device.h | 2 +- arch/x86/include/asm/dma-mapping.h | 146 +++++++++++++----------------------- arch/x86/include/asm/iommu.h | 2 +- arch/x86/kernel/amd_iommu.c | 8 +- arch/x86/kernel/pci-calgary_64.c | 15 ++-- arch/x86/kernel/pci-dma.c | 4 +- arch/x86/kernel/pci-gart_64.c | 14 ++-- arch/x86/kernel/pci-nommu.c | 5 +- arch/x86/kernel/pci-swiotlb_64.c | 6 +- drivers/pci/intel-iommu.c | 9 +-- include/linux/intel-iommu.h | 6 +- include/linux/swiotlb.h | 18 +++-- lib/swiotlb.c | 18 +++-- 24 files changed, 278 insertions(+), 378 deletions(-) delete mode 100644 arch/ia64/dig/dig_vtd_iommu.c (limited to 'include/linux') diff --git a/arch/ia64/dig/Makefile b/arch/ia64/dig/Makefile index 5c0283830bd6..2f7caddf093e 100644 --- a/arch/ia64/dig/Makefile +++ b/arch/ia64/dig/Makefile @@ -7,8 +7,8 @@ obj-y := setup.o ifeq ($(CONFIG_DMAR), y) -obj-$(CONFIG_IA64_GENERIC) += machvec.o machvec_vtd.o dig_vtd_iommu.o +obj-$(CONFIG_IA64_GENERIC) += machvec.o machvec_vtd.o else obj-$(CONFIG_IA64_GENERIC) += machvec.o endif -obj-$(CONFIG_IA64_DIG_VTD) += dig_vtd_iommu.o + diff --git a/arch/ia64/dig/dig_vtd_iommu.c b/arch/ia64/dig/dig_vtd_iommu.c deleted file mode 100644 index fdb8ba9f4992..000000000000 --- a/arch/ia64/dig/dig_vtd_iommu.c +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include -#include - -void * -vtd_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flags) -{ - return intel_alloc_coherent(dev, size, dma_handle, flags); -} -EXPORT_SYMBOL_GPL(vtd_alloc_coherent); - -void -vtd_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) -{ - intel_free_coherent(dev, size, vaddr, dma_handle); -} -EXPORT_SYMBOL_GPL(vtd_free_coherent); - -dma_addr_t -vtd_map_single_attrs(struct device *dev, void *addr, size_t size, - int dir, struct dma_attrs *attrs) -{ - return intel_map_single(dev, (phys_addr_t)addr, size, dir); -} -EXPORT_SYMBOL_GPL(vtd_map_single_attrs); - -void -vtd_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size, - int dir, struct dma_attrs *attrs) -{ - intel_unmap_single(dev, iova, size, dir); -} -EXPORT_SYMBOL_GPL(vtd_unmap_single_attrs); - -int -vtd_map_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents, - int dir, struct dma_attrs *attrs) -{ - return intel_map_sg(dev, sglist, nents, dir); -} -EXPORT_SYMBOL_GPL(vtd_map_sg_attrs); - -void -vtd_unmap_sg_attrs(struct 
device *dev, struct scatterlist *sglist, - int nents, int dir, struct dma_attrs *attrs) -{ - intel_unmap_sg(dev, sglist, nents, dir); -} -EXPORT_SYMBOL_GPL(vtd_unmap_sg_attrs); - -int -vtd_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} -EXPORT_SYMBOL_GPL(vtd_dma_mapping_error); - -extern int iommu_dma_supported(struct device *dev, u64 mask); - -struct dma_mapping_ops vtd_dma_ops = { - .alloc_coherent = vtd_alloc_coherent, - .free_coherent = vtd_free_coherent, - .map_single_attrs = vtd_map_single_attrs, - .unmap_single_attrs = vtd_unmap_single_attrs, - .map_sg_attrs = vtd_map_sg_attrs, - .unmap_sg_attrs = vtd_unmap_sg_attrs, - .sync_single_for_cpu = machvec_dma_sync_single, - .sync_sg_for_cpu = machvec_dma_sync_sg, - .sync_single_for_device = machvec_dma_sync_single, - .sync_sg_for_device = machvec_dma_sync_sg, - .dma_supported_op = iommu_dma_supported, - .mapping_error = vtd_dma_mapping_error, -}; diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c index e5bbeba77810..e4a80d82e3d8 100644 --- a/arch/ia64/hp/common/hwsw_iommu.c +++ b/arch/ia64/hp/common/hwsw_iommu.c @@ -17,7 +17,7 @@ #include #include -extern struct dma_mapping_ops sba_dma_ops, swiotlb_dma_ops; +extern struct dma_map_ops sba_dma_ops, swiotlb_dma_ops; /* swiotlb declarations & definitions: */ extern int swiotlb_late_init_with_default_size (size_t size); @@ -30,10 +30,10 @@ extern int swiotlb_late_init_with_default_size (size_t size); static inline int use_swiotlb(struct device *dev) { return dev && dev->dma_mask && - !sba_dma_ops.dma_supported_op(dev, *dev->dma_mask); + !sba_dma_ops.dma_supported(dev, *dev->dma_mask); } -struct dma_mapping_ops *hwsw_dma_get_ops(struct device *dev) +struct dma_map_ops *hwsw_dma_get_ops(struct device *dev) { if (use_swiotlb(dev)) return &swiotlb_dma_ops; diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 29e7206f3dc6..129b62eb39e5 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -909,11 +909,13 @@ sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt) * * See Documentation/DMA-mapping.txt */ -static dma_addr_t -sba_map_single_attrs(struct device *dev, void *addr, size_t size, int dir, - struct dma_attrs *attrs) +static dma_addr_t sba_map_page(struct device *dev, struct page *page, + unsigned long poff, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) { struct ioc *ioc; + void *addr = page_address(page) + poff; dma_addr_t iovp; dma_addr_t offset; u64 *pdir_start; @@ -992,6 +994,14 @@ sba_map_single_attrs(struct device *dev, void *addr, size_t size, int dir, return SBA_IOVA(ioc, iovp, offset); } +static dma_addr_t sba_map_single_attrs(struct device *dev, void *addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + return sba_map_page(dev, virt_to_page(addr), + (unsigned long)addr & ~PAGE_MASK, size, dir, attrs); +} + #ifdef ENABLE_MARK_CLEAN static SBA_INLINE void sba_mark_clean(struct ioc *ioc, dma_addr_t iova, size_t size) @@ -1026,8 +1036,8 @@ sba_mark_clean(struct ioc *ioc, dma_addr_t iova, size_t size) * * See Documentation/DMA-mapping.txt */ -static void sba_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size, - int dir, struct dma_attrs *attrs) +static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) { struct ioc *ioc; #if DELAYED_RESOURCE_CNT > 0 @@ -1095,6 +1105,12 @@ static void 
sba_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t s #endif /* DELAYED_RESOURCE_CNT == 0 */ } +void sba_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + sba_unmap_page(dev, iova, size, dir, attrs); +} + /** * sba_alloc_coherent - allocate/map shared mem for DMA * @dev: instance of PCI owned by the driver that's asking. @@ -1423,7 +1439,8 @@ sba_coalesce_chunks(struct ioc *ioc, struct device *dev, * See Documentation/DMA-mapping.txt */ static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist, - int nents, int dir, struct dma_attrs *attrs) + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) { struct ioc *ioc; int coalesced, filled = 0; @@ -1514,7 +1531,8 @@ static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist, * See Documentation/DMA-mapping.txt */ static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist, - int nents, int dir, struct dma_attrs *attrs) + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) { #ifdef ASSERT_PDIR_SANITY struct ioc *ioc; @@ -2062,7 +2080,7 @@ static struct acpi_driver acpi_sba_ioc_driver = { }, }; -extern struct dma_mapping_ops swiotlb_dma_ops; +extern struct dma_map_ops swiotlb_dma_ops; static int __init sba_init(void) @@ -2176,18 +2194,18 @@ sba_page_override(char *str) __setup("sbapagesize=",sba_page_override); -struct dma_mapping_ops sba_dma_ops = { +struct dma_map_ops sba_dma_ops = { .alloc_coherent = sba_alloc_coherent, .free_coherent = sba_free_coherent, - .map_single_attrs = sba_map_single_attrs, - .unmap_single_attrs = sba_unmap_single_attrs, - .map_sg_attrs = sba_map_sg_attrs, - .unmap_sg_attrs = sba_unmap_sg_attrs, + .map_page = sba_map_page, + .unmap_page = sba_unmap_page, + .map_sg = sba_map_sg_attrs, + .unmap_sg = sba_unmap_sg_attrs, .sync_single_for_cpu = machvec_dma_sync_single, .sync_sg_for_cpu = machvec_dma_sync_sg, .sync_single_for_device = machvec_dma_sync_single, .sync_sg_for_device = machvec_dma_sync_sg, - .dma_supported_op = sba_dma_supported, + .dma_supported = sba_dma_supported, .mapping_error = sba_dma_mapping_error, }; diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index bac3159379f7..d6230f514536 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -9,73 +9,21 @@ #include #include -struct dma_mapping_ops { - int (*mapping_error)(struct device *dev, - dma_addr_t dma_addr); - void* (*alloc_coherent)(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp); - void (*free_coherent)(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle); - dma_addr_t (*map_single)(struct device *hwdev, unsigned long ptr, - size_t size, int direction); - void (*unmap_single)(struct device *dev, dma_addr_t addr, - size_t size, int direction); - dma_addr_t (*map_single_attrs)(struct device *dev, void *cpu_addr, - size_t size, int direction, - struct dma_attrs *attrs); - void (*unmap_single_attrs)(struct device *dev, - dma_addr_t dma_addr, - size_t size, int direction, - struct dma_attrs *attrs); - void (*sync_single_for_cpu)(struct device *hwdev, - dma_addr_t dma_handle, size_t size, - int direction); - void (*sync_single_for_device)(struct device *hwdev, - dma_addr_t dma_handle, size_t size, - int direction); - void (*sync_single_range_for_cpu)(struct device *hwdev, - dma_addr_t dma_handle, unsigned long offset, - size_t size, int direction); - void 
(*sync_single_range_for_device)(struct device *hwdev, - dma_addr_t dma_handle, unsigned long offset, - size_t size, int direction); - void (*sync_sg_for_cpu)(struct device *hwdev, - struct scatterlist *sg, int nelems, - int direction); - void (*sync_sg_for_device)(struct device *hwdev, - struct scatterlist *sg, int nelems, - int direction); - int (*map_sg)(struct device *hwdev, struct scatterlist *sg, - int nents, int direction); - void (*unmap_sg)(struct device *hwdev, - struct scatterlist *sg, int nents, - int direction); - int (*map_sg_attrs)(struct device *dev, - struct scatterlist *sg, int nents, - int direction, struct dma_attrs *attrs); - void (*unmap_sg_attrs)(struct device *dev, - struct scatterlist *sg, int nents, - int direction, - struct dma_attrs *attrs); - int (*dma_supported_op)(struct device *hwdev, u64 mask); - int is_phys; -}; - -extern struct dma_mapping_ops *dma_ops; +extern struct dma_map_ops *dma_ops; extern struct ia64_machine_vector ia64_mv; extern void set_iommu_machvec(void); static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *daddr, gfp_t gfp) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); + struct dma_map_ops *ops = platform_dma_get_ops(dev); return ops->alloc_coherent(dev, size, daddr, gfp | GFP_DMA); } static inline void dma_free_coherent(struct device *dev, size_t size, void *caddr, dma_addr_t daddr) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); + struct dma_map_ops *ops = platform_dma_get_ops(dev); ops->free_coherent(dev, size, caddr, daddr); } @@ -87,8 +35,10 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, enum dma_data_direction dir, struct dma_attrs *attrs) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); - return ops->map_single_attrs(dev, caddr, size, dir, attrs); + struct dma_map_ops *ops = platform_dma_get_ops(dev); + return ops->map_page(dev, virt_to_page(caddr), + (unsigned long)caddr & ~PAGE_MASK, size, + dir, attrs); } static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t daddr, @@ -96,8 +46,8 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t daddr, enum dma_data_direction dir, struct dma_attrs *attrs) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); - ops->unmap_single_attrs(dev, daddr, size, dir, attrs); + struct dma_map_ops *ops = platform_dma_get_ops(dev); + ops->unmap_page(dev, daddr, size, dir, attrs); } #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) @@ -107,8 +57,8 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, struct dma_attrs *attrs) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); - return ops->map_sg_attrs(dev, sgl, nents, dir, attrs); + struct dma_map_ops *ops = platform_dma_get_ops(dev); + return ops->map_sg(dev, sgl, nents, dir, attrs); } static inline void dma_unmap_sg_attrs(struct device *dev, @@ -116,8 +66,8 @@ static inline void dma_unmap_sg_attrs(struct device *dev, enum dma_data_direction dir, struct dma_attrs *attrs) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); - ops->unmap_sg_attrs(dev, sgl, nents, dir, attrs); + struct dma_map_ops *ops = platform_dma_get_ops(dev); + ops->unmap_sg(dev, sgl, nents, dir, attrs); } #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL) @@ -127,7 +77,7 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t daddr, size_t size, enum dma_data_direction dir) { - struct dma_mapping_ops *ops = 
platform_dma_get_ops(dev); + struct dma_map_ops *ops = platform_dma_get_ops(dev); ops->sync_single_for_cpu(dev, daddr, size, dir); } @@ -135,7 +85,7 @@ static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); + struct dma_map_ops *ops = platform_dma_get_ops(dev); ops->sync_sg_for_cpu(dev, sgl, nents, dir); } @@ -144,7 +94,7 @@ static inline void dma_sync_single_for_device(struct device *dev, size_t size, enum dma_data_direction dir) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); + struct dma_map_ops *ops = platform_dma_get_ops(dev); ops->sync_single_for_device(dev, daddr, size, dir); } @@ -153,20 +103,29 @@ static inline void dma_sync_sg_for_device(struct device *dev, int nents, enum dma_data_direction dir) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); + struct dma_map_ops *ops = platform_dma_get_ops(dev); ops->sync_sg_for_device(dev, sgl, nents, dir); } static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); + struct dma_map_ops *ops = platform_dma_get_ops(dev); return ops->mapping_error(dev, daddr); } -#define dma_map_page(dev, pg, off, size, dir) \ - dma_map_single(dev, page_address(pg) + (off), (size), (dir)) -#define dma_unmap_page(dev, dma_addr, size, dir) \ - dma_unmap_single(dev, dma_addr, size, dir) +static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = platform_dma_get_ops(dev); + return ops->map_page(dev, page, offset, size, dir, NULL); +} + +static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + dma_unmap_single(dev, addr, size, dir); +} /* * Rest of this file is part of the "Advanced DMA API". Use at your own risk. @@ -180,8 +139,8 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr) static inline int dma_supported(struct device *dev, u64 mask) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); - return ops->dma_supported_op(dev, mask); + struct dma_map_ops *ops = platform_dma_get_ops(dev); + return ops->dma_supported(dev, mask); } static inline int diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h index 95e1708fa4e3..e8442c7e4cc8 100644 --- a/arch/ia64/include/asm/machvec.h +++ b/arch/ia64/include/asm/machvec.h @@ -11,7 +11,6 @@ #define _ASM_IA64_MACHVEC_H #include -#include /* forward declarations: */ struct device; @@ -24,6 +23,7 @@ struct task_struct; struct pci_dev; struct msi_desc; struct dma_attrs; +enum dma_data_direction; typedef void ia64_mv_setup_t (char **); typedef void ia64_mv_cpu_init_t (void); @@ -45,7 +45,7 @@ typedef void ia64_mv_kernel_launch_event_t(void); /* DMA-mapping interface: */ typedef void ia64_mv_dma_init (void); -typedef struct dma_mapping_ops *ia64_mv_dma_get_ops(struct device *); +typedef struct dma_map_ops *ia64_mv_dma_get_ops(struct device *); /* * WARNING: The legacy I/O space is _architected_. 
Platforms are @@ -97,8 +97,10 @@ machvec_noop_bus (struct pci_bus *bus) extern void machvec_setup (char **); extern void machvec_timer_interrupt (int, void *); -extern void machvec_dma_sync_single (struct device *, dma_addr_t, size_t, int); -extern void machvec_dma_sync_sg (struct device *, struct scatterlist *, int, int); +extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t, + enum dma_data_direction); +extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, + enum dma_data_direction); extern void machvec_tlb_migrate_finish (struct mm_struct *); # if defined (CONFIG_IA64_HP_SIM) @@ -250,7 +252,7 @@ extern void machvec_init_from_cmdline(const char *cmdline); # endif /* CONFIG_IA64_GENERIC */ extern void swiotlb_dma_init(void); -extern struct dma_mapping_ops *dma_get_ops(struct device *); +extern struct dma_map_ops *dma_get_ops(struct device *); /* * Define default versions so we can extend machvec for new platforms without having diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 427f69617226..7060e13fa421 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -1,9 +1,9 @@ #include -struct dma_mapping_ops *dma_ops; +struct dma_map_ops *dma_ops; EXPORT_SYMBOL(dma_ops); -struct dma_mapping_ops *dma_get_ops(struct device *dev) +struct dma_map_ops *dma_get_ops(struct device *dev) { return dma_ops; } diff --git a/arch/ia64/kernel/machvec.c b/arch/ia64/kernel/machvec.c index 7ccb228ceedc..d41a40ef80c0 100644 --- a/arch/ia64/kernel/machvec.c +++ b/arch/ia64/kernel/machvec.c @@ -1,5 +1,5 @@ #include - +#include #include #include @@ -75,14 +75,16 @@ machvec_timer_interrupt (int irq, void *dev_id) EXPORT_SYMBOL(machvec_timer_interrupt); void -machvec_dma_sync_single (struct device *hwdev, dma_addr_t dma_handle, size_t size, int dir) +machvec_dma_sync_single(struct device *hwdev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction dir) { mb(); } EXPORT_SYMBOL(machvec_dma_sync_single); void -machvec_dma_sync_sg (struct device *hwdev, struct scatterlist *sg, int n, int dir) +machvec_dma_sync_sg(struct device *hwdev, struct scatterlist *sg, int n, + enum dma_data_direction dir) { mb(); } diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c index 640669eba5d4..b30209ec8c6e 100644 --- a/arch/ia64/kernel/pci-dma.c +++ b/arch/ia64/kernel/pci-dma.c @@ -41,21 +41,7 @@ struct device fallback_dev = { .dma_mask = &fallback_dev.coherent_dma_mask, }; -extern struct dma_mapping_ops vtd_dma_ops; - -void __init pci_iommu_alloc(void) -{ - dma_ops = &vtd_dma_ops; - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ - detect_intel_iommu(); - -#ifdef CONFIG_SWIOTLB - pci_swiotlb_init(); -#endif -} +extern struct dma_map_ops intel_dma_ops; static int __init pci_iommu_init(void) { @@ -81,10 +67,10 @@ iommu_dma_init(void) int iommu_dma_supported(struct device *dev, u64 mask) { - struct dma_mapping_ops *ops = platform_dma_get_ops(dev); + struct dma_map_ops *ops = platform_dma_get_ops(dev); - if (ops->dma_supported_op) - return ops->dma_supported_op(dev, mask); + if (ops->dma_supported) + return ops->dma_supported(dev, mask); /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. 
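[Editor's note: for readers following the conversion, here is a minimal sketch of how an architecture backend populates and dispatches through the generic struct dma_map_ops introduced earlier in this series. It is not part of the patches above; all my_* identifiers are hypothetical placeholders, and it assumes a trivial 1:1 (no-IOMMU) mapping, roughly what arch/x86/kernel/pci-nommu.c provides after the conversion. The field names match the struct added to include/linux/dma-mapping.h.]

/*
 * Illustrative sketch only -- not taken from this patch series.
 * A no-IOMMU backend for the generic struct dma_map_ops.
 */
#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <asm/io.h>

static void *my_alloc_coherent(struct device *dev, size_t size,
			       dma_addr_t *dma_handle, gfp_t gfp)
{
	void *vaddr = (void *)__get_free_pages(gfp, get_order(size));

	if (vaddr)
		*dma_handle = virt_to_phys(vaddr);	/* bus addr == phys addr */
	return vaddr;
}

static void my_free_coherent(struct device *dev, size_t size,
			     void *vaddr, dma_addr_t dma_handle)
{
	free_pages((unsigned long)vaddr, get_order(size));
}

static dma_addr_t my_map_page(struct device *dev, struct page *page,
			      unsigned long offset, size_t size,
			      enum dma_data_direction dir,
			      struct dma_attrs *attrs)
{
	/* No IOMMU: the device sees physical addresses directly. */
	return page_to_phys(page) + offset;
}

static void my_unmap_page(struct device *dev, dma_addr_t addr, size_t size,
			  enum dma_data_direction dir, struct dma_attrs *attrs)
{
	/* Nothing to tear down without an IOMMU. */
}

static int my_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	return 0;	/* mappings above cannot fail */
}

static struct dma_map_ops my_dma_ops = {
	.alloc_coherent	= my_alloc_coherent,
	.free_coherent	= my_free_coherent,
	.map_page	= my_map_page,
	.unmap_page	= my_unmap_page,
	.mapping_error	= my_mapping_error,
	.is_phys	= 1,
};

With an ops table like this installed (e.g. dma_ops = &my_dma_ops at boot), the generic wrappers shown in the x86 and ia64 headers below simply dispatch through get_dma_ops(dev)->map_page(), ->map_sg() and so on, which is what lets dma_map_single(), dma_map_page() and friends lose their per-architecture #ifdefs.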
@@ -113,4 +99,31 @@ int iommu_dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(iommu_dma_supported); +static int vtd_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return 0; +} + +void __init pci_iommu_alloc(void) +{ + dma_ops = &intel_dma_ops; + + dma_ops->sync_single_for_cpu = machvec_dma_sync_single; + dma_ops->sync_sg_for_cpu = machvec_dma_sync_sg; + dma_ops->sync_single_for_device = machvec_dma_sync_single; + dma_ops->sync_sg_for_device = machvec_dma_sync_sg; + dma_ops->dma_supported = iommu_dma_supported; + dma_ops->mapping_error = vtd_dma_mapping_error; + + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ + detect_intel_iommu(); + +#ifdef CONFIG_SWIOTLB + pci_swiotlb_init(); +#endif +} + #endif diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c index 9f172c864377..6bf8f66786bd 100644 --- a/arch/ia64/kernel/pci-swiotlb.c +++ b/arch/ia64/kernel/pci-swiotlb.c @@ -16,24 +16,36 @@ EXPORT_SYMBOL(swiotlb); /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly; -struct dma_mapping_ops swiotlb_dma_ops = { +static dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + return swiotlb_map_single_attrs(dev, page_address(page) + offset, size, + dir, attrs); +} + +static void swiotlb_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + swiotlb_unmap_single_attrs(dev, dma_handle, size, dir, attrs); +} + +struct dma_map_ops swiotlb_dma_ops = { .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, - .map_single = swiotlb_map_single, - .unmap_single = swiotlb_unmap_single, - .map_single_attrs = swiotlb_map_single_attrs, - .unmap_single_attrs = swiotlb_unmap_single_attrs, - .map_sg_attrs = swiotlb_map_sg_attrs, - .unmap_sg_attrs = swiotlb_unmap_sg_attrs, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, - .map_sg = swiotlb_map_sg, - .unmap_sg = swiotlb_unmap_sg, - .dma_supported_op = swiotlb_dma_supported, + .dma_supported = swiotlb_dma_supported, .mapping_error = swiotlb_dma_mapping_error, }; diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index efdd69490009..9c788f9cedfd 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include @@ -171,10 +170,12 @@ static void sn_dma_free_coherent(struct device *dev, size_t size, void *cpu_addr * TODO: simplify our interface; * figure out how to save dmamap handle so can use two step. 
*/ -static dma_addr_t sn_dma_map_single_attrs(struct device *dev, void *cpu_addr, - size_t size, int direction, - struct dma_attrs *attrs) +static dma_addr_t sn_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) { + void *cpu_addr = page_address(page) + offset; dma_addr_t dma_addr; unsigned long phys_addr; struct pci_dev *pdev = to_pci_dev(dev); @@ -212,20 +213,20 @@ static dma_addr_t sn_dma_map_single_attrs(struct device *dev, void *cpu_addr, * by @dma_handle into the coherence domain. On SN, we're always cache * coherent, so we just need to free any ATEs associated with this mapping. */ -static void sn_dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction, - struct dma_attrs *attrs) +static void sn_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { struct pci_dev *pdev = to_pci_dev(dev); struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); BUG_ON(dev->bus != &pci_bus_type); - provider->dma_unmap(pdev, dma_addr, direction); + provider->dma_unmap(pdev, dma_addr, dir); } /** - * sn_dma_unmap_sg_attrs - unmap a DMA scatterlist + * sn_dma_unmap_sg - unmap a DMA scatterlist * @dev: device to unmap * @sg: scatterlist to unmap * @nhwentries: number of scatterlist entries @@ -234,9 +235,9 @@ static void sn_dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr, * * Unmap a set of streaming mode DMA translations. */ -static void sn_dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl, - int nhwentries, int direction, - struct dma_attrs *attrs) +static void sn_dma_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nhwentries, enum dma_data_direction dir, + struct dma_attrs *attrs) { int i; struct pci_dev *pdev = to_pci_dev(dev); @@ -246,14 +247,14 @@ static void sn_dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl, BUG_ON(dev->bus != &pci_bus_type); for_each_sg(sgl, sg, nhwentries, i) { - provider->dma_unmap(pdev, sg->dma_address, direction); + provider->dma_unmap(pdev, sg->dma_address, dir); sg->dma_address = (dma_addr_t) NULL; sg->dma_length = 0; } } /** - * sn_dma_map_sg_attrs - map a scatterlist for DMA + * sn_dma_map_sg - map a scatterlist for DMA * @dev: device to map for * @sg: scatterlist to map * @nhwentries: number of entries @@ -267,8 +268,9 @@ static void sn_dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl, * * Maps each entry of @sg for DMA. */ -static int sn_dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, - int nhwentries, int direction, struct dma_attrs *attrs) +static int sn_dma_map_sg(struct device *dev, struct scatterlist *sgl, + int nhwentries, enum dma_data_direction dir, + struct dma_attrs *attrs) { unsigned long phys_addr; struct scatterlist *saved_sg = sgl, *sg; @@ -305,8 +307,7 @@ static int sn_dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, * Free any successfully allocated entries. 
*/ if (i > 0) - sn_dma_unmap_sg_attrs(dev, saved_sg, i, - direction, attrs); + sn_dma_unmap_sg(dev, saved_sg, i, dir, attrs); return 0; } @@ -317,25 +318,26 @@ static int sn_dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, } static void sn_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, int direction) + size_t size, enum dma_data_direction dir) { BUG_ON(dev->bus != &pci_bus_type); } static void sn_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, - size_t size, int direction) + size_t size, + enum dma_data_direction dir) { BUG_ON(dev->bus != &pci_bus_type); } static void sn_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, int direction) + int nelems, enum dma_data_direction dir) { BUG_ON(dev->bus != &pci_bus_type); } static void sn_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, int direction) + int nelems, enum dma_data_direction dir) { BUG_ON(dev->bus != &pci_bus_type); } @@ -455,19 +457,19 @@ int sn_pci_legacy_write(struct pci_bus *bus, u16 port, u32 val, u8 size) return ret; } -static struct dma_mapping_ops sn_dma_ops = { +static struct dma_map_ops sn_dma_ops = { .alloc_coherent = sn_dma_alloc_coherent, .free_coherent = sn_dma_free_coherent, - .map_single_attrs = sn_dma_map_single_attrs, - .unmap_single_attrs = sn_dma_unmap_single_attrs, - .map_sg_attrs = sn_dma_map_sg_attrs, - .unmap_sg_attrs = sn_dma_unmap_sg_attrs, + .map_page = sn_dma_map_page, + .unmap_page = sn_dma_unmap_page, + .map_sg = sn_dma_map_sg, + .unmap_sg = sn_dma_unmap_sg, .sync_single_for_cpu = sn_dma_sync_single_for_cpu, .sync_sg_for_cpu = sn_dma_sync_sg_for_cpu, .sync_single_for_device = sn_dma_sync_single_for_device, .sync_sg_for_device = sn_dma_sync_sg_for_device, .mapping_error = sn_dma_mapping_error, - .dma_supported_op = sn_dma_supported, + .dma_supported = sn_dma_supported, }; void sn_dma_init(void) diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 3c034f48fdb0..4994a20acbcb 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -6,7 +6,7 @@ struct dev_archdata { void *acpi_handle; #endif #ifdef CONFIG_X86_64 -struct dma_mapping_ops *dma_ops; +struct dma_map_ops *dma_ops; #endif #ifdef CONFIG_DMAR void *iommu; /* hook for IOMMU specific extension */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index b81f82268a16..5a347805a6c7 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -17,50 +17,9 @@ extern int iommu_merge; extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; -struct dma_mapping_ops { - int (*mapping_error)(struct device *dev, - dma_addr_t dma_addr); - void* (*alloc_coherent)(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp); - void (*free_coherent)(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle); - void (*sync_single_for_cpu)(struct device *hwdev, - dma_addr_t dma_handle, size_t size, - int direction); - void (*sync_single_for_device)(struct device *hwdev, - dma_addr_t dma_handle, size_t size, - int direction); - void (*sync_single_range_for_cpu)(struct device *hwdev, - dma_addr_t dma_handle, unsigned long offset, - size_t size, int direction); - void (*sync_single_range_for_device)(struct device *hwdev, - dma_addr_t dma_handle, unsigned long offset, - size_t size, int direction); - void (*sync_sg_for_cpu)(struct device *hwdev, - struct scatterlist *sg, int nelems, - int 
direction); - void (*sync_sg_for_device)(struct device *hwdev, - struct scatterlist *sg, int nelems, - int direction); - int (*map_sg)(struct device *hwdev, struct scatterlist *sg, - int nents, int direction); - void (*unmap_sg)(struct device *hwdev, - struct scatterlist *sg, int nents, - int direction); - dma_addr_t (*map_page)(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs); - void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs); - int (*dma_supported)(struct device *hwdev, u64 mask); - int is_phys; -}; - -extern struct dma_mapping_ops *dma_ops; - -static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) +extern struct dma_map_ops *dma_ops; + +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { #ifdef CONFIG_X86_32 return dma_ops; @@ -75,7 +34,7 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) /* Make sure we keep the same behaviour */ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { - struct dma_mapping_ops *ops = get_dma_ops(dev); + struct dma_map_ops *ops = get_dma_ops(dev); if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); @@ -94,138 +53,139 @@ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, static inline dma_addr_t dma_map_single(struct device *hwdev, void *ptr, size_t size, - int direction) + enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(hwdev); + struct dma_map_ops *ops = get_dma_ops(hwdev); - BUG_ON(!valid_dma_direction(direction)); + BUG_ON(!valid_dma_direction(dir)); return ops->map_page(hwdev, virt_to_page(ptr), (unsigned long)ptr & ~PAGE_MASK, size, - direction, NULL); + dir, NULL); } static inline void dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, - int direction) + enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(dev); + struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!valid_dma_direction(direction)); + BUG_ON(!valid_dma_direction(dir)); if (ops->unmap_page) - ops->unmap_page(dev, addr, size, direction, NULL); + ops->unmap_page(dev, addr, size, dir, NULL); } static inline int dma_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction) + int nents, enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(hwdev); + struct dma_map_ops *ops = get_dma_ops(hwdev); - BUG_ON(!valid_dma_direction(direction)); - return ops->map_sg(hwdev, sg, nents, direction); + BUG_ON(!valid_dma_direction(dir)); + return ops->map_sg(hwdev, sg, nents, dir, NULL); } static inline void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, - int direction) + enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(hwdev); + struct dma_map_ops *ops = get_dma_ops(hwdev); - BUG_ON(!valid_dma_direction(direction)); + BUG_ON(!valid_dma_direction(dir)); if (ops->unmap_sg) - ops->unmap_sg(hwdev, sg, nents, direction); + ops->unmap_sg(hwdev, sg, nents, dir, NULL); } static inline void dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle, - size_t size, int direction) + size_t size, enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(hwdev); + struct dma_map_ops *ops = get_dma_ops(hwdev); - BUG_ON(!valid_dma_direction(direction)); + BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(hwdev, 
dma_handle, size, direction); + ops->sync_single_for_cpu(hwdev, dma_handle, size, dir); flush_write_buffers(); } static inline void dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle, - size_t size, int direction) + size_t size, enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(hwdev); + struct dma_map_ops *ops = get_dma_ops(hwdev); - BUG_ON(!valid_dma_direction(direction)); + BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_device) - ops->sync_single_for_device(hwdev, dma_handle, size, direction); + ops->sync_single_for_device(hwdev, dma_handle, size, dir); flush_write_buffers(); } static inline void dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle, - unsigned long offset, size_t size, int direction) + unsigned long offset, size_t size, + enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(hwdev); + struct dma_map_ops *ops = get_dma_ops(hwdev); - BUG_ON(!valid_dma_direction(direction)); + BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_range_for_cpu) ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, - size, direction); + size, dir); flush_write_buffers(); } static inline void dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle, unsigned long offset, size_t size, - int direction) + enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(hwdev); + struct dma_map_ops *ops = get_dma_ops(hwdev); - BUG_ON(!valid_dma_direction(direction)); + BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_range_for_device) ops->sync_single_range_for_device(hwdev, dma_handle, - offset, size, direction); + offset, size, dir); flush_write_buffers(); } static inline void dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, int direction) + int nelems, enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(hwdev); + struct dma_map_ops *ops = get_dma_ops(hwdev); - BUG_ON(!valid_dma_direction(direction)); + BUG_ON(!valid_dma_direction(dir)); if (ops->sync_sg_for_cpu) - ops->sync_sg_for_cpu(hwdev, sg, nelems, direction); + ops->sync_sg_for_cpu(hwdev, sg, nelems, dir); flush_write_buffers(); } static inline void dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, - int nelems, int direction) + int nelems, enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(hwdev); + struct dma_map_ops *ops = get_dma_ops(hwdev); - BUG_ON(!valid_dma_direction(direction)); + BUG_ON(!valid_dma_direction(dir)); if (ops->sync_sg_for_device) - ops->sync_sg_for_device(hwdev, sg, nelems, direction); + ops->sync_sg_for_device(hwdev, sg, nelems, dir); flush_write_buffers(); } static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, - int direction) + enum dma_data_direction dir) { - struct dma_mapping_ops *ops = get_dma_ops(dev); + struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!valid_dma_direction(direction)); - return ops->map_page(dev, page, offset, size, direction, NULL); + BUG_ON(!valid_dma_direction(dir)); + return ops->map_page(dev, page, offset, size, dir, NULL); } static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, int direction) + size_t size, enum dma_data_direction dir) { - dma_unmap_single(dev, addr, size, direction); + dma_unmap_single(dev, addr, size, dir); } static inline void @@ -271,7 +231,7 @@ static inline void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, 
gfp_t gfp) { - struct dma_mapping_ops *ops = get_dma_ops(dev); + struct dma_map_ops *ops = get_dma_ops(dev); void *memory; gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); @@ -297,7 +257,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, static inline void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t bus) { - struct dma_mapping_ops *ops = get_dma_ops(dev); + struct dma_map_ops *ops = get_dma_ops(dev); WARN_ON(irqs_disabled()); /* for portability */ diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index a6ee9e6f530f..af326a2975b5 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -3,7 +3,7 @@ extern void pci_iommu_shutdown(void); extern void no_iommu_init(void); -extern struct dma_mapping_ops nommu_dma_ops; +extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a5dedb690a9a..008e522b9536 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1394,7 +1394,8 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, * lists). */ static int map_sg(struct device *dev, struct scatterlist *sglist, - int nelems, int dir) + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) { unsigned long flags; struct amd_iommu *iommu; @@ -1461,7 +1462,8 @@ unmap: * lists). */ static void unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, int dir) + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) { unsigned long flags; struct amd_iommu *iommu; @@ -1648,7 +1650,7 @@ static void prealloc_protection_domains(void) } } -static struct dma_mapping_ops amd_iommu_dma_ops = { +static struct dma_map_ops amd_iommu_dma_ops = { .alloc_coherent = alloc_coherent, .free_coherent = free_coherent, .map_page = map_page, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 756138b604e1..755c21e906f3 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -380,8 +380,9 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) return tbl; } -static void calgary_unmap_sg(struct device *dev, - struct scatterlist *sglist, int nelems, int direction) +static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems,enum dma_data_direction dir, + struct dma_attrs *attrs) { struct iommu_table *tbl = find_iommu_table(dev); struct scatterlist *s; @@ -404,7 +405,8 @@ static void calgary_unmap_sg(struct device *dev, } static int calgary_map_sg(struct device *dev, struct scatterlist *sg, - int nelems, int direction) + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) { struct iommu_table *tbl = find_iommu_table(dev); struct scatterlist *s; @@ -429,15 +431,14 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, s->dma_address = (entry << PAGE_SHIFT) | s->offset; /* insert into HW table */ - tce_build(tbl, entry, npages, vaddr & PAGE_MASK, - direction); + tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir); s->dma_length = s->length; } return nelems; error: - calgary_unmap_sg(dev, sg, nelems, direction); + calgary_unmap_sg(dev, sg, nelems, dir, NULL); for_each_sg(sg, s, nelems, i) { sg->dma_address = bad_dma_address; sg->dma_length = 0; @@ -518,7 +519,7 @@ static void calgary_free_coherent(struct device *dev, size_t size, free_pages((unsigned long)vaddr, get_order(size)); 
} -static struct dma_mapping_ops calgary_dma_ops = { +static struct dma_map_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, .free_coherent = calgary_free_coherent, .map_sg = calgary_map_sg, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 19a1044a0cd9..0d75c129b18a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -12,7 +12,7 @@ static int forbid_dac __read_mostly; -struct dma_mapping_ops *dma_ops; +struct dma_map_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -224,7 +224,7 @@ early_param("iommu", iommu_setup); int dma_supported(struct device *dev, u64 mask) { - struct dma_mapping_ops *ops = get_dma_ops(dev); + struct dma_map_ops *ops = get_dma_ops(dev); #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 9c557c0c928c..8cb3e45439cf 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -302,8 +302,8 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, /* * Wrapper for pci_unmap_single working with scatterlists. */ -static void -gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) { struct scatterlist *s; int i; @@ -333,7 +333,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, addr = dma_map_area(dev, addr, s->length, dir, 0); if (addr == bad_dma_address) { if (i > 0) - gart_unmap_sg(dev, sg, i, dir); + gart_unmap_sg(dev, sg, i, dir, NULL); nents = 0; sg[0].dma_length = 0; break; @@ -404,8 +404,8 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, * DMA map all entries in a scatterlist. * Merge chunks that have page aligned sizes into a continuous mapping. */ -static int -gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) { struct scatterlist *s, *ps, *start_sg, *sgmap; int need = 0, nextneed, i, out, start; @@ -472,7 +472,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) error: flush_gart(); - gart_unmap_sg(dev, sg, out, dir); + gart_unmap_sg(dev, sg, out, dir, NULL); /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { @@ -711,7 +711,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) return -1; } -static struct dma_mapping_ops gart_dma_ops = { +static struct dma_map_ops gart_dma_ops = { .map_sg = gart_map_sg, .unmap_sg = gart_unmap_sg, .map_page = gart_map_page, diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index d42b69c90b40..fe50214db876 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -54,7 +54,8 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, * the same here. 
*/ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction) + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) { struct scatterlist *s; int i; @@ -78,7 +79,7 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } -struct dma_mapping_ops nommu_dma_ops = { +struct dma_map_ops nommu_dma_ops = { .alloc_coherent = dma_generic_alloc_coherent, .free_coherent = nommu_free_coherent, .map_sg = nommu_map_sg, diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 3ae354c0fdef..3f0d9924dd1c 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -67,7 +67,7 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); } -struct dma_mapping_ops swiotlb_dma_ops = { +struct dma_map_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = x86_swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, @@ -77,8 +77,8 @@ struct dma_mapping_ops swiotlb_dma_ops = { .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, - .map_sg = swiotlb_map_sg, - .unmap_sg = swiotlb_unmap_sg, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, .map_page = swiotlb_map_page, .unmap_page = swiotlb_unmap_page, .dma_supported = NULL, diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index da273e4ef66c..b9a562933903 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2441,7 +2441,8 @@ void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg))) void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, - int nelems, int dir) + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) { int i; struct pci_dev *pdev = to_pci_dev(hwdev); @@ -2499,7 +2500,7 @@ static int intel_nontranslate_map_sg(struct device *hddev, } int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, - int dir) + enum dma_data_direction dir, struct dma_attrs *attrs) { void *addr; int i; @@ -2579,15 +2580,13 @@ int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, return nelems; } -static struct dma_mapping_ops intel_dma_ops = { +struct dma_map_ops intel_dma_ops = { .alloc_coherent = intel_alloc_coherent, .free_coherent = intel_free_coherent, .map_sg = intel_map_sg, .unmap_sg = intel_unmap_sg, -#ifdef CONFIG_X86_64 .map_page = intel_map_page, .unmap_page = intel_unmap_page, -#endif }; static inline int iommu_domain_cache_init(void) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index c4f6c101dbcd..a254db1decd0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -334,7 +334,9 @@ extern void *intel_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t); extern void intel_free_coherent(struct device *, size_t, void *, dma_addr_t); extern dma_addr_t intel_map_single(struct device *, phys_addr_t, size_t, int); extern void intel_unmap_single(struct device *, dma_addr_t, size_t, int); -extern int intel_map_sg(struct device *, struct scatterlist *, int, int); -extern void intel_unmap_sg(struct device *, struct scatterlist *, int, int); +extern int intel_map_sg(struct device *, struct scatterlist *, int, + enum 
dma_data_direction, struct dma_attrs *); +extern void intel_unmap_sg(struct device *, struct scatterlist *, int, + enum dma_data_direction, struct dma_attrs *); #endif diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index dedd3c0cfe30..0567c3d8633b 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -66,36 +66,38 @@ swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, extern int swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, - int dir, struct dma_attrs *attrs); + enum dma_data_direction dir, struct dma_attrs *attrs); extern void swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, int dir, struct dma_attrs *attrs); + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs); extern void swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir); + size_t size, enum dma_data_direction dir); extern void swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, int dir); + int nelems, enum dma_data_direction dir); extern void swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir); + size_t size, enum dma_data_direction dir); extern void swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, - int nelems, int dir); + int nelems, enum dma_data_direction dir); extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - unsigned long offset, size_t size, int dir); + unsigned long offset, size_t size, + enum dma_data_direction dir); extern void swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, unsigned long offset, size_t size, - int dir); + enum dma_data_direction dir); extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 48deef7e1976..d047de990a3f 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -736,7 +736,7 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, void swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir) + size_t size, enum dma_data_direction dir) { swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); } @@ -744,7 +744,7 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); void swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir) + size_t size, enum dma_data_direction dir) { swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); } @@ -769,7 +769,8 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, void swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - unsigned long offset, size_t size, int dir) + unsigned long offset, size_t size, + enum dma_data_direction dir) { swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, SYNC_FOR_CPU); @@ -778,7 +779,8 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu); void swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, - unsigned long offset, size_t size, int dir) + unsigned long offset, size_t size, + enum dma_data_direction dir) { swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, SYNC_FOR_DEVICE); @@ -803,7 +805,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); */ int swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, - int dir, struct dma_attrs *attrs) + enum dma_data_direction dir, struct dma_attrs *attrs) 
{ struct scatterlist *sg; int i; @@ -850,7 +852,7 @@ EXPORT_SYMBOL(swiotlb_map_sg); */ void swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, int dir, struct dma_attrs *attrs) + int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) { struct scatterlist *sg; int i; @@ -902,7 +904,7 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, void swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, int dir) + int nelems, enum dma_data_direction dir) { swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); } @@ -910,7 +912,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); void swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, - int nelems, int dir) + int nelems, enum dma_data_direction dir) { swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); } -- cgit v1.2.3-71-gd317 From f98eee8ea99fe74ee9c4e867ba178ec3072793be Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 5 Jan 2009 23:59:03 +0900 Subject: x86, ia64: remove duplicated swiotlb code This adds swiotlb_map_page and swiotlb_unmap_page to lib/swiotlb.c and remove IA64 and X86's swiotlb_map_page and swiotlb_unmap_page. This also removes unnecessary swiotlb_map_single, swiotlb_map_single_attrs, swiotlb_unmap_single and swiotlb_unmap_single_attrs. Signed-off-by: FUJITA Tomonori Acked-by: Tony Luck Signed-off-by: Ingo Molnar --- arch/ia64/kernel/pci-swiotlb.c | 16 -------------- arch/x86/kernel/pci-swiotlb_64.c | 17 -------------- include/linux/swiotlb.h | 21 ++++++------------ lib/swiotlb.c | 48 +++++++++++++++------------------------- 4 files changed, 25 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c index 6bf8f66786bd..e6b2ec9b27da 100644 --- a/arch/ia64/kernel/pci-swiotlb.c +++ b/arch/ia64/kernel/pci-swiotlb.c @@ -16,22 +16,6 @@ EXPORT_SYMBOL(swiotlb); /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly; -static dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - return swiotlb_map_single_attrs(dev, page_address(page) + offset, size, - dir, attrs); -} - -static void swiotlb_unmap_page(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - swiotlb_unmap_single_attrs(dev, dma_handle, size, dir, attrs); -} - struct dma_map_ops swiotlb_dma_ops = { .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 3f0d9924dd1c..5e32c4f6a7ba 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -38,23 +38,6 @@ int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) return 0; } -/* these will be moved to lib/swiotlb.c later on */ - -static dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - return swiotlb_map_single(dev, page_address(page) + offset, size, dir); -} - -static void swiotlb_unmap_page(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - swiotlb_unmap_single(dev, dma_handle, size, dir); -} - static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t 
flags) { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 0567c3d8633b..493dc17e7c87 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -41,20 +41,13 @@ extern void swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle); -extern dma_addr_t -swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir); - -extern void -swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir); - -extern dma_addr_t -swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, - int dir, struct dma_attrs *attrs); - -extern void -swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir, struct dma_attrs *attrs); +extern dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs); +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs); extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index d047de990a3f..ec7922bd0d61 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -636,11 +636,14 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) * Once the device is given the dma address, the device owns this memory until * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. */ -dma_addr_t -swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, - int dir, struct dma_attrs *attrs) -{ - dma_addr_t dev_addr = swiotlb_virt_to_bus(hwdev, ptr); +dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + void *ptr = page_address(page) + offset; + dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys); void *map; BUG_ON(dir == DMA_NONE); @@ -649,37 +652,30 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (!address_needs_mapping(hwdev, dev_addr, size) && + if (!address_needs_mapping(dev, dev_addr, size) && !range_needs_mapping(ptr, size)) return dev_addr; /* * Oh well, have to allocate and map a bounce buffer. */ - map = map_single(hwdev, virt_to_phys(ptr), size, dir); + map = map_single(dev, phys, size, dir); if (!map) { - swiotlb_full(hwdev, size, dir, 1); + swiotlb_full(dev, size, dir, 1); map = io_tlb_overflow_buffer; } - dev_addr = swiotlb_virt_to_bus(hwdev, map); + dev_addr = swiotlb_virt_to_bus(dev, map); /* * Ensure that the address returned is DMA'ble */ - if (address_needs_mapping(hwdev, dev_addr, size)) + if (address_needs_mapping(dev, dev_addr, size)) panic("map_single: bounce buffer is not DMA'ble"); return dev_addr; } -EXPORT_SYMBOL(swiotlb_map_single_attrs); - -dma_addr_t -swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) -{ - return swiotlb_map_single_attrs(hwdev, ptr, size, dir, NULL); -} -EXPORT_SYMBOL(swiotlb_map_single); +EXPORT_SYMBOL_GPL(swiotlb_map_page); /* * Unmap a single streaming mode DMA translation. The dma_addr and size must @@ -689,9 +685,9 @@ EXPORT_SYMBOL(swiotlb_map_single); * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. 
*/ -void -swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir, struct dma_attrs *attrs) +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { char *dma_addr = swiotlb_bus_to_virt(dev_addr); @@ -701,15 +697,7 @@ swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, else if (dir == DMA_FROM_DEVICE) dma_mark_clean(dma_addr, size); } -EXPORT_SYMBOL(swiotlb_unmap_single_attrs); - -void -swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, - int dir) -{ - return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL); -} -EXPORT_SYMBOL(swiotlb_unmap_single); +EXPORT_SYMBOL_GPL(swiotlb_unmap_page); /* * Make physical memory consistent for a single streaming mode DMA translation -- cgit v1.2.3-71-gd317 From d506fc322ec2af04fc47be83d796a1c9e1a16022 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 7 Jan 2009 11:54:25 +0200 Subject: ALSA: Add support for video out to the jack reporting API Add support for reporting new jack types SND_JACK_VIDEOOUT and SND_JACK_AVOUT (a combination of LINEOUT and VIDEOOUT) to the jack reporting API. Also add the corresponding SW_VIDEOOUT_INSERT switch to the input system header. Signed-off-by: Jani Nikula Signed-off-by: Mark Brown --- include/linux/input.h | 1 + include/sound/jack.h | 2 ++ sound/core/jack.c | 1 + 3 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index 9a6355f74db2..adc13322d1d2 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -661,6 +661,7 @@ struct input_absinfo { #define SW_DOCK 0x05 /* set = plugged into dock */ #define SW_LINEOUT_INSERT 0x06 /* set = inserted */ #define SW_JACK_PHYSICAL_INSERT 0x07 /* set = mechanical switch set */ +#define SW_VIDEOOUT_INSERT 0x08 /* set = inserted */ #define SW_MAX 0x0f #define SW_CNT (SW_MAX+1) diff --git a/include/sound/jack.h b/include/sound/jack.h index 85266a2f5c6f..6b013c6f6a04 100644 --- a/include/sound/jack.h +++ b/include/sound/jack.h @@ -40,6 +40,8 @@ enum snd_jack_types { SND_JACK_HEADSET = SND_JACK_HEADPHONE | SND_JACK_MICROPHONE, SND_JACK_LINEOUT = 0x0004, SND_JACK_MECHANICAL = 0x0008, /* If detected separately */ + SND_JACK_VIDEOOUT = 0x0010, + SND_JACK_AVOUT = SND_JACK_LINEOUT | SND_JACK_VIDEOOUT, }; struct snd_jack { diff --git a/sound/core/jack.c b/sound/core/jack.c index b2da10c9916a..43b10d6e522b 100644 --- a/sound/core/jack.c +++ b/sound/core/jack.c @@ -28,6 +28,7 @@ static int jack_types[] = { SW_MICROPHONE_INSERT, SW_LINEOUT_INSERT, SW_JACK_PHYSICAL_INSERT, + SW_VIDEOOUT_INSERT, }; static int snd_jack_dev_free(struct snd_device *device) -- cgit v1.2.3-71-gd317 From 01d07820a0df6b6134c1bb75b1e84c9d0cdab3be Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sun, 4 Jan 2009 03:11:05 +0900 Subject: sparseirq: make for_each_irq_desc() more robust Raja reported for_each_irq_desc() has possibility unsafeness: if anyone write folliwing code, for_each_irq_desc() doesn't work as intended: (right now this code does not exist at all) if (safe) for_each_irq_desc(irq, desc) { ... 
} else panic(); Reported-by: Raja R Harinath Signed-off-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- include/linux/irqnr.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 86af92e9e84c..52ebbb4b161d 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -28,13 +28,17 @@ extern struct irq_desc *irq_to_desc(unsigned int irq); # define for_each_irq_desc(irq, desc) \ for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; \ irq++, desc = irq_to_desc(irq)) \ - if (desc) + if (!desc) \ + ; \ + else # define for_each_irq_desc_reverse(irq, desc) \ for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; \ irq--, desc = irq_to_desc(irq)) \ - if (desc) + if (!desc) \ + ; \ + else #endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.2.3-71-gd317 From 889c92d21db40be0b7d22a59395060237895bb85 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 8 Jan 2009 15:14:17 -0800 Subject: bzip2/lzma: centralize format detection Centralize the compression format detection to a common routine in the lib directory, and use it for both initramfs and initrd. Signed-off-by: H. Peter Anvin --- include/linux/decompress/generic.h | 3 +++ init/do_mounts_rd.c | 38 ++++++----------------------- init/initramfs.c | 39 ++++++----------------------- lib/Makefile | 9 ++++--- lib/decompress.c | 50 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 72 insertions(+), 67 deletions(-) create mode 100644 lib/decompress.c (limited to 'include/linux') diff --git a/include/linux/decompress/generic.h b/include/linux/decompress/generic.h index f847f514f78e..6dfb856327bb 100644 --- a/include/linux/decompress/generic.h +++ b/include/linux/decompress/generic.h @@ -26,5 +26,8 @@ typedef int (*decompress_fn) (unsigned char *inbuf, int len, *fill should be called (repeatedly...) 
to read data, at most IOBUF_SIZE */ +/* Utility routine to detect the decompression method */ +decompress_fn decompress_method(const unsigned char *inbuf, int len, + const char **name); #endif diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 9c9d7dbcf9ca..a06ed4f92e0e 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -12,9 +12,6 @@ #include -#include -#include -#include int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */ @@ -49,24 +46,6 @@ static int __init crd_load(int in_fd, int out_fd, decompress_fn deco); * cramfs * gzip */ -static const struct compress_format { - unsigned char magic[2]; - const char *name; - decompress_fn decompressor; -} compressed_formats[] = { -#ifdef CONFIG_RD_GZIP - { {037, 0213}, "gzip", gunzip }, - { {037, 0236}, "gzip", gunzip }, -#endif -#ifdef CONFIG_RD_BZIP2 - { {0x42, 0x5a}, "bzip2", bunzip2 }, -#endif -#ifdef CONFIG_RD_LZMA - { {0x5d, 0x00}, "lzma", unlzma }, -#endif - { {0, 0}, NULL, NULL } -}; - static int __init identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) { @@ -77,7 +56,7 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) struct cramfs_super *cramfsb; int nblocks = -1; unsigned char *buf; - const struct compress_format *cf; + const char *compress_name; buf = kmalloc(size, GFP_KERNEL); if (!buf) @@ -95,15 +74,12 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) sys_lseek(fd, start_block * BLOCK_SIZE, 0); sys_read(fd, buf, size); - for (cf = compressed_formats; cf->decompressor; cf++) { - if (buf[0] == cf->magic[0] && buf[1] == cf->magic[1]) { - printk(KERN_NOTICE - "RAMDISK: %s image found at block %d\n", - cf->name, start_block); - *decompressor = cf->decompressor; - nblocks = 0; - goto done; - } + *decompressor = decompress_method(buf, size, &compress_name); + if (*decompressor) { + printk(KERN_NOTICE "RAMDISK: %s image found at block %d\n", + compress_name, start_block); + nblocks = 0; + goto done; } /* romfs is at block zero too */ diff --git a/init/initramfs.c b/init/initramfs.c index a3ba91cdab89..2f42984e5582 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -416,13 +416,13 @@ static int __init flush_buffer(void *bufv, unsigned len) static unsigned my_inptr; /* index of next byte to be processed in inbuf */ -#include -#include -#include +#include static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only) { int written; + decompress_fn decompress; + dry_run = check_only; header_buf = kmalloc(110, GFP_KERNEL); symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); @@ -450,35 +450,10 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only) continue; } this_header = 0; -#ifdef CONFIG_RD_GZIP - if (!gunzip(buf, len, NULL, flush_buffer, NULL, - &my_inptr, error) && - message == NULL) - goto ok; -#endif - -#ifdef CONFIG_RD_BZIP2 - message = NULL; /* Zero out message, or else cpio will - think an error has already occured */ - if (!bunzip2(buf, len, NULL, flush_buffer, NULL, - &my_inptr, error) && - message == NULL) { - goto ok; - } -#endif - -#ifdef CONFIG_RD_LZMA - message = NULL; /* Zero out message, or else cpio will - think an error has already occured */ - if (!unlzma(buf, len, NULL, flush_buffer, NULL, - &my_inptr, error) && - message == NULL) { - goto ok; - } -#endif -#if defined CONFIG_RD_GZIP || defined CONFIG_RD_BZIP2 || defined CONFIG_RD_LZMA -ok: -#endif + decompress = decompress_method(buf, len, NULL); + if (decompress) + decompress(buf, 
len, NULL, flush_buffer, NULL, + &my_inptr, error); if (state != Reset) error("junk in compressed archive"); this_header = saved_offset + my_inptr; diff --git a/lib/Makefile b/lib/Makefile index d9ac5a414fa7..790de7c25d0d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -11,7 +11,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ - proportions.o prio_heap.o ratelimit.o show_mem.o is_single_threaded.o + proportions.o prio_heap.o ratelimit.o show_mem.o \ + is_single_threaded.o decompress.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o @@ -65,9 +66,9 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ -obj-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o -obj-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o -obj-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o +lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o +lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o +lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o diff --git a/lib/decompress.c b/lib/decompress.c new file mode 100644 index 000000000000..edac55cc7823 --- /dev/null +++ b/lib/decompress.c @@ -0,0 +1,50 @@ +/* + * decompress.c + * + * Detect the decompression method based on magic number + */ + +#include + +#include +#include +#include + +#include +#include + +static const struct compress_format { + unsigned char magic[2]; + const char *name; + decompress_fn decompressor; +} compressed_formats[] = { +#ifdef CONFIG_DECOMPRESS_GZIP + { {037, 0213}, "gzip", gunzip }, + { {037, 0236}, "gzip", gunzip }, +#endif +#ifdef CONFIG_DECOMPRESS_BZIP2 + { {0x42, 0x5a}, "bzip2", bunzip2 }, +#endif +#ifdef CONFIG_DECOMPRESS_LZMA + { {0x5d, 0x00}, "lzma", unlzma }, +#endif + { {0, 0}, NULL, NULL } +}; + +decompress_fn decompress_method(const unsigned char *inbuf, int len, + const char **name) +{ + const struct compress_format *cf; + + if (len < 2) + return NULL; /* Need at least this much... 
*/ + + for (cf = compressed_formats; cf->decompressor; cf++) { + if (!memcmp(inbuf, cf->magic, 2)) + break; + + } + if (name) + *name = cf->name; + return cf->decompressor; +} -- cgit v1.2.3-71-gd317 From a6ba2b2dabb583e7820e567fb309d771b50cb9ff Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 8 Jan 2009 15:16:16 +0000 Subject: ASoC: Implement WM8350 headphone jack detection Signed-off-by: Mark Brown --- include/linux/mfd/wm8350/audio.h | 1 + sound/soc/codecs/wm8350.c | 116 +++++++++++++++++++++++++++++++++++++++ sound/soc/codecs/wm8350.h | 8 +++ 3 files changed, 125 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8350/audio.h b/include/linux/mfd/wm8350/audio.h index af95a1d2f3a1..d899dc0223ba 100644 --- a/include/linux/mfd/wm8350/audio.h +++ b/include/linux/mfd/wm8350/audio.h @@ -490,6 +490,7 @@ /* * R231 (0xE7) - Jack Status */ +#define WM8350_JACK_L_LVL 0x0800 #define WM8350_JACK_R_LVL 0x0400 /* diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c index e3989d406f54..47a9dabb5235 100644 --- a/sound/soc/codecs/wm8350.c +++ b/sound/soc/codecs/wm8350.c @@ -51,10 +51,17 @@ struct wm8350_output { u16 mute; }; +struct wm8350_jack_data { + struct snd_soc_jack *jack; + int report; +}; + struct wm8350_data { struct snd_soc_codec codec; struct wm8350_output out1; struct wm8350_output out2; + struct wm8350_jack_data hpl; + struct wm8350_jack_data hpr; struct regulator_bulk_data supplies[ARRAY_SIZE(supply_names)]; }; @@ -1328,6 +1335,95 @@ static int wm8350_resume(struct platform_device *pdev) return 0; } +static void wm8350_hp_jack_handler(struct wm8350 *wm8350, int irq, void *data) +{ + struct wm8350_data *priv = data; + u16 reg; + int report; + int mask; + struct wm8350_jack_data *jack = NULL; + + switch (irq) { + case WM8350_IRQ_CODEC_JCK_DET_L: + jack = &priv->hpl; + mask = WM8350_JACK_L_LVL; + break; + + case WM8350_IRQ_CODEC_JCK_DET_R: + jack = &priv->hpr; + mask = WM8350_JACK_R_LVL; + break; + + default: + BUG(); + } + + if (!jack->jack) { + dev_warn(wm8350->dev, "Jack interrupt called with no jack\n"); + return; + } + + /* Debounce */ + msleep(200); + + reg = wm8350_reg_read(wm8350, WM8350_JACK_PIN_STATUS); + if (reg & mask) + report = jack->report; + else + report = 0; + + snd_soc_jack_report(jack->jack, report, jack->report); +} + +/** + * wm8350_hp_jack_detect - Enable headphone jack detection. + * + * @codec: WM8350 codec + * @which: left or right jack detect signal + * @jack: jack to report detection events on + * @report: value to report + * + * Enables the headphone jack detection of the WM8350. 
+ */ +int wm8350_hp_jack_detect(struct snd_soc_codec *codec, enum wm8350_jack which, + struct snd_soc_jack *jack, int report) +{ + struct wm8350_data *priv = codec->private_data; + struct wm8350 *wm8350 = codec->control_data; + int irq; + int ena; + + switch (which) { + case WM8350_JDL: + priv->hpl.jack = jack; + priv->hpl.report = report; + irq = WM8350_IRQ_CODEC_JCK_DET_L; + ena = WM8350_JDL_ENA; + break; + + case WM8350_JDR: + priv->hpr.jack = jack; + priv->hpr.report = report; + irq = WM8350_IRQ_CODEC_JCK_DET_R; + ena = WM8350_JDR_ENA; + break; + + default: + return -EINVAL; + } + + wm8350_set_bits(wm8350, WM8350_POWER_MGMT_4, WM8350_TOCLK_ENA); + wm8350_set_bits(wm8350, WM8350_JACK_DETECT, ena); + + /* Sync status */ + wm8350_hp_jack_handler(wm8350, irq, priv); + + wm8350_unmask_irq(wm8350, irq); + + return 0; +} +EXPORT_SYMBOL_GPL(wm8350_hp_jack_detect); + static struct snd_soc_codec *wm8350_codec; static int wm8350_probe(struct platform_device *pdev) @@ -1381,6 +1477,13 @@ static int wm8350_probe(struct platform_device *pdev) wm8350_set_bits(wm8350, WM8350_ROUT2_VOLUME, WM8350_OUT2_VU | WM8350_OUT2R_MUTE); + wm8350_mask_irq(wm8350, WM8350_IRQ_CODEC_JCK_DET_L); + wm8350_mask_irq(wm8350, WM8350_IRQ_CODEC_JCK_DET_R); + wm8350_register_irq(wm8350, WM8350_IRQ_CODEC_JCK_DET_L, + wm8350_hp_jack_handler, priv); + wm8350_register_irq(wm8350, WM8350_IRQ_CODEC_JCK_DET_R, + wm8350_hp_jack_handler, priv); + ret = snd_soc_new_pcms(socdev, SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1); if (ret < 0) { dev_err(&pdev->dev, "failed to create pcms\n"); @@ -1411,8 +1514,21 @@ static int wm8350_remove(struct platform_device *pdev) struct snd_soc_device *socdev = platform_get_drvdata(pdev); struct snd_soc_codec *codec = socdev->codec; struct wm8350 *wm8350 = codec->control_data; + struct wm8350_data *priv = codec->private_data; int ret; + wm8350_clear_bits(wm8350, WM8350_JACK_DETECT, + WM8350_JDL_ENA | WM8350_JDR_ENA); + wm8350_clear_bits(wm8350, WM8350_POWER_MGMT_4, WM8350_TOCLK_ENA); + + wm8350_mask_irq(wm8350, WM8350_IRQ_CODEC_JCK_DET_L); + wm8350_mask_irq(wm8350, WM8350_IRQ_CODEC_JCK_DET_R); + wm8350_free_irq(wm8350, WM8350_IRQ_CODEC_JCK_DET_L); + wm8350_free_irq(wm8350, WM8350_IRQ_CODEC_JCK_DET_R); + + priv->hpl.jack = NULL; + priv->hpr.jack = NULL; + /* cancel any work waiting to be queued. 
*/ ret = cancel_delayed_work(&codec->delayed_work); diff --git a/sound/soc/codecs/wm8350.h b/sound/soc/codecs/wm8350.h index cc2887aa6c38..d11bd9288cf9 100644 --- a/sound/soc/codecs/wm8350.h +++ b/sound/soc/codecs/wm8350.h @@ -17,4 +17,12 @@ extern struct snd_soc_dai wm8350_dai; extern struct snd_soc_codec_device soc_codec_dev_wm8350; +enum wm8350_jack { + WM8350_JDL = 1, + WM8350_JDR = 2, +}; + +int wm8350_hp_jack_detect(struct snd_soc_codec *codec, enum wm8350_jack which, + struct snd_soc_jack *jack, int report); + #endif -- cgit v1.2.3-71-gd317 From d7e51e66899f95dabc89b4d4c6674a6e50fa37fc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 7 Jan 2009 15:03:13 -0800 Subject: sparseirq: make some func to be used with genirq Impact: clean up sparseirq fallout on random.c Ingo suggested to change some ifdef from SPARSE_IRQ to GENERIC_HARDIRQS so we could some #ifdef later if all arch support genirq Signed-off-by: Yinghai Lu Acked-by: Matt Mackall Signed-off-by: Ingo Molnar --- drivers/char/random.c | 2 +- drivers/pci/intr_remapping.c | 2 +- include/linux/irq.h | 6 ++---- include/linux/kernel_stat.h | 6 +++--- kernel/irq/handle.c | 7 ++++--- 5 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/random.c b/drivers/char/random.c index 7c13581ca9cd..a778918c8f42 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -558,7 +558,7 @@ struct timer_rand_state { unsigned dont_count_entropy:1; }; -#ifndef CONFIG_SPARSE_IRQ +#ifndef CONFIG_GENERIC_HARDIRQS static struct timer_rand_state *irq_timer_state[NR_IRQS]; diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index f78371b22529..3d604132a04f 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -20,7 +20,7 @@ struct irq_2_iommu { u8 irte_mask; }; -#ifdef CONFIG_SPARSE_IRQ +#ifdef CONFIG_GENERIC_HARDIRQS static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu) { struct irq_2_iommu *iommu; diff --git a/include/linux/irq.h b/include/linux/irq.h index f899b502f186..e9a878978c85 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -160,12 +160,10 @@ struct irq_2_iommu; */ struct irq_desc { unsigned int irq; -#ifdef CONFIG_SPARSE_IRQ struct timer_rand_state *timer_rand_state; unsigned int *kstat_irqs; -# ifdef CONFIG_INTR_REMAP +#ifdef CONFIG_INTR_REMAP struct irq_2_iommu *irq_2_iommu; -# endif #endif irq_flow_handler_t handle_irq; struct irq_chip *chip; @@ -202,13 +200,13 @@ extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc extern struct irq_desc irq_desc[NR_IRQS]; #else /* CONFIG_SPARSE_IRQ */ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); +#endif /* CONFIG_SPARSE_IRQ */ #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) #define kstat_incr_irqs_this_cpu(irqno, DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]++) -#endif /* CONFIG_SPARSE_IRQ */ extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 570d20413119..a3431b164bea 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,7 +28,7 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; -#ifndef CONFIG_SPARSE_IRQ +#ifndef CONFIG_GENERIC_HARDIRQS unsigned int irqs[NR_IRQS]; #endif }; @@ -41,7 +41,7 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); -#ifndef CONFIG_SPARSE_IRQ +#ifndef 
CONFIG_GENERIC_HARDIRQS #define kstat_irqs_this_cpu(irq) \ (kstat_this_cpu.irqs[irq]) @@ -55,7 +55,7 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, #endif -#ifndef CONFIG_SPARSE_IRQ +#ifndef CONFIG_GENERIC_HARDIRQS static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).irqs[irq]; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c20db0be9173..48299a8a22f8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -213,6 +213,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; +static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { struct irq_desc *desc; @@ -222,8 +223,10 @@ int __init early_irq_init(void) desc = irq_desc; count = ARRAY_SIZE(irq_desc); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { desc[i].irq = i; + desc[i].kstat_irqs = kstat_irqs_all[i]; + } return arch_early_irq_init(); } @@ -451,12 +454,10 @@ void early_init_irq_lock_class(void) } } -#ifdef CONFIG_SPARSE_IRQ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); return desc ? desc->kstat_irqs[cpu] : 0; } -#endif EXPORT_SYMBOL(kstat_irqs_cpu); -- cgit v1.2.3-71-gd317 From 0b8698ab5847cbe25775083659f00c658a8161c9 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 9 Jan 2009 18:32:09 +0000 Subject: swiotlb: range_needs_mapping should take a physical address. The swiotlb_arch_range_needs_mapping() hook should take a physical address rather than a virtual address in order to support highmem pages. Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb_64.c | 2 +- include/linux/swiotlb.h | 2 +- lib/swiotlb.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 5e32c4f6a7ba..34f12e9996ed 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -33,7 +33,7 @@ phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) return baddr; } -int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) +int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) { return 0; } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 493dc17e7c87..ac9ff54f7cb3 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -31,7 +31,7 @@ extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t address); extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); -extern int swiotlb_arch_range_needs_mapping(void *ptr, size_t size); +extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size); extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 30fe65ede2bb..31bae40830ca 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -145,7 +145,7 @@ static void *swiotlb_bus_to_virt(dma_addr_t address) return phys_to_virt(swiotlb_bus_to_phys(address)); } -int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) +int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) { return 0; } @@ -315,9 +315,9 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); } -static inline int range_needs_mapping(void *ptr, size_t size) +static inline int range_needs_mapping(phys_addr_t paddr, size_t size) { - return swiotlb_force || 
swiotlb_arch_range_needs_mapping(ptr, size); + return swiotlb_force || swiotlb_arch_range_needs_mapping(paddr, size); } static int is_swiotlb_buffer(char *addr) @@ -653,7 +653,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * buffering it. */ if (!address_needs_mapping(dev, dev_addr, size) && - !range_needs_mapping(ptr, size)) + !range_needs_mapping(virt_to_phys(ptr), size)) return dev_addr; /* @@ -804,7 +804,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, void *addr = sg_virt(sg); dma_addr_t dev_addr = swiotlb_virt_to_bus(hwdev, addr); - if (range_needs_mapping(addr, sg->length) || + if (range_needs_mapping(sg_phys(sg), sg->length) || address_needs_mapping(hwdev, dev_addr, sg->length)) { void *map = map_single(hwdev, sg_phys(sg), sg->length, dir); -- cgit v1.2.3-71-gd317 From f00012074b1a1a67d9c8603617bbbab267347ca6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:42 +0800 Subject: ftrace, ia64: Add macro for ftrace_caller Define FTRACE_ADDR. In IA64, a function pointer isn't a 'unsigned long' but a 'struct {unsigned long ip, unsigned long gp}'. Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 4 ++++ kernel/trace/ftrace.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7e..054721487574 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -126,6 +126,10 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); + +#ifndef FTRACE_ADDR +#define FTRACE_ADDR ((unsigned long)ftrace_caller) +#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern void ftrace_graph_caller(void); extern int ftrace_enable_ftrace_graph_caller(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76bb884b6e16..9f536108d3f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -455,7 +455,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; - ftrace_addr = (unsigned long)ftrace_caller; + ftrace_addr = (unsigned long)FTRACE_ADDR; ip = rec->ip; -- cgit v1.2.3-71-gd317 From 41719b03091911028116155deddc5eedf8c45e37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 15:36:26 +0100 Subject: mutex: preemption fixes The problem is that dropping the spinlock right before schedule is a voluntary preemption point and can cause a schedule, right after which we schedule again. Fix this inefficiency by keeping preemption disabled until we schedule, do this by explicity disabling preemption and providing a schedule() variant that assumes preemption is already disabled. 
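A simplified before/after sketch of the pattern this patch changes (an illustrative fragment, not the exact kernel code; the real slowpath is in the kernel/mutex.c hunks below):

	/* Before: spin_unlock_mutex() re-enables preemption, which is a
	 * voluntary preemption point, so we can be scheduled out here and
	 * then call schedule() again right afterwards. */
	spin_unlock_mutex(&lock->wait_lock, flags);
	schedule();

	/* After: an outer preempt_disable() keeps the preempt count raised
	 * across the unlock, and the new __schedule() variant is called,
	 * which assumes preemption is already disabled. */
	preempt_disable();
	spin_lock_mutex(&lock->wait_lock, flags);
	/* ... queue ourselves on lock->wait_list ... */
	spin_unlock_mutex(&lock->wait_lock, flags);
	__schedule();
	/* ... retake wait_lock, finish the acquisition, and only then: */
	preempt_enable();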
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/mutex.c | 5 ++++- kernel/sched.c | 10 +++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..9f0b372cfa6f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -328,6 +328,7 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void __schedule(void); asmlinkage void schedule(void); struct nsproxy; diff --git a/kernel/mutex.c b/kernel/mutex.c index 357c6d221efe..524ffc33dc05 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -131,6 +131,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct mutex_waiter waiter; unsigned long flags; + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); @@ -170,13 +171,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return -EINTR; } __set_task_state(task, state); /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - schedule(); + __schedule(); spin_lock_mutex(&lock->wait_lock, flags); } @@ -193,6 +195,7 @@ done: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return 0; } diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..b001c133c359 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4538,15 +4538,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4603,7 +4601,13 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: + preempt_disable(); + __schedule(); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; -- cgit v1.2.3-71-gd317 From 0d66bf6d3514b35eb6897629059443132992dbd7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Jan 2009 14:01:47 +0100 Subject: mutex: implement adaptive spinning Change mutex contention behaviour such that it will sometimes busy wait on acquisition - moving its behaviour closer to that of spinlocks. This concept got ported to mainline from the -rt tree, where it was originally implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins. Testing with Ingo's test-mutex application (http://lkml.org/lkml/2006/1/8/50) gave a 345% boost for VFS scalability on my testbox: # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 296604 # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 85870 The key criteria for the busy wait is that the lock owner has to be running on a (different) cpu. The idea is that as long as the owner is running, there is a fair chance it'll release the lock soon, and thus we'll be better off spinning instead of blocking/scheduling. 
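In rough outline, the spin side of the slowpath added below looks like this; mutex_optimistic_spin_sketch() is a hypothetical name used only for illustration (the patch open-codes the loop in __mutex_lock_common()), and the full version also handles the no-owner race against RT tasks plus the preemption bookkeeping:

	static int mutex_optimistic_spin_sketch(struct mutex *lock)
	{
		for (;;) {
			struct thread_info *owner;

			/* Others already sleep on the mutex: be fair, queue up. */
			if (!list_empty(&lock->wait_list))
				break;

			/* Spin only while the owner is running on another CPU. */
			owner = ACCESS_ONCE(lock->owner);
			if (owner && !mutex_spin_on_owner(lock, owner))
				break;

			/* Try to take the lock without ever sleeping. */
			if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
				mutex_set_owner(lock);
				return 1;
			}

			/* Compiler barrier: re-load lock->owner and friends. */
			cpu_relax();
		}
		return 0;	/* fall back to the sleep/wakeup slowpath */
	}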
Since regular mutexes (as opposed to rtmutexes) do not atomically track the owner, we add the owner in a non-atomic fashion and deal with the races in the slowpath. Furthermore, to ease the testing of the performance impact of this new code, there is means to disable this behaviour runtime (without having to reboot the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y), by issuing the following command: # echo NO_OWNER_SPIN > /debug/sched_features This command re-enables spinning again (this is also the default): # echo OWNER_SPIN > /debug/sched_features Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 5 ++- include/linux/sched.h | 1 + kernel/mutex-debug.c | 9 +--- kernel/mutex-debug.h | 18 ++++---- kernel/mutex.c | 115 +++++++++++++++++++++++++++++++++++++++++++----- kernel/mutex.h | 22 ++++++++- kernel/sched.c | 61 +++++++++++++++++++++++++ kernel/sched_features.h | 1 + 8 files changed, 201 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 7a0e5c4f8072..3069ec7e0ab8 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -50,8 +50,10 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct thread_info *owner; +#endif +#ifdef CONFIG_DEBUG_MUTEXES const char *name; void *magic; #endif @@ -68,7 +70,6 @@ struct mutex_waiter { struct list_head list; struct task_struct *task; #ifdef CONFIG_DEBUG_MUTEXES - struct mutex *lock; void *magic; #endif }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9f0b372cfa6f..c34b137cd1e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -330,6 +330,7 @@ extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void __schedule(void); asmlinkage void schedule(void); +extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 1d94160eb532..50d022e5a560 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -26,11 +26,6 @@ /* * Must be called with lock->wait_lock held. 
*/ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ - lock->owner = new_owner; -} - void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); @@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; - waiter->lock = lock; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, @@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + mutex_clear_owner(lock); } void debug_mutex_init(struct mutex *lock, const char *name, @@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name, debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->owner = NULL; lock->magic = lock; } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index babfbdfc534b..6b2d735846a5 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -13,14 +13,6 @@ /* * This must be called with lock->wait_lock held. */ -extern void -debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); - -static inline void debug_mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} - extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, @@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 524ffc33dc05..ff42e975590c 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -10,6 +10,11 @@ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and * David Howells for suggestions and improvements. * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich. + * * Also see Documentation/mutex-design.txt. */ #include @@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); debug_mutex_init(lock, name, key); } @@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock) * 'unlocked' into 'locked' state. 
*/ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); } EXPORT_SYMBOL(mutex_lock); @@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock) * The unlocking fastpath is the 0->1 transition from 'locked' * into 'unlocked' state: */ +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); } @@ -132,10 +147,71 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, unsigned long flags; preempt_disable(); + mutex_acquire(&lock->dep_map, subclass, 0, ip); +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn't track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can't do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + */ + + for (;;) { + struct thread_info *owner; + + /* + * If there are pending waiters, join them. + */ + if (!list_empty(&lock->wait_list)) + break; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + preempt_enable(); + return 0; + } + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } +#endif spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); - mutex_acquire(&lock->dep_map, subclass, 0, ip); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ @@ -185,8 +261,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! 
*/ - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - debug_mutex_set_owner(lock, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_set_owner(lock); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -222,7 +298,8 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -260,8 +337,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) wake_up_process(waiter->task); } - debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock, flags); } @@ -298,18 +373,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count); */ int __sched mutex_lock_interruptible(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_killable_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_killable); @@ -352,9 +439,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) { - debug_mutex_set_owner(lock, current_thread_info()); + mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); } + /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -380,8 +468,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) */ int __sched mutex_trylock(struct mutex *lock) { - return __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); + int ret; + + ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_trylock); diff --git a/kernel/mutex.h b/kernel/mutex.h index a075dafbb290..67578ca48f94 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -16,8 +16,26 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define debug_mutex_set_owner(lock, new_owner) do { } while (0) -#define debug_mutex_clear_owner(lock) do { } while (0) +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) diff --git a/kernel/sched.c b/kernel/sched.c index b001c133c359..589e7308c615 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4614,6 +4614,67 @@ need_resched: } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. 
+ */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ + unsigned int cpu; + struct rq *rq; + + if (!sched_feat(OWNER_SPIN)) + return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + for (;;) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? + */ + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; + + cpu_relax(); + } +out: + return 1; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption diff --git a/kernel/sched_features.h b/kernel/sched_features.h index da5d93b5d2c6..07bc02e99ab1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(OWNER_SPIN, 1) -- cgit v1.2.3-71-gd317 From 831451ac4e44d3a20b581ce726ef1d1144373f7d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 12:39:18 +0100 Subject: sched: introduce avg_wakeup Introduce a new avg_wakeup statistic. avg_wakeup is a measure of how frequently a task wakes up other tasks, it represents the average time between wakeups, with a limit of avg_runtime for when it doesn't wake up anybody. 
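To see how such a running average behaves, here is a small standalone C program; it assumes an update rule that moves the average one eighth of the way towards each new sample, which is the flavour of the scheduler's existing update_avg() helper (the exact weight and clock units are implementation details, and the sample values are made up):

	#include <stdio.h>
	#include <stdint.h>

	/* Exponential moving average: step 1/8 of the way towards the sample. */
	static void update_avg(uint64_t *avg, uint64_t sample)
	{
		int64_t diff = (int64_t)(sample - *avg);
		*avg += diff / 8;
	}

	int main(void)
	{
		/* Hypothetical gaps (in ns) between wakeups performed by a task. */
		uint64_t samples[] = { 1000, 1200, 900, 1100, 1000, 1050 };
		uint64_t avg_wakeup = 0;
		unsigned int i;

		for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
			update_avg(&avg_wakeup, samples[i]);
			printf("sample %4llu -> avg_wakeup %4llu\n",
			       (unsigned long long)samples[i],
			       (unsigned long long)avg_wakeup);
		}
		return 0;
	}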
Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched.c | 36 ++++++++++++++++++++++++++++++------ kernel/sched_debug.c | 1 + 3 files changed, 34 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..daf4e07bc978 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1046,6 +1046,9 @@ struct sched_entity { u64 exec_max; u64 slice_max; + u64 start_runtime; + u64 avg_wakeup; + u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..86f5a063f0b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1705,6 +1705,9 @@ static void update_avg(u64 *avg, u64 sample) static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { + if (wakeup) + p->se.start_runtime = p->se.sum_exec_runtime; + sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; @@ -1712,10 +1715,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - if (sleep && p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; + if (sleep) { + if (p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } else { + update_avg(&p->se.avg_wakeup, + sysctl_sched_wakeup_granularity); + } } sched_info_dequeued(p); @@ -2345,6 +2353,22 @@ out_activate: activate_task(rq, p, 1); success = 1; + /* + * Only attribute actual wakeups done by this task. + */ + if (!in_interrupt()) { + struct sched_entity *se = ¤t->se; + u64 sample = se->sum_exec_runtime; + + if (se->last_wakeup) + sample -= se->last_wakeup; + else + sample -= se->start_runtime; + update_avg(&se->avg_wakeup, sample); + + se->last_wakeup = se->sum_exec_runtime; + } + out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); @@ -2355,8 +2379,6 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: - current->se.last_wakeup = current->se.sum_exec_runtime; - task_rq_unlock(rq, &flags); return success; @@ -2386,6 +2408,8 @@ static void __sched_fork(struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; + p->se.start_runtime = 0; + p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 16eeba4e4169..2b1260f0e800 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -397,6 +397,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.vruntime); PN(se.sum_exec_runtime); PN(se.avg_overlap); + PN(se.avg_wakeup); nr_switches = p->nvcsw + p->nivcsw; -- cgit v1.2.3-71-gd317 From 34cb61359b503d7aff6447acb037a5efd6ce93b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 Jan 2009 13:36:06 +0100 Subject: sched: fix !CONFIG_SCHEDSTATS build failure Stephen Rothwell reported this linux-next build failure with !CONFIG_SCHEDSTATS: | In file included from kernel/sched.c:1703: | kernel/sched_fair.c: In function 'adaptive_gran': | kernel/sched_fair.c:1324: error: 'struct sched_entity' has no member named 'avg_wakeup' The start_runtime and avg_wakeup metrics are now not just for statistics, but also for scheduling - so they always need to be available. 
(Also move out the nr_migrations fields - for future perfcounters usage.) Reported-by: Stephen Rothwell Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index daf4e07bc978..5d56b54350a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1031,6 +1031,10 @@ struct sched_entity { u64 last_wakeup; u64 avg_overlap; + u64 start_runtime; + u64 avg_wakeup; + u64 nr_migrations; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; @@ -1046,10 +1050,6 @@ struct sched_entity { u64 exec_max; u64 slice_max; - u64 start_runtime; - u64 avg_wakeup; - - u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; -- cgit v1.2.3-71-gd317 From 74296a8ed6aa3c5bf672808ada690de7ba323ecc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 Jan 2009 17:43:50 +0100 Subject: irq: provide debug_poll_all_shared_irqs() method under CONFIG_DEBUG_SHIRQ Provide a shared interrupt debug facility under CONFIG_DEBUG_SHIRQ: it uses the existing irqpoll facilities to iterate through all registered interrupt handlers and call those which can handle shared IRQ lines. This can be handy for suspend/resume debugging: if we call this function early during resume we can trigger crashes in those drivers which have incorrect assumptions about when exactly their ISRs will be called during suspend/resume. Signed-off-by: Ingo Molnar --- include/linux/interrupt.h | 6 ++++++ kernel/irq/spurious.c | 14 +++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 9127f6b51a39..468e3a25a4a1 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -462,6 +462,12 @@ static inline void init_irq_proc(void) } #endif +#if defined(CONFIG_GENERIC_HARDIRQS) && defined(CONFIG_DEBUG_SHIRQ) +extern void debug_poll_all_shared_irqs(void); +#else +static inline void debug_poll_all_shared_irqs(void) { } +#endif + int show_interrupts(struct seq_file *p, void *v); struct irq_desc; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd364c11e56e..4d568294de3e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -104,7 +104,7 @@ static int misrouted_irq(int irq) return ok; } -static void poll_spurious_irqs(unsigned long dummy) +static void poll_all_shared_irqs(void) { struct irq_desc *desc; int i; @@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy) try_one_irq(i, desc); } +} + +static void poll_spurious_irqs(unsigned long dummy) +{ + poll_all_shared_irqs(); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } +#ifdef CONFIG_DEBUG_SHIRQ +void debug_poll_all_shared_irqs(void) +{ + poll_all_shared_irqs(); +} +#endif + /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic -- cgit v1.2.3-71-gd317 From b1818748b0cf9427e48acf9713295e829a0d715f Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:31:01 +0100 Subject: x86, ftrace, hw-branch-tracer: dump trace on oops Dump the branch trace on an oops (based on ftrace_dump_on_oops). 
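
Both this patch and the shared-IRQ helper above rely on the usual convention for optional debug hooks: when the feature is configured in, a real function does the work; otherwise a static inline stub compiles away, so call sites such as oops_begin() stay free of #ifdefs. A generic sketch of that pattern follows, with an invented config symbol and function name; the real instances in the hunks nearby are trace_hw_branch_oops() and debug_poll_all_shared_irqs().

    /* Generic sketch; CONFIG_MY_DEBUG_HOOK and my_debug_hook() are
     * hypothetical names, not real kernel symbols. */
    #ifdef CONFIG_MY_DEBUG_HOOK
    void my_debug_hook(void)                        /* real version: does the work */
    {
            /* ... flush a trace buffer, poll handlers, etc. ... */
    }
    #else
    static inline void my_debug_hook(void) { }      /* compiles away entirely */
    #endif

    void some_oops_or_resume_path(void)
    {
            my_debug_hook();        /* unconditional call; no #ifdef at the call site */
    }
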
Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 6 ++++++ include/linux/ftrace.h | 13 +++++++++++++ kernel/trace/trace.h | 1 - kernel/trace/trace_hw_branches.c | 29 ++++++++++++++++++++++------- 4 files changed, 41 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6b1f6f6f8661..077c9ea655fc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -195,6 +196,11 @@ unsigned __kprobes long oops_begin(void) int cpu; unsigned long flags; + /* notify the hw-branch tracer so it may disable tracing and + add the last trace to the trace buffer - + the earlier this happens, the more useful the trace. */ + trace_hw_branch_oops(); + oops_enter(); /* racy, but better than risking deadlock. */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 054721487574..9f7880d87c39 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -496,4 +496,17 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) #endif /* CONFIG_TRACING */ + +#ifdef CONFIG_HW_BRANCH_TRACER + +void trace_hw_branch(u64 from, u64 to); +void trace_hw_branch_oops(void); + +#else /* CONFIG_HW_BRANCH_TRACER */ + +static inline void trace_hw_branch(u64 from, u64 to) {} +static inline void trace_hw_branch_oops(void) {} + +#endif /* CONFIG_HW_BRANCH_TRACER */ + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54b72781e920..b96037d970df 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,7 +438,6 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 398195397c75..e56df2c7d679 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -40,6 +40,7 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_buffer per_cpu(buffer, smp_processor_id()) static int __read_mostly trace_hw_branches_enabled; +static struct trace_array *hw_branch_trace __read_mostly; /* @@ -128,6 +129,8 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { static int bts_trace_init(struct trace_array *tr) { + hw_branch_trace = tr; + register_hotcpu_notifier(&bts_hotcpu_notifier); tracing_reset_online_cpus(tr); bts_trace_start(tr); @@ -170,8 +173,9 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) return TRACE_TYPE_UNHANDLED; } -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +void trace_hw_branch(u64 from, u64 to) { + struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; unsigned long irq1, irq2; @@ -204,8 +208,7 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) local_irq_restore(irq1); } -static void trace_bts_at(struct trace_array *tr, - const struct bts_trace *trace, void *at) +static void trace_bts_at(const struct bts_trace *trace, void *at) { struct bts_struct bts; int err = 0; @@ -220,7 +223,7 @@ static void trace_bts_at(struct trace_array *tr, switch (bts.qualifier) { case BTS_BRANCH: - trace_hw_branch(tr, bts.variant.lbr.from, 
bts.variant.lbr.to); + trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); break; } } @@ -236,12 +239,15 @@ static void trace_bts_cpu(void *arg) const struct bts_trace *trace; unsigned char *at; - if (!this_tracer) + if (unlikely(!tr)) return; if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) return; + if (unlikely(!this_tracer)) + return; + ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) @@ -249,11 +255,11 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); out: ds_resume_bts(this_tracer); @@ -268,6 +274,15 @@ static void trace_bts_prepare(struct trace_iterator *iter) mutex_unlock(&bts_tracer_mutex); } +void trace_hw_branch_oops(void) +{ + mutex_lock(&bts_tracer_mutex); + + trace_bts_cpu(hw_branch_trace); + + mutex_unlock(&bts_tracer_mutex); +} + struct tracer bts_tracer __read_mostly = { .name = "hw-branch-tracer", -- cgit v1.2.3-71-gd317 From 5803c5122acb31ebf5f76b1a9925e2c72c4436e1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 9 Jan 2009 13:01:08 +0000 Subject: arcnet: convert to internal stats Use pre-existing network_device_stats inside network_device rather than own private structure. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/arcnet/arc-rawmode.c | 2 +- drivers/net/arcnet/arcnet.c | 38 ++++++++++---------------------- drivers/net/arcnet/capmode.c | 2 +- drivers/net/arcnet/rfc1051.c | 12 +++++----- drivers/net/arcnet/rfc1201.c | 47 ++++++++++++++++++++-------------------- include/linux/arcdevice.h | 2 -- 6 files changed, 42 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/arcnet/arc-rawmode.c b/drivers/net/arcnet/arc-rawmode.c index 3ff9affb1a91..da017cbb5f64 100644 --- a/drivers/net/arcnet/arc-rawmode.c +++ b/drivers/net/arcnet/arc-rawmode.c @@ -102,7 +102,7 @@ static void rx(struct net_device *dev, int bufnum, skb = alloc_skb(length + ARC_HDR_SIZE, GFP_ATOMIC); if (skb == NULL) { BUGMSG(D_NORMAL, "Memory squeeze, dropping packet.\n"); - lp->stats.rx_dropped++; + dev->stats.rx_dropped++; return; } skb_put(skb, length + ARC_HDR_SIZE); diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index 6b53e5ed125c..34b9a4d0da30 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -105,7 +105,6 @@ static int arcnet_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); static int arcnet_rebuild_header(struct sk_buff *skb); -static struct net_device_stats *arcnet_get_stats(struct net_device *dev); static int go_tx(struct net_device *dev); static int debug = ARCNET_DEBUG; @@ -347,7 +346,6 @@ static void arcdev_setup(struct net_device *dev) dev->stop = arcnet_close; dev->hard_start_xmit = arcnet_send_packet; dev->tx_timeout = arcnet_timeout; - dev->get_stats = arcnet_get_stats; } struct net_device *alloc_arcdev(char *name) @@ -583,8 +581,8 @@ static int arcnet_rebuild_header(struct sk_buff *skb) } else { BUGMSG(D_NORMAL, "I don't understand ethernet protocol %Xh addresses!\n", type); - lp->stats.tx_errors++; - lp->stats.tx_aborted_errors++; + dev->stats.tx_errors++; + dev->stats.tx_aborted_errors++; } /* if we couldn't resolve the address... give up. 
*/ @@ -645,7 +643,7 @@ static int arcnet_send_packet(struct sk_buff *skb, struct net_device *dev) !proto->ack_tx) { /* done right away and we don't want to acknowledge the package later - forget about it now */ - lp->stats.tx_bytes += skb->len; + dev->stats.tx_bytes += skb->len; freeskb = 1; } else { /* do it the 'split' way */ @@ -709,7 +707,7 @@ static int go_tx(struct net_device *dev) /* start sending */ ACOMMAND(TXcmd | (lp->cur_tx << 3)); - lp->stats.tx_packets++; + dev->stats.tx_packets++; lp->lasttrans_dest = lp->lastload_dest; lp->lastload_dest = 0; lp->excnak_pending = 0; @@ -732,11 +730,11 @@ static void arcnet_timeout(struct net_device *dev) msg = " - missed IRQ?"; } else { msg = ""; - lp->stats.tx_aborted_errors++; + dev->stats.tx_aborted_errors++; lp->timed_out = 1; ACOMMAND(NOTXcmd | (lp->cur_tx << 3)); } - lp->stats.tx_errors++; + dev->stats.tx_errors++; /* make sure we didn't miss a TX or a EXC NAK IRQ */ AINTMASK(0); @@ -865,8 +863,8 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) "transmit was not acknowledged! " "(status=%Xh, dest=%02Xh)\n", status, lp->lasttrans_dest); - lp->stats.tx_errors++; - lp->stats.tx_carrier_errors++; + dev->stats.tx_errors++; + dev->stats.tx_carrier_errors++; } else { BUGMSG(D_DURING, "broadcast was not acknowledged; that's normal " @@ -905,7 +903,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) if (txbuf != -1) { if (lp->outgoing.proto->continue_tx(dev, txbuf)) { /* that was the last segment */ - lp->stats.tx_bytes += lp->outgoing.skb->len; + dev->stats.tx_bytes += lp->outgoing.skb->len; if(!lp->outgoing.proto->ack_tx) { dev_kfree_skb_irq(lp->outgoing.skb); @@ -930,7 +928,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) } if (status & lp->intmask & RECONflag) { ACOMMAND(CFLAGScmd | CONFIGclear); - lp->stats.tx_carrier_errors++; + dev->stats.tx_carrier_errors++; BUGMSG(D_RECON, "Network reconfiguration detected (status=%Xh)\n", status); @@ -1038,8 +1036,8 @@ static void arcnet_rx(struct net_device *dev, int bufnum) "(%d+4 bytes)\n", bufnum, pkt.hard.source, pkt.hard.dest, length); - lp->stats.rx_packets++; - lp->stats.rx_bytes += length + ARC_HDR_SIZE; + dev->stats.rx_packets++; + dev->stats.rx_bytes += length + ARC_HDR_SIZE; /* call the right receiver for the protocol */ if (arc_proto_map[soft->proto]->is_ip) { @@ -1067,18 +1065,6 @@ static void arcnet_rx(struct net_device *dev, int bufnum) } - -/* - * Get the current statistics. This may be called with the card open or - * closed. 
- */ -static struct net_device_stats *arcnet_get_stats(struct net_device *dev) -{ - struct arcnet_local *lp = netdev_priv(dev); - return &lp->stats; -} - - static void null_rx(struct net_device *dev, int bufnum, struct archdr *pkthdr, int length) { diff --git a/drivers/net/arcnet/capmode.c b/drivers/net/arcnet/capmode.c index 30580bbe252d..1613929ff301 100644 --- a/drivers/net/arcnet/capmode.c +++ b/drivers/net/arcnet/capmode.c @@ -119,7 +119,7 @@ static void rx(struct net_device *dev, int bufnum, skb = alloc_skb(length + ARC_HDR_SIZE + sizeof(int), GFP_ATOMIC); if (skb == NULL) { BUGMSG(D_NORMAL, "Memory squeeze, dropping packet.\n"); - lp->stats.rx_dropped++; + dev->stats.rx_dropped++; return; } skb_put(skb, length + ARC_HDR_SIZE + sizeof(int)); diff --git a/drivers/net/arcnet/rfc1051.c b/drivers/net/arcnet/rfc1051.c index 49d39a9cb696..06f8fa2f8f2f 100644 --- a/drivers/net/arcnet/rfc1051.c +++ b/drivers/net/arcnet/rfc1051.c @@ -88,7 +88,6 @@ MODULE_LICENSE("GPL"); */ static __be16 type_trans(struct sk_buff *skb, struct net_device *dev) { - struct arcnet_local *lp = netdev_priv(dev); struct archdr *pkt = (struct archdr *) skb->data; struct arc_rfc1051 *soft = &pkt->soft.rfc1051; int hdr_size = ARC_HDR_SIZE + RFC1051_HDR_SIZE; @@ -112,8 +111,8 @@ static __be16 type_trans(struct sk_buff *skb, struct net_device *dev) return htons(ETH_P_ARP); default: - lp->stats.rx_errors++; - lp->stats.rx_crc_errors++; + dev->stats.rx_errors++; + dev->stats.rx_crc_errors++; return 0; } @@ -140,7 +139,7 @@ static void rx(struct net_device *dev, int bufnum, skb = alloc_skb(length + ARC_HDR_SIZE, GFP_ATOMIC); if (skb == NULL) { BUGMSG(D_NORMAL, "Memory squeeze, dropping packet.\n"); - lp->stats.rx_dropped++; + dev->stats.rx_dropped++; return; } skb_put(skb, length + ARC_HDR_SIZE); @@ -168,7 +167,6 @@ static void rx(struct net_device *dev, int bufnum, static int build_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, uint8_t daddr) { - struct arcnet_local *lp = netdev_priv(dev); int hdr_size = ARC_HDR_SIZE + RFC1051_HDR_SIZE; struct archdr *pkt = (struct archdr *) skb_push(skb, hdr_size); struct arc_rfc1051 *soft = &pkt->soft.rfc1051; @@ -184,8 +182,8 @@ static int build_header(struct sk_buff *skb, struct net_device *dev, default: BUGMSG(D_NORMAL, "RFC1051: I don't understand protocol %d (%Xh)\n", type, type); - lp->stats.tx_errors++; - lp->stats.tx_aborted_errors++; + dev->stats.tx_errors++; + dev->stats.tx_aborted_errors++; return 0; } diff --git a/drivers/net/arcnet/rfc1201.c b/drivers/net/arcnet/rfc1201.c index 2303d3a1f4b6..745530651c45 100644 --- a/drivers/net/arcnet/rfc1201.c +++ b/drivers/net/arcnet/rfc1201.c @@ -92,7 +92,6 @@ static __be16 type_trans(struct sk_buff *skb, struct net_device *dev) { struct archdr *pkt = (struct archdr *) skb->data; struct arc_rfc1201 *soft = &pkt->soft.rfc1201; - struct arcnet_local *lp = netdev_priv(dev); int hdr_size = ARC_HDR_SIZE + RFC1201_HDR_SIZE; /* Pull off the arcnet header. 
*/ @@ -121,8 +120,8 @@ static __be16 type_trans(struct sk_buff *skb, struct net_device *dev) case ARC_P_NOVELL_EC: return htons(ETH_P_802_3); default: - lp->stats.rx_errors++; - lp->stats.rx_crc_errors++; + dev->stats.rx_errors++; + dev->stats.rx_crc_errors++; return 0; } @@ -172,8 +171,8 @@ static void rx(struct net_device *dev, int bufnum, in->sequence, soft->split_flag, soft->sequence); lp->rfc1201.aborted_seq = soft->sequence; dev_kfree_skb_irq(in->skb); - lp->stats.rx_errors++; - lp->stats.rx_missed_errors++; + dev->stats.rx_errors++; + dev->stats.rx_missed_errors++; in->skb = NULL; } in->sequence = soft->sequence; @@ -181,7 +180,7 @@ static void rx(struct net_device *dev, int bufnum, skb = alloc_skb(length + ARC_HDR_SIZE, GFP_ATOMIC); if (skb == NULL) { BUGMSG(D_NORMAL, "Memory squeeze, dropping packet.\n"); - lp->stats.rx_dropped++; + dev->stats.rx_dropped++; return; } skb_put(skb, length + ARC_HDR_SIZE); @@ -213,7 +212,7 @@ static void rx(struct net_device *dev, int bufnum, BUGMSG(D_EXTRA, "ARP source address was 00h, set to %02Xh.\n", saddr); - lp->stats.rx_crc_errors++; + dev->stats.rx_crc_errors++; *cptr = saddr; } else { BUGMSG(D_DURING, "ARP source address (%Xh) is fine.\n", @@ -222,8 +221,8 @@ static void rx(struct net_device *dev, int bufnum, } else { BUGMSG(D_NORMAL, "funny-shaped ARP packet. (%Xh, %Xh)\n", arp->ar_hln, arp->ar_pln); - lp->stats.rx_errors++; - lp->stats.rx_crc_errors++; + dev->stats.rx_errors++; + dev->stats.rx_crc_errors++; } } BUGLVL(D_SKB) arcnet_dump_skb(dev, skb, "rx"); @@ -257,8 +256,8 @@ static void rx(struct net_device *dev, int bufnum, soft->split_flag); dev_kfree_skb_irq(in->skb); in->skb = NULL; - lp->stats.rx_errors++; - lp->stats.rx_missed_errors++; + dev->stats.rx_errors++; + dev->stats.rx_missed_errors++; in->lastpacket = in->numpackets = 0; } if (soft->split_flag & 1) { /* first packet in split */ @@ -269,8 +268,8 @@ static void rx(struct net_device *dev, int bufnum, "(splitflag=%d, seq=%d)\n", in->sequence, soft->split_flag, soft->sequence); - lp->stats.rx_errors++; - lp->stats.rx_missed_errors++; + dev->stats.rx_errors++; + dev->stats.rx_missed_errors++; dev_kfree_skb_irq(in->skb); } in->sequence = soft->sequence; @@ -281,8 +280,8 @@ static void rx(struct net_device *dev, int bufnum, BUGMSG(D_EXTRA, "incoming packet more than 16 segments; dropping. (splitflag=%d)\n", soft->split_flag); lp->rfc1201.aborted_seq = soft->sequence; - lp->stats.rx_errors++; - lp->stats.rx_length_errors++; + dev->stats.rx_errors++; + dev->stats.rx_length_errors++; return; } in->skb = skb = alloc_skb(508 * in->numpackets + ARC_HDR_SIZE, @@ -290,7 +289,7 @@ static void rx(struct net_device *dev, int bufnum, if (skb == NULL) { BUGMSG(D_NORMAL, "(split) memory squeeze, dropping packet.\n"); lp->rfc1201.aborted_seq = soft->sequence; - lp->stats.rx_dropped++; + dev->stats.rx_dropped++; return; } skb->dev = dev; @@ -314,8 +313,8 @@ static void rx(struct net_device *dev, int bufnum, "first! (splitflag=%d, seq=%d, aborted=%d)\n", soft->split_flag, soft->sequence, lp->rfc1201.aborted_seq); - lp->stats.rx_errors++; - lp->stats.rx_missed_errors++; + dev->stats.rx_errors++; + dev->stats.rx_missed_errors++; } return; } @@ -325,8 +324,8 @@ static void rx(struct net_device *dev, int bufnum, if (packetnum <= in->lastpacket - 1) { BUGMSG(D_EXTRA, "duplicate splitpacket ignored! 
(splitflag=%d)\n", soft->split_flag); - lp->stats.rx_errors++; - lp->stats.rx_frame_errors++; + dev->stats.rx_errors++; + dev->stats.rx_frame_errors++; return; } /* "bad" duplicate, kill reassembly */ @@ -336,8 +335,8 @@ static void rx(struct net_device *dev, int bufnum, lp->rfc1201.aborted_seq = soft->sequence; dev_kfree_skb_irq(in->skb); in->skb = NULL; - lp->stats.rx_errors++; - lp->stats.rx_missed_errors++; + dev->stats.rx_errors++; + dev->stats.rx_missed_errors++; in->lastpacket = in->numpackets = 0; return; } @@ -404,8 +403,8 @@ static int build_header(struct sk_buff *skb, struct net_device *dev, default: BUGMSG(D_NORMAL, "RFC1201: I don't understand protocol %d (%Xh)\n", type, type); - lp->stats.tx_errors++; - lp->stats.tx_aborted_errors++; + dev->stats.tx_errors++; + dev->stats.tx_aborted_errors++; return 0; } diff --git a/include/linux/arcdevice.h b/include/linux/arcdevice.h index a1916078fd08..ef0d6b7df44c 100644 --- a/include/linux/arcdevice.h +++ b/include/linux/arcdevice.h @@ -235,8 +235,6 @@ struct Outgoing { struct arcnet_local { - struct net_device_stats stats; - uint8_t config, /* current value of CONFIG register */ timeout, /* Extended timeout for COM20020 */ backplane, /* Backplane flag for COM20020 */ -- cgit v1.2.3-71-gd317 From bca5b8939f107e498b3fdc92b3a2d286a868d347 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 9 Jan 2009 13:01:09 +0000 Subject: arcnet: convert to net_device_ops Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/arcnet/arcnet.c | 33 ++++++++++++++++----------------- include/linux/arcdevice.h | 7 ++++++- 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index 34b9a4d0da30..a80d4a30a464 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -95,12 +95,12 @@ EXPORT_SYMBOL(arcnet_unregister_proto); EXPORT_SYMBOL(arcnet_debug); EXPORT_SYMBOL(alloc_arcdev); EXPORT_SYMBOL(arcnet_interrupt); +EXPORT_SYMBOL(arcnet_open); +EXPORT_SYMBOL(arcnet_close); +EXPORT_SYMBOL(arcnet_send_packet); +EXPORT_SYMBOL(arcnet_timeout); /* Internal function prototypes */ -static int arcnet_open(struct net_device *dev); -static int arcnet_close(struct net_device *dev); -static int arcnet_send_packet(struct sk_buff *skb, struct net_device *dev); -static void arcnet_timeout(struct net_device *dev); static int arcnet_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); @@ -321,11 +321,18 @@ static const struct header_ops arcnet_header_ops = { .rebuild = arcnet_rebuild_header, }; +static const struct net_device_ops arcnet_netdev_ops = { + .ndo_open = arcnet_open, + .ndo_stop = arcnet_close, + .ndo_start_xmit = arcnet_send_packet, + .ndo_tx_timeout = arcnet_timeout, +}; /* Setup a struct device for ARCnet. */ static void arcdev_setup(struct net_device *dev) { dev->type = ARPHRD_ARCNET; + dev->netdev_ops = &arcnet_netdev_ops; dev->header_ops = &arcnet_header_ops; dev->hard_header_len = sizeof(struct archdr); dev->mtu = choose_mtu(); @@ -338,17 +345,9 @@ static void arcdev_setup(struct net_device *dev) /* New-style flags. */ dev->flags = IFF_BROADCAST; - /* - * Put in this stuff here, so we don't have to export the symbols to - * the chipset drivers. 
- */ - dev->open = arcnet_open; - dev->stop = arcnet_close; - dev->hard_start_xmit = arcnet_send_packet; - dev->tx_timeout = arcnet_timeout; } -struct net_device *alloc_arcdev(char *name) +struct net_device *alloc_arcdev(const char *name) { struct net_device *dev; @@ -370,7 +369,7 @@ struct net_device *alloc_arcdev(char *name) * that "should" only need to be set once at boot, so that there is * non-reboot way to recover if something goes wrong. */ -static int arcnet_open(struct net_device *dev) +int arcnet_open(struct net_device *dev) { struct arcnet_local *lp = netdev_priv(dev); int count, newmtu, error; @@ -470,7 +469,7 @@ static int arcnet_open(struct net_device *dev) /* The inverse routine to arcnet_open - shuts down the card. */ -static int arcnet_close(struct net_device *dev) +int arcnet_close(struct net_device *dev) { struct arcnet_local *lp = netdev_priv(dev); @@ -599,7 +598,7 @@ static int arcnet_rebuild_header(struct sk_buff *skb) /* Called by the kernel in order to transmit a packet. */ -static int arcnet_send_packet(struct sk_buff *skb, struct net_device *dev) +int arcnet_send_packet(struct sk_buff *skb, struct net_device *dev) { struct arcnet_local *lp = netdev_priv(dev); struct archdr *pkt; @@ -718,7 +717,7 @@ static int go_tx(struct net_device *dev) /* Called by the kernel when transmit times out */ -static void arcnet_timeout(struct net_device *dev) +void arcnet_timeout(struct net_device *dev) { unsigned long flags; struct arcnet_local *lp = netdev_priv(dev); diff --git a/include/linux/arcdevice.h b/include/linux/arcdevice.h index ef0d6b7df44c..cd4bcb6989ce 100644 --- a/include/linux/arcdevice.h +++ b/include/linux/arcdevice.h @@ -333,7 +333,12 @@ void arcnet_dump_skb(struct net_device *dev, struct sk_buff *skb, char *desc); void arcnet_unregister_proto(struct ArcProto *proto); irqreturn_t arcnet_interrupt(int irq, void *dev_id); -struct net_device *alloc_arcdev(char *name); +struct net_device *alloc_arcdev(const char *name); + +int arcnet_open(struct net_device *dev); +int arcnet_close(struct net_device *dev); +int arcnet_send_packet(struct sk_buff *skb, struct net_device *dev); +void arcnet_timeout(struct net_device *dev); #endif /* __KERNEL__ */ #endif /* _LINUX_ARCDEVICE_H */ -- cgit v1.2.3-71-gd317 From a1799af4d7deefccdaa9d222a886fa1373dbb49a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 9 Jan 2009 13:01:10 +0000 Subject: com20020: convert to net_devic_ops Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- drivers/net/arcnet/com20020-isa.c | 2 ++ drivers/net/arcnet/com20020-pci.c | 3 +++ drivers/net/arcnet/com20020.c | 10 ++++++++-- include/linux/com20020.h | 1 + 4 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/arcnet/com20020-isa.c b/drivers/net/arcnet/com20020-isa.c index ea53a940272f..db08fc24047a 100644 --- a/drivers/net/arcnet/com20020-isa.c +++ b/drivers/net/arcnet/com20020-isa.c @@ -151,6 +151,8 @@ static int __init com20020_init(void) if (node && node != 0xff) dev->dev_addr[0] = node; + dev->netdev_ops = &com20020_netdev_ops; + lp = netdev_priv(dev); lp->backplane = backplane; lp->clockp = clockp & 7; diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 8b51f632581d..dbf4de39754d 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -72,6 +72,9 @@ static int __devinit com20020pci_probe(struct pci_dev *pdev, const struct pci_de dev = alloc_arcdev(device); if (!dev) return -ENOMEM; + + dev->netdev_ops = &com20020_netdev_ops; + lp = netdev_priv(dev); pci_set_drvdata(pdev, dev); diff --git a/drivers/net/arcnet/com20020.c b/drivers/net/arcnet/com20020.c index 103688358fb8..bbe8f2ccdadb 100644 --- a/drivers/net/arcnet/com20020.c +++ b/drivers/net/arcnet/com20020.c @@ -149,6 +149,14 @@ int com20020_check(struct net_device *dev) return 0; } +const struct net_device_ops com20020_netdev_ops = { + .ndo_open = arcnet_open, + .ndo_stop = arcnet_close, + .ndo_start_xmit = arcnet_send_packet, + .ndo_tx_timeout = arcnet_timeout, + .ndo_set_multicast_list = com20020_set_mc_list, +}; + /* Set up the struct net_device associated with this card. Called after * probing succeeds. */ @@ -170,8 +178,6 @@ int com20020_found(struct net_device *dev, int shared) lp->hw.copy_from_card = com20020_copy_from_card; lp->hw.close = com20020_close; - dev->set_multicast_list = com20020_set_mc_list; - if (!dev->dev_addr[0]) dev->dev_addr[0] = inb(ioaddr + BUS_ALIGN*8); /* FIXME: do this some other way! */ diff --git a/include/linux/com20020.h b/include/linux/com20020.h index ac6d9a43e085..350afa773f8f 100644 --- a/include/linux/com20020.h +++ b/include/linux/com20020.h @@ -29,6 +29,7 @@ int com20020_check(struct net_device *dev); int com20020_found(struct net_device *dev, int shared); +const struct net_device_ops com20020_netdev_ops; /* The number of low I/O ports used by the card. */ #define ARCNET_TOTAL_SIZE 8 -- cgit v1.2.3-71-gd317 From 9fd3238e95046b61d518ddacaa767fa09f31b0d0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 9 Jan 2009 13:01:19 +0000 Subject: ibmtr: convert to internal network_device_stats Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- drivers/net/tokenring/ibmtr.c | 29 ++++++----------------------- include/linux/ibmtr.h | 2 +- 2 files changed, 7 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tokenring/ibmtr.c b/drivers/net/tokenring/ibmtr.c index fa7bce6e0c6d..f195d54ae421 100644 --- a/drivers/net/tokenring/ibmtr.c +++ b/drivers/net/tokenring/ibmtr.c @@ -200,7 +200,6 @@ static void tr_rx(struct net_device *dev); static void ibmtr_reset_timer(struct timer_list*tmr,struct net_device *dev); static void tok_rerun(unsigned long dev_addr); static void ibmtr_readlog(struct net_device *dev); -static struct net_device_stats *tok_get_stats(struct net_device *dev); static int ibmtr_change_mtu(struct net_device *dev, int mtu); static void find_turbo_adapters(int *iolist); @@ -825,7 +824,6 @@ static int __devinit trdev_init(struct net_device *dev) dev->open = tok_open; dev->stop = tok_close; dev->hard_start_xmit = tok_send_packet; - dev->get_stats = tok_get_stats; dev->set_multicast_list = tok_set_multicast_list; dev->change_mtu = ibmtr_change_mtu; @@ -1460,7 +1458,7 @@ static irqreturn_t tok_interrupt(int irq, void *dev_id) "%02X\n", (int)retcode, (int)readb(ti->ssb + 6)); else - ti->tr_stats.tx_packets++; + dev->stats.tx_packets++; break; case XMIT_XID_CMD: DPRINTK("xmit xid ret_code: %02X\n", @@ -1646,7 +1644,7 @@ static void tr_tx(struct net_device *dev) break; } writeb(RESP_IN_ASB, ti->mmio + ACA_OFFSET + ACA_SET + ISRA_ODD); - ti->tr_stats.tx_bytes += ti->current_skb->len; + dev->stats.tx_bytes += ti->current_skb->len; dev_kfree_skb_irq(ti->current_skb); ti->current_skb = NULL; netif_wake_queue(dev); @@ -1722,7 +1720,7 @@ static void tr_rx(struct net_device *dev) if (readb(llc + offsetof(struct trllc, llc)) != UI_CMD) { SET_PAGE(ti->asb_page); writeb(DATA_LOST, ti->asb + RETCODE_OFST); - ti->tr_stats.rx_dropped++; + dev->stats.rx_dropped++; writeb(RESP_IN_ASB, ti->mmio + ACA_OFFSET + ACA_SET + ISRA_ODD); return; } @@ -1757,7 +1755,7 @@ static void tr_rx(struct net_device *dev) if (!(skb = dev_alloc_skb(skb_size))) { DPRINTK("out of memory. frame dropped.\n"); - ti->tr_stats.rx_dropped++; + dev->stats.rx_dropped++; SET_PAGE(ti->asb_page); writeb(DATA_LOST, ti->asb + offsetof(struct asb_rec, ret_code)); writeb(RESP_IN_ASB, ti->mmio + ACA_OFFSET + ACA_SET + ISRA_ODD); @@ -1813,8 +1811,8 @@ static void tr_rx(struct net_device *dev) writeb(RESP_IN_ASB, ti->mmio + ACA_OFFSET + ACA_SET + ISRA_ODD); - ti->tr_stats.rx_bytes += skb->len; - ti->tr_stats.rx_packets++; + dev->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; skb->protocol = tr_type_trans(skb, dev); if (IPv4_p) { @@ -1876,21 +1874,6 @@ static void ibmtr_readlog(struct net_device *dev) /*****************************************************************************/ -/* tok_get_stats(): Basically a scaffold routine which will return - the address of the tr_statistics structure associated with - this device -- the tr.... structure is an ethnet look-alike - so at least for this iteration may suffice. 
*/ - -static struct net_device_stats *tok_get_stats(struct net_device *dev) -{ - - struct tok_info *toki; - toki = netdev_priv(dev); - return (struct net_device_stats *) &toki->tr_stats; -} - -/*****************************************************************************/ - static int ibmtr_change_mtu(struct net_device *dev, int mtu) { struct tok_info *ti = netdev_priv(dev); diff --git a/include/linux/ibmtr.h b/include/linux/ibmtr.h index 1c7a0dd5536a..06695b74d405 100644 --- a/include/linux/ibmtr.h +++ b/include/linux/ibmtr.h @@ -207,7 +207,7 @@ struct tok_info { unsigned short exsap_station_id; unsigned short global_int_enable; struct sk_buff *current_skb; - struct net_device_stats tr_stats; + unsigned char auto_speedsave; open_state open_status, sap_status; enum {MANUAL, AUTOMATIC} open_mode; -- cgit v1.2.3-71-gd317 From 5a7616af604caf0d436a1ed0d4298bb25cd77d67 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 9 Jan 2009 13:01:35 +0000 Subject: hdlcdrv: convert to internal net_device_stats Signed-off-by: Stephen Hemminger Acked-by: Thomas Sailer Signed-off-by: David S. Miller --- drivers/net/hamradio/hdlcdrv.c | 27 +++++++-------------------- include/linux/hdlcdrv.h | 1 - 2 files changed, 7 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index 8eba61a1d4ab..1215a49c38f1 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -154,7 +154,7 @@ static void hdlc_rx_flag(struct net_device *dev, struct hdlcdrv_state *s) pkt_len = s->hdlcrx.len - 2 + 1; /* KISS kludge */ if (!(skb = dev_alloc_skb(pkt_len))) { printk("%s: memory squeeze, dropping packet\n", dev->name); - s->stats.rx_dropped++; + dev->stats.rx_dropped++; return; } cp = skb_put(skb, pkt_len); @@ -162,7 +162,7 @@ static void hdlc_rx_flag(struct net_device *dev, struct hdlcdrv_state *s) memcpy(cp, s->hdlcrx.buffer, pkt_len - 1); skb->protocol = ax25_type_trans(skb, dev); netif_rx(skb); - s->stats.rx_packets++; + dev->stats.rx_packets++; } void hdlcdrv_receiver(struct net_device *dev, struct hdlcdrv_state *s) @@ -326,7 +326,7 @@ void hdlcdrv_transmitter(struct net_device *dev, struct hdlcdrv_state *s) s->hdlctx.len = pkt_len+2; /* the appended CRC */ s->hdlctx.tx_state = 2; s->hdlctx.bitstream = 0; - s->stats.tx_packets++; + dev->stats.tx_packets++; break; case 2: if (!s->hdlctx.len) { @@ -426,19 +426,6 @@ static int hdlcdrv_set_mac_address(struct net_device *dev, void *addr) return 0; } -/* --------------------------------------------------------------------- */ - -static struct net_device_stats *hdlcdrv_get_stats(struct net_device *dev) -{ - struct hdlcdrv_state *sm = netdev_priv(dev); - - /* - * Get the current statistics. This may be called with the - * card open or closed. - */ - return &sm->stats; -} - /* --------------------------------------------------------------------- */ /* * Open/initialize the board. 
This is called (in the current kernel) @@ -568,10 +555,10 @@ static int hdlcdrv_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) bi.data.cs.ptt = hdlcdrv_ptt(s); bi.data.cs.dcd = s->hdlcrx.dcd; bi.data.cs.ptt_keyed = s->ptt_keyed; - bi.data.cs.tx_packets = s->stats.tx_packets; - bi.data.cs.tx_errors = s->stats.tx_errors; - bi.data.cs.rx_packets = s->stats.rx_packets; - bi.data.cs.rx_errors = s->stats.rx_errors; + bi.data.cs.tx_packets = dev->stats.tx_packets; + bi.data.cs.tx_errors = dev->stats.tx_errors; + bi.data.cs.rx_packets = dev->stats.rx_packets; + bi.data.cs.rx_errors = dev->stats.rx_errors; break; case HDLCDRVCTL_OLDGETSTAT: diff --git a/include/linux/hdlcdrv.h b/include/linux/hdlcdrv.h index bf6302f6b5f8..0821bac62b83 100644 --- a/include/linux/hdlcdrv.h +++ b/include/linux/hdlcdrv.h @@ -241,7 +241,6 @@ struct hdlcdrv_state { struct hdlcdrv_bitbuffer bitbuf_hdlc; #endif /* HDLCDRV_DEBUG */ - struct net_device_stats stats; int ptt_keyed; /* queued skb for transmission */ -- cgit v1.2.3-71-gd317 From 7cdc15f5f9db71e9c92422918ab9f8df0d31f81f Mon Sep 17 00:00:00 2001 From: Krzysztof HaÅ‚asa Date: Thu, 8 Jan 2009 19:46:54 +0100 Subject: WAN: Generic HDLC now uses IFF_WAN_HDLC private flag. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krzysztof HaÅ‚asa Signed-off-by: David S. Miller --- drivers/net/wan/hdlc.c | 3 ++- include/linux/if.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c index 1f2a140c9f7c..d83cd7884e05 100644 --- a/drivers/net/wan/hdlc.c +++ b/drivers/net/wan/hdlc.c @@ -106,7 +106,7 @@ static int hdlc_device_event(struct notifier_block *this, unsigned long event, if (dev_net(dev) != &init_net) return NOTIFY_DONE; - if (dev->get_stats != hdlc_get_stats) + if (!(dev->priv_flags & IFF_WAN_HDLC)) return NOTIFY_DONE; /* not an HDLC device */ if (event != NETDEV_CHANGE) @@ -235,6 +235,7 @@ static void hdlc_setup_dev(struct net_device *dev) */ dev->get_stats = hdlc_get_stats; dev->flags = IFF_POINTOPOINT | IFF_NOARP; + dev->priv_flags = IFF_WAN_HDLC; dev->mtu = HDLC_MAX_MTU; dev->type = ARPHRD_RAWHDLC; dev->hard_header_len = 16; diff --git a/include/linux/if.h b/include/linux/if.h index 2a6e29620a96..1108f3e099e3 100644 --- a/include/linux/if.h +++ b/include/linux/if.h @@ -66,6 +66,7 @@ #define IFF_SLAVE_NEEDARP 0x40 /* need ARPs for validation */ #define IFF_ISATAP 0x80 /* ISATAP interface (RFC4214) */ #define IFF_MASTER_ARPMON 0x100 /* bonding master, ARP mon in use */ +#define IFF_WAN_HDLC 0x200 /* WAN HDLC device */ #define IF_GET_IFACE 0x0001 /* for querying only */ #define IF_GET_PROTO 0x0002 -- cgit v1.2.3-71-gd317 From 991990a12de42281f81b4e3a6471586d2d0caf6a Mon Sep 17 00:00:00 2001 From: Krzysztof HaÅ‚asa Date: Thu, 8 Jan 2009 22:52:11 +0100 Subject: WAN: Convert generic HDLC drivers to netdev_ops. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also remove unneeded last_rx update from Synclink drivers. Synclink part mostly by Stephen Hemminger. Signed-off-by: Krzysztof HaÅ‚asa Signed-off-by: David S. 
Miller --- drivers/char/pcmcia/synclink_cs.c | 18 +++++++++++------- drivers/char/synclink.c | 18 +++++++++++------- drivers/char/synclink_gt.c | 18 +++++++++++------- drivers/char/synclinkmp.c | 18 +++++++++++------- drivers/net/wan/c101.c | 12 ++++++++---- drivers/net/wan/cosa.c | 14 ++++++++++---- drivers/net/wan/dscc4.c | 18 +++++++++++------- drivers/net/wan/farsync.c | 18 ++++++++++++------ drivers/net/wan/hdlc.c | 14 +++++++++++--- drivers/net/wan/hdlc_cisco.c | 1 - drivers/net/wan/hdlc_fr.c | 28 +++++++++------------------- drivers/net/wan/hdlc_ppp.c | 2 -- drivers/net/wan/hdlc_raw.c | 3 --- drivers/net/wan/hdlc_raw_eth.c | 8 ++------ drivers/net/wan/hdlc_x25.c | 2 +- drivers/net/wan/hostess_sv11.c | 12 +++++++++--- drivers/net/wan/ixp4xx_hss.c | 12 +++++++++--- drivers/net/wan/lmc/lmc_main.c | 19 +++++++++++-------- drivers/net/wan/lmc/lmc_proto.c | 17 +---------------- drivers/net/wan/n2.c | 12 ++++++++---- drivers/net/wan/pc300too.c | 12 ++++++++---- drivers/net/wan/pci200syn.c | 12 ++++++++---- drivers/net/wan/sealevel.c | 12 +++++++++--- drivers/net/wan/wanxl.c | 14 ++++++++++---- include/linux/hdlc.h | 5 +++++ 25 files changed, 186 insertions(+), 133 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index dc073e167abc..5608a1e5a3b3 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -4311,10 +4311,17 @@ static void hdlcdev_rx(MGSLPC_INFO *info, char *buf, int size) dev->stats.rx_bytes += size; netif_rx(skb); - - dev->last_rx = jiffies; } +static const struct net_device_ops hdlcdev_ops = { + .ndo_open = hdlcdev_open, + .ndo_stop = hdlcdev_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = hdlcdev_ioctl, + .ndo_tx_timeout = hdlcdev_tx_timeout, +}; + /** * called by device driver when adding device instance * do generic HDLC initialization @@ -4341,11 +4348,8 @@ static int hdlcdev_init(MGSLPC_INFO *info) dev->irq = info->irq_level; /* network layer callbacks and settings */ - dev->do_ioctl = hdlcdev_ioctl; - dev->open = hdlcdev_open; - dev->stop = hdlcdev_close; - dev->tx_timeout = hdlcdev_tx_timeout; - dev->watchdog_timeo = 10*HZ; + dev->netdev_ops = &hdlcdev_ops; + dev->watchdog_timeo = 10 * HZ; dev->tx_queue_len = 50; /* generic HDLC layer callbacks and settings */ diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c index b8063d4cad32..0057a8f58cb1 100644 --- a/drivers/char/synclink.c +++ b/drivers/char/synclink.c @@ -8007,10 +8007,17 @@ static void hdlcdev_rx(struct mgsl_struct *info, char *buf, int size) dev->stats.rx_bytes += size; netif_rx(skb); - - dev->last_rx = jiffies; } +static const struct net_device_ops hdlcdev_ops = { + .ndo_open = hdlcdev_open, + .ndo_stop = hdlcdev_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = hdlcdev_ioctl, + .ndo_tx_timeout = hdlcdev_tx_timeout, +}; + /** * called by device driver when adding device instance * do generic HDLC initialization @@ -8038,11 +8045,8 @@ static int hdlcdev_init(struct mgsl_struct *info) dev->dma = info->dma_level; /* network layer callbacks and settings */ - dev->do_ioctl = hdlcdev_ioctl; - dev->open = hdlcdev_open; - dev->stop = hdlcdev_close; - dev->tx_timeout = hdlcdev_tx_timeout; - dev->watchdog_timeo = 10*HZ; + dev->netdev_ops = &hdlcdev_ops; + dev->watchdog_timeo = 10 * HZ; dev->tx_queue_len = 50; /* generic HDLC layer callbacks and settings */ diff --git a/drivers/char/synclink_gt.c 
b/drivers/char/synclink_gt.c index f329f459817c..efb3dc928a43 100644 --- a/drivers/char/synclink_gt.c +++ b/drivers/char/synclink_gt.c @@ -1763,10 +1763,17 @@ static void hdlcdev_rx(struct slgt_info *info, char *buf, int size) dev->stats.rx_bytes += size; netif_rx(skb); - - dev->last_rx = jiffies; } +static const struct net_device_ops hdlcdev_ops = { + .ndo_open = hdlcdev_open, + .ndo_stop = hdlcdev_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = hdlcdev_ioctl, + .ndo_tx_timeout = hdlcdev_tx_timeout, +}; + /** * called by device driver when adding device instance * do generic HDLC initialization @@ -1794,11 +1801,8 @@ static int hdlcdev_init(struct slgt_info *info) dev->irq = info->irq_level; /* network layer callbacks and settings */ - dev->do_ioctl = hdlcdev_ioctl; - dev->open = hdlcdev_open; - dev->stop = hdlcdev_close; - dev->tx_timeout = hdlcdev_tx_timeout; - dev->watchdog_timeo = 10*HZ; + dev->netdev_ops = &hdlcdev_ops; + dev->watchdog_timeo = 10 * HZ; dev->tx_queue_len = 50; /* generic HDLC layer callbacks and settings */ diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c index 7b0c5b2dd263..8eb6c89a980e 100644 --- a/drivers/char/synclinkmp.c +++ b/drivers/char/synclinkmp.c @@ -1907,10 +1907,17 @@ static void hdlcdev_rx(SLMP_INFO *info, char *buf, int size) dev->stats.rx_bytes += size; netif_rx(skb); - - dev->last_rx = jiffies; } +static const struct net_device_ops hdlcdev_ops = { + .ndo_open = hdlcdev_open, + .ndo_stop = hdlcdev_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = hdlcdev_ioctl, + .ndo_tx_timeout = hdlcdev_tx_timeout, +}; + /** * called by device driver when adding device instance * do generic HDLC initialization @@ -1938,11 +1945,8 @@ static int hdlcdev_init(SLMP_INFO *info) dev->irq = info->irq_level; /* network layer callbacks and settings */ - dev->do_ioctl = hdlcdev_ioctl; - dev->open = hdlcdev_open; - dev->stop = hdlcdev_close; - dev->tx_timeout = hdlcdev_tx_timeout; - dev->watchdog_timeo = 10*HZ; + dev->netdev_ops = &hdlcdev_ops; + dev->watchdog_timeo = 10 * HZ; dev->tx_queue_len = 50; /* generic HDLC layer callbacks and settings */ diff --git a/drivers/net/wan/c101.c b/drivers/net/wan/c101.c index b46897996f7e..9693b0fd323d 100644 --- a/drivers/net/wan/c101.c +++ b/drivers/net/wan/c101.c @@ -296,7 +296,13 @@ static void c101_destroy_card(card_t *card) kfree(card); } - +static const struct net_device_ops c101_ops = { + .ndo_open = c101_open, + .ndo_stop = c101_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = c101_ioctl, +}; static int __init c101_run(unsigned long irq, unsigned long winbase) { @@ -367,9 +373,7 @@ static int __init c101_run(unsigned long irq, unsigned long winbase) dev->mem_start = winbase; dev->mem_end = winbase + C101_MAPPED_RAM_SIZE - 1; dev->tx_queue_len = 50; - dev->do_ioctl = c101_ioctl; - dev->open = c101_open; - dev->stop = c101_close; + dev->netdev_ops = &c101_ops; hdlc->attach = sca_attach; hdlc->xmit = sca_xmit; card->settings.clock_type = CLOCK_EXT; diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index d80b72e22dea..0d7ba117ef60 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -427,6 +427,15 @@ static void __exit cosa_exit(void) } module_exit(cosa_exit); +static const struct net_device_ops cosa_ops = { + .ndo_open = cosa_net_open, + .ndo_stop = cosa_net_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + 
.ndo_do_ioctl = cosa_net_ioctl, + .ndo_tx_timeout = cosa_net_timeout, +}; + static int cosa_probe(int base, int irq, int dma) { struct cosa_data *cosa = cosa_cards+nr_cards; @@ -575,10 +584,7 @@ static int cosa_probe(int base, int irq, int dma) } dev_to_hdlc(chan->netdev)->attach = cosa_net_attach; dev_to_hdlc(chan->netdev)->xmit = cosa_net_tx; - chan->netdev->open = cosa_net_open; - chan->netdev->stop = cosa_net_close; - chan->netdev->do_ioctl = cosa_net_ioctl; - chan->netdev->tx_timeout = cosa_net_timeout; + chan->netdev->netdev_ops = &cosa_ops; chan->netdev->watchdog_timeo = TX_TIMEOUT; chan->netdev->base_addr = chan->cosa->datareg; chan->netdev->irq = chan->cosa->irq; diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c index 888025db2f02..8face5db8f32 100644 --- a/drivers/net/wan/dscc4.c +++ b/drivers/net/wan/dscc4.c @@ -883,6 +883,15 @@ static inline int dscc4_set_quartz(struct dscc4_dev_priv *dpriv, int hz) return ret; } +static const struct net_device_ops dscc4_ops = { + .ndo_open = dscc4_open, + .ndo_stop = dscc4_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = dscc4_ioctl, + .ndo_tx_timeout = dscc4_tx_timeout, +}; + static int dscc4_found1(struct pci_dev *pdev, void __iomem *ioaddr) { struct dscc4_pci_priv *ppriv; @@ -916,13 +925,8 @@ static int dscc4_found1(struct pci_dev *pdev, void __iomem *ioaddr) hdlc_device *hdlc = dev_to_hdlc(d); d->base_addr = (unsigned long)ioaddr; - d->init = NULL; d->irq = pdev->irq; - d->open = dscc4_open; - d->stop = dscc4_close; - d->set_multicast_list = NULL; - d->do_ioctl = dscc4_ioctl; - d->tx_timeout = dscc4_tx_timeout; + d->netdev_ops = &dscc4_ops; d->watchdog_timeo = TX_TIMEOUT; SET_NETDEV_DEV(d, &pdev->dev); @@ -1048,7 +1052,7 @@ static int dscc4_open(struct net_device *dev) struct dscc4_pci_priv *ppriv; int ret = -EAGAIN; - if ((dscc4_loopback_check(dpriv) < 0) || !dev->hard_start_xmit) + if ((dscc4_loopback_check(dpriv) < 0)) goto err; if ((ret = hdlc_open(dev))) diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c index 48a2c9d28950..00945f7c1e9b 100644 --- a/drivers/net/wan/farsync.c +++ b/drivers/net/wan/farsync.c @@ -2424,6 +2424,15 @@ fst_init_card(struct fst_card_info *card) type_strings[card->type], card->irq, card->nports); } +static const struct net_device_ops fst_ops = { + .ndo_open = fst_open, + .ndo_stop = fst_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = fst_ioctl, + .ndo_tx_timeout = fst_tx_timeout, +}; + /* * Initialise card when detected. * Returns 0 to indicate success, or errno otherwise. 
@@ -2565,12 +2574,9 @@ fst_add_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->base_addr = card->pci_conf; dev->irq = card->irq; - dev->tx_queue_len = FST_TX_QUEUE_LEN; - dev->open = fst_open; - dev->stop = fst_close; - dev->do_ioctl = fst_ioctl; - dev->watchdog_timeo = FST_TX_TIMEOUT; - dev->tx_timeout = fst_tx_timeout; + dev->netdev_ops = &fst_ops; + dev->tx_queue_len = FST_TX_QUEUE_LEN; + dev->watchdog_timeo = FST_TX_TIMEOUT; hdlc->attach = fst_attach; hdlc->xmit = fst_start_xmit; } diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c index dbc179887f8b..43da8bd72973 100644 --- a/drivers/net/wan/hdlc.c +++ b/drivers/net/wan/hdlc.c @@ -44,7 +44,7 @@ static const char* version = "HDLC support module revision 1.22"; static struct hdlc_proto *first_proto; -static int hdlc_change_mtu(struct net_device *dev, int new_mtu) +int hdlc_change_mtu(struct net_device *dev, int new_mtu) { if ((new_mtu < 68) || (new_mtu > HDLC_MAX_MTU)) return -EINVAL; @@ -66,7 +66,15 @@ static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev, return hdlc->proto->netif_rx(skb); } +int hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + hdlc_device *hdlc = dev_to_hdlc(dev); + if (hdlc->proto->xmit) + return hdlc->proto->xmit(skb, dev); + + return hdlc->xmit(skb, dev); /* call hardware driver directly */ +} static inline void hdlc_proto_start(struct net_device *dev) { @@ -231,8 +239,6 @@ static void hdlc_setup_dev(struct net_device *dev) dev->hard_header_len = 16; dev->addr_len = 0; dev->header_ops = &hdlc_null_ops; - - dev->change_mtu = hdlc_change_mtu; } static void hdlc_setup(struct net_device *dev) @@ -330,6 +336,8 @@ MODULE_AUTHOR("Krzysztof Halasa "); MODULE_DESCRIPTION("HDLC support module"); MODULE_LICENSE("GPL v2"); +EXPORT_SYMBOL(hdlc_change_mtu); +EXPORT_SYMBOL(hdlc_start_xmit); EXPORT_SYMBOL(hdlc_open); EXPORT_SYMBOL(hdlc_close); EXPORT_SYMBOL(hdlc_ioctl); diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c index 44e64b15dbd1..af3fd4fead8a 100644 --- a/drivers/net/wan/hdlc_cisco.c +++ b/drivers/net/wan/hdlc_cisco.c @@ -382,7 +382,6 @@ static int cisco_ioctl(struct net_device *dev, struct ifreq *ifr) memcpy(&state(hdlc)->settings, &new_settings, size); spin_lock_init(&state(hdlc)->lock); - dev->hard_start_xmit = hdlc->xmit; dev->header_ops = &cisco_header_ops; dev->type = ARPHRD_CISCO; netif_dormant_on(dev); diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index f1ddd7c3459c..70e57cebc955 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -444,18 +444,6 @@ static int pvc_xmit(struct sk_buff *skb, struct net_device *dev) return 0; } - - -static int pvc_change_mtu(struct net_device *dev, int new_mtu) -{ - if ((new_mtu < 68) || (new_mtu > HDLC_MAX_MTU)) - return -EINVAL; - dev->mtu = new_mtu; - return 0; -} - - - static inline void fr_log_dlci_active(pvc_device *pvc) { printk(KERN_INFO "%s: DLCI %d [%s%s%s]%s %s\n", @@ -1068,6 +1056,14 @@ static void pvc_setup(struct net_device *dev) dev->addr_len = 2; } +static const struct net_device_ops pvc_ops = { + .ndo_open = pvc_open, + .ndo_stop = pvc_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = pvc_xmit, + .ndo_do_ioctl = pvc_ioctl, +}; + static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type) { hdlc_device *hdlc = dev_to_hdlc(frad); @@ -1104,11 +1100,7 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type) *(__be16*)dev->dev_addr = htons(dlci); dlci_to_q922(dev->broadcast, dlci); } - 
dev->hard_start_xmit = pvc_xmit; - dev->open = pvc_open; - dev->stop = pvc_close; - dev->do_ioctl = pvc_ioctl; - dev->change_mtu = pvc_change_mtu; + dev->netdev_ops = &pvc_ops; dev->mtu = HDLC_MAX_MTU; dev->tx_queue_len = 0; dev->ml_priv = pvc; @@ -1260,8 +1252,6 @@ static int fr_ioctl(struct net_device *dev, struct ifreq *ifr) state(hdlc)->dce_pvc_count = 0; } memcpy(&state(hdlc)->settings, &new_settings, size); - - dev->hard_start_xmit = hdlc->xmit; dev->type = ARPHRD_FRAD; return 0; diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c index 57fe714c1c7f..7b8a5eae201d 100644 --- a/drivers/net/wan/hdlc_ppp.c +++ b/drivers/net/wan/hdlc_ppp.c @@ -558,7 +558,6 @@ out: return NET_RX_DROP; } - static void ppp_timer(unsigned long arg) { struct proto *proto = (struct proto *)arg; @@ -679,7 +678,6 @@ static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr) ppp->keepalive_interval = 10; ppp->keepalive_timeout = 60; - dev->hard_start_xmit = hdlc->xmit; dev->hard_header_len = sizeof(struct hdlc_header); dev->header_ops = &ppp_header_ops; dev->type = ARPHRD_PPP; diff --git a/drivers/net/wan/hdlc_raw.c b/drivers/net/wan/hdlc_raw.c index 8612311748f4..6e92c64ebd0f 100644 --- a/drivers/net/wan/hdlc_raw.c +++ b/drivers/net/wan/hdlc_raw.c @@ -30,8 +30,6 @@ static __be16 raw_type_trans(struct sk_buff *skb, struct net_device *dev) return __constant_htons(ETH_P_IP); } - - static struct hdlc_proto proto = { .type_trans = raw_type_trans, .ioctl = raw_ioctl, @@ -86,7 +84,6 @@ static int raw_ioctl(struct net_device *dev, struct ifreq *ifr) if (result) return result; memcpy(hdlc->state, &new_settings, size); - dev->hard_start_xmit = hdlc->xmit; dev->type = ARPHRD_RAWHDLC; netif_dormant_off(dev); return 0; diff --git a/drivers/net/wan/hdlc_raw_eth.c b/drivers/net/wan/hdlc_raw_eth.c index a13fc3207520..49e68f5ca5f2 100644 --- a/drivers/net/wan/hdlc_raw_eth.c +++ b/drivers/net/wan/hdlc_raw_eth.c @@ -45,6 +45,7 @@ static int eth_tx(struct sk_buff *skb, struct net_device *dev) static struct hdlc_proto proto = { .type_trans = eth_type_trans, + .xmit = eth_tx, .ioctl = raw_eth_ioctl, .module = THIS_MODULE, }; @@ -56,9 +57,7 @@ static int raw_eth_ioctl(struct net_device *dev, struct ifreq *ifr) const size_t size = sizeof(raw_hdlc_proto); raw_hdlc_proto new_settings; hdlc_device *hdlc = dev_to_hdlc(dev); - int result; - int (*old_ch_mtu)(struct net_device *, int); - int old_qlen; + int result, old_qlen; switch (ifr->ifr_settings.type) { case IF_GET_PROTO: @@ -99,11 +98,8 @@ static int raw_eth_ioctl(struct net_device *dev, struct ifreq *ifr) if (result) return result; memcpy(hdlc->state, &new_settings, size); - dev->hard_start_xmit = eth_tx; - old_ch_mtu = dev->change_mtu; old_qlen = dev->tx_queue_len; ether_setup(dev); - dev->change_mtu = old_ch_mtu; dev->tx_queue_len = old_qlen; random_ether_addr(dev->dev_addr); netif_dormant_off(dev); diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c index cbcbf6f0414c..b1dc29ed1583 100644 --- a/drivers/net/wan/hdlc_x25.c +++ b/drivers/net/wan/hdlc_x25.c @@ -184,6 +184,7 @@ static struct hdlc_proto proto = { .close = x25_close, .ioctl = x25_ioctl, .netif_rx = x25_rx, + .xmit = x25_xmit, .module = THIS_MODULE, }; @@ -213,7 +214,6 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr) if ((result = attach_hdlc_protocol(dev, &proto, 0))) return result; - dev->hard_start_xmit = x25_xmit; dev->type = ARPHRD_X25; netif_dormant_off(dev); return 0; diff --git a/drivers/net/wan/hostess_sv11.c b/drivers/net/wan/hostess_sv11.c index 
af54f0cf1b35..567d4f5062d6 100644 --- a/drivers/net/wan/hostess_sv11.c +++ b/drivers/net/wan/hostess_sv11.c @@ -173,6 +173,14 @@ static int hostess_attach(struct net_device *dev, unsigned short encoding, * Description block for a Comtrol Hostess SV11 card */ +static const struct net_device_ops hostess_ops = { + .ndo_open = hostess_open, + .ndo_stop = hostess_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = hostess_ioctl, +}; + static struct z8530_dev *sv11_init(int iobase, int irq) { struct z8530_dev *sv; @@ -267,9 +275,7 @@ static struct z8530_dev *sv11_init(int iobase, int irq) dev_to_hdlc(netdev)->attach = hostess_attach; dev_to_hdlc(netdev)->xmit = hostess_queue_xmit; - netdev->open = hostess_open; - netdev->stop = hostess_close; - netdev->do_ioctl = hostess_ioctl; + netdev->netdev_ops = &hostess_ops; netdev->base_addr = iobase; netdev->irq = irq; diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c index 0dbd85b0162d..7e8bbba2cc1b 100644 --- a/drivers/net/wan/ixp4xx_hss.c +++ b/drivers/net/wan/ixp4xx_hss.c @@ -1230,6 +1230,14 @@ static int hss_hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * initialization ****************************************************************************/ +static const struct net_device_ops hss_hdlc_ops = { + .ndo_open = hss_hdlc_open, + .ndo_stop = hss_hdlc_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = hss_hdlc_ioctl, +}; + static int __devinit hss_init_one(struct platform_device *pdev) { struct port *port; @@ -1254,9 +1262,7 @@ static int __devinit hss_init_one(struct platform_device *pdev) hdlc = dev_to_hdlc(dev); hdlc->attach = hss_hdlc_attach; hdlc->xmit = hss_hdlc_xmit; - dev->open = hss_hdlc_open; - dev->stop = hss_hdlc_close; - dev->do_ioctl = hss_hdlc_ioctl; + dev->netdev_ops = &hss_hdlc_ops; dev->tx_queue_len = 100; port->clock_type = CLOCK_EXT; port->clock_rate = 2048000; diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c index feac3b99f8fe..45b1822c962d 100644 --- a/drivers/net/wan/lmc/lmc_main.c +++ b/drivers/net/wan/lmc/lmc_main.c @@ -806,6 +806,16 @@ static int lmc_attach(struct net_device *dev, unsigned short encoding, return -EINVAL; } +static const struct net_device_ops lmc_ops = { + .ndo_open = lmc_open, + .ndo_stop = lmc_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = lmc_ioctl, + .ndo_tx_timeout = lmc_driver_timeout, + .ndo_get_stats = lmc_get_stats, +}; + static int __devinit lmc_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -849,11 +859,7 @@ static int __devinit lmc_init_one(struct pci_dev *pdev, dev->type = ARPHRD_HDLC; dev_to_hdlc(dev)->xmit = lmc_start_xmit; dev_to_hdlc(dev)->attach = lmc_attach; - dev->open = lmc_open; - dev->stop = lmc_close; - dev->get_stats = lmc_get_stats; - dev->do_ioctl = lmc_ioctl; - dev->tx_timeout = lmc_driver_timeout; + dev->netdev_ops = &lmc_ops; dev->watchdog_timeo = HZ; /* 1 second */ dev->tx_queue_len = 100; sc->lmc_device = dev; @@ -1059,9 +1065,6 @@ static int lmc_open(struct net_device *dev) if ((err = lmc_proto_open(sc)) != 0) return err; - dev->do_ioctl = lmc_ioctl; - - netif_start_queue(dev); sc->extra_stats.tx_tbusy0++; diff --git a/drivers/net/wan/lmc/lmc_proto.c b/drivers/net/wan/lmc/lmc_proto.c index 94b4c208b013..044a48175c42 100644 --- a/drivers/net/wan/lmc/lmc_proto.c +++ b/drivers/net/wan/lmc/lmc_proto.c @@ -51,30 +51,15 @@ void 
lmc_proto_attach(lmc_softc_t *sc) /*FOLD00*/ { lmc_trace(sc->lmc_device, "lmc_proto_attach in"); - switch(sc->if_type){ - case LMC_PPP: - { - struct net_device *dev = sc->lmc_device; - dev->do_ioctl = lmc_ioctl; - } - break; - case LMC_NET: - { + if (sc->if_type == LMC_NET) { struct net_device *dev = sc->lmc_device; /* * They set a few basics because they don't use HDLC */ dev->flags |= IFF_POINTOPOINT; - dev->hard_header_len = 0; dev->addr_len = 0; } - case LMC_RAW: /* Setup the task queue, maybe we should notify someone? */ - { - } - default: - break; - } lmc_trace(sc->lmc_device, "lmc_proto_attach out"); } diff --git a/drivers/net/wan/n2.c b/drivers/net/wan/n2.c index 697715ae80f4..83da596e2052 100644 --- a/drivers/net/wan/n2.c +++ b/drivers/net/wan/n2.c @@ -324,7 +324,13 @@ static void n2_destroy_card(card_t *card) kfree(card); } - +static const struct net_device_ops n2_ops = { + .ndo_open = n2_open, + .ndo_stop = n2_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = n2_ioctl, +}; static int __init n2_run(unsigned long io, unsigned long irq, unsigned long winbase, long valid0, long valid1) @@ -460,9 +466,7 @@ static int __init n2_run(unsigned long io, unsigned long irq, dev->mem_start = winbase; dev->mem_end = winbase + USE_WINDOWSIZE - 1; dev->tx_queue_len = 50; - dev->do_ioctl = n2_ioctl; - dev->open = n2_open; - dev->stop = n2_close; + dev->netdev_ops = &n2_ops; hdlc->attach = sca_attach; hdlc->xmit = sca_xmit; port->settings.clock_type = CLOCK_EXT; diff --git a/drivers/net/wan/pc300too.c b/drivers/net/wan/pc300too.c index f247e5d9002a..60ece54bdd94 100644 --- a/drivers/net/wan/pc300too.c +++ b/drivers/net/wan/pc300too.c @@ -287,7 +287,13 @@ static void pc300_pci_remove_one(struct pci_dev *pdev) kfree(card); } - +static const struct net_device_ops pc300_ops = { + .ndo_open = pc300_open, + .ndo_stop = pc300_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = pc300_ioctl, +}; static int __devinit pc300_pci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) @@ -448,9 +454,7 @@ static int __devinit pc300_pci_init_one(struct pci_dev *pdev, dev->mem_start = ramphys; dev->mem_end = ramphys + ramsize - 1; dev->tx_queue_len = 50; - dev->do_ioctl = pc300_ioctl; - dev->open = pc300_open; - dev->stop = pc300_close; + dev->netdev_ops = &pc300_ops; hdlc->attach = sca_attach; hdlc->xmit = sca_xmit; port->settings.clock_type = CLOCK_EXT; diff --git a/drivers/net/wan/pci200syn.c b/drivers/net/wan/pci200syn.c index 1104d3a692f7..e035d8c57e11 100644 --- a/drivers/net/wan/pci200syn.c +++ b/drivers/net/wan/pci200syn.c @@ -265,7 +265,13 @@ static void pci200_pci_remove_one(struct pci_dev *pdev) kfree(card); } - +static const struct net_device_ops pci200_ops = { + .ndo_open = pci200_open, + .ndo_stop = pci200_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = pci200_ioctl, +}; static int __devinit pci200_pci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) @@ -395,9 +401,7 @@ static int __devinit pci200_pci_init_one(struct pci_dev *pdev, dev->mem_start = ramphys; dev->mem_end = ramphys + ramsize - 1; dev->tx_queue_len = 50; - dev->do_ioctl = pci200_ioctl; - dev->open = pci200_open; - dev->stop = pci200_close; + dev->netdev_ops = &pci200_ops; hdlc->attach = sca_attach; hdlc->xmit = sca_xmit; port->settings.clock_type = CLOCK_EXT; diff --git a/drivers/net/wan/sealevel.c b/drivers/net/wan/sealevel.c index 0941a26f6e3f..23b269027453 100644 
--- a/drivers/net/wan/sealevel.c +++ b/drivers/net/wan/sealevel.c @@ -169,6 +169,14 @@ static int sealevel_attach(struct net_device *dev, unsigned short encoding, return -EINVAL; } +static const struct net_device_ops sealevel_ops = { + .ndo_open = sealevel_open, + .ndo_stop = sealevel_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = sealevel_ioctl, +}; + static int slvl_setup(struct slvl_device *sv, int iobase, int irq) { struct net_device *dev = alloc_hdlcdev(sv); @@ -177,9 +185,7 @@ static int slvl_setup(struct slvl_device *sv, int iobase, int irq) dev_to_hdlc(dev)->attach = sealevel_attach; dev_to_hdlc(dev)->xmit = sealevel_queue_xmit; - dev->open = sealevel_open; - dev->stop = sealevel_close; - dev->do_ioctl = sealevel_ioctl; + dev->netdev_ops = &sealevel_ops; dev->base_addr = iobase; dev->irq = irq; diff --git a/drivers/net/wan/wanxl.c b/drivers/net/wan/wanxl.c index 4bffb67ebcae..887acb0dc807 100644 --- a/drivers/net/wan/wanxl.c +++ b/drivers/net/wan/wanxl.c @@ -547,6 +547,15 @@ static void wanxl_pci_remove_one(struct pci_dev *pdev) #include "wanxlfw.inc" +static const struct net_device_ops wanxl_ops = { + .ndo_open = wanxl_open, + .ndo_stop = wanxl_close, + .ndo_change_mtu = hdlc_change_mtu, + .ndo_start_xmit = hdlc_start_xmit, + .ndo_do_ioctl = wanxl_ioctl, + .ndo_get_stats = wanxl_get_stats, +}; + static int __devinit wanxl_pci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -777,12 +786,9 @@ static int __devinit wanxl_pci_init_one(struct pci_dev *pdev, hdlc = dev_to_hdlc(dev); spin_lock_init(&port->lock); dev->tx_queue_len = 50; - dev->do_ioctl = wanxl_ioctl; - dev->open = wanxl_open; - dev->stop = wanxl_close; + dev->netdev_ops = &wanxl_ops; hdlc->attach = wanxl_attach; hdlc->xmit = wanxl_xmit; - dev->get_stats = wanxl_get_stats; port->card = card; port->node = i; get_status(port)->clocking = CLOCK_EXT; diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index fd47a151665e..6a6e701f1631 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -38,6 +38,7 @@ struct hdlc_proto { int (*ioctl)(struct net_device *dev, struct ifreq *ifr); __be16 (*type_trans)(struct sk_buff *skb, struct net_device *dev); int (*netif_rx)(struct sk_buff *skb); + int (*xmit)(struct sk_buff *skb, struct net_device *dev); struct module *module; struct hdlc_proto *next; /* next protocol in the list */ }; @@ -102,6 +103,10 @@ static __inline__ void debug_frame(const struct sk_buff *skb) int hdlc_open(struct net_device *dev); /* Must be called by hardware driver when HDLC device is being closed */ void hdlc_close(struct net_device *dev); +/* May be used by hardware driver */ +int hdlc_change_mtu(struct net_device *dev, int new_mtu); +/* Must be pointed to by hw driver's dev->netdev_ops->ndo_start_xmit */ +int hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev); int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, size_t size); -- cgit v1.2.3-71-gd317 From 5ec99fdf8e1a6ee90fce22c2fce94871ab44e8ba Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Wed, 14 Jan 2009 20:34:04 -0800 Subject: sc92031: use device id directly instead of made-up name Instead of making up a name for the device ids, put them directly in the device id table. Also move the vendor id to pci_ids.h. Signed-off-by: Cesar Eduardo Barros Signed-off-by: David S. 
Miller --- drivers/net/sc92031.c | 8 ++------ include/linux/pci_ids.h | 2 ++ 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/sc92031.c b/drivers/net/sc92031.c index d24acdcdb157..00dfddbf2f9b 100644 --- a/drivers/net/sc92031.c +++ b/drivers/net/sc92031.c @@ -31,10 +31,6 @@ #include -#define PCI_VENDOR_ID_SILAN 0x1904 -#define PCI_DEVICE_ID_SILAN_SC92031 0x2031 -#define PCI_DEVICE_ID_SILAN_8139D 0x8139 - #define SC92031_NAME "sc92031" /* BAR 0 is MMIO, BAR 1 is PIO */ @@ -1592,8 +1588,8 @@ out: } static struct pci_device_id sc92031_pci_device_id_table[] __devinitdata = { - { PCI_DEVICE(PCI_VENDOR_ID_SILAN, PCI_DEVICE_ID_SILAN_SC92031) }, - { PCI_DEVICE(PCI_VENDOR_ID_SILAN, PCI_DEVICE_ID_SILAN_8139D) }, + { PCI_DEVICE(PCI_VENDOR_ID_SILAN, 0x2031) }, + { PCI_DEVICE(PCI_VENDOR_ID_SILAN, 0x8139) }, { 0, } }; MODULE_DEVICE_TABLE(pci, sc92031_pci_device_id_table); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d56ad9c21c09..302423afa136 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2209,6 +2209,8 @@ #define PCI_VENDOR_ID_TOPSPIN 0x1867 +#define PCI_VENDOR_ID_SILAN 0x1904 + #define PCI_VENDOR_ID_TDI 0x192E #define PCI_DEVICE_ID_TDI_EHCI 0x0101 -- cgit v1.2.3-71-gd317 From 288379f050284087578b77e04f040b57db3db3f8 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 19 Jan 2009 16:43:59 -0800 Subject: net: Remove redundant NAPI functions Following the removal of the unused struct net_device * parameter from the NAPI functions named *netif_rx_* in commit 908a7a1, they are exactly equivalent to the corresponding *napi_* functions and are therefore redundant. Signed-off-by: Ben Hutchings Acked-by: Neil Horman Signed-off-by: David S. Miller --- drivers/infiniband/hw/nes/nes_hw.c | 2 +- drivers/infiniband/hw/nes/nes_nic.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 6 ++-- drivers/net/8139cp.c | 6 ++-- drivers/net/8139too.c | 6 ++-- drivers/net/amd8111e.c | 6 ++-- drivers/net/arm/ep93xx_eth.c | 8 +++--- drivers/net/arm/ixp4xx_eth.c | 12 ++++---- drivers/net/atl1e/atl1e_main.c | 6 ++-- drivers/net/b44.c | 6 ++-- drivers/net/bnx2.c | 12 ++++---- drivers/net/bnx2x_main.c | 6 ++-- drivers/net/cassini.c | 8 +++--- drivers/net/chelsio/sge.c | 4 +-- drivers/net/cpmac.c | 10 +++---- drivers/net/e100.c | 6 ++-- drivers/net/e1000/e1000_main.c | 10 +++---- drivers/net/e1000e/netdev.c | 14 ++++----- drivers/net/ehea/ehea_main.c | 8 +++--- drivers/net/enic/enic_main.c | 12 ++++---- drivers/net/epic100.c | 6 ++-- drivers/net/forcedeth.c | 10 +++---- drivers/net/fs_enet/fs_enet-main.c | 4 +-- drivers/net/gianfar.c | 6 ++-- drivers/net/ibmveth.c | 8 +++--- drivers/net/igb/igb_main.c | 12 ++++---- drivers/net/ixgb/ixgb_main.c | 6 ++-- drivers/net/ixgbe/ixgbe_main.c | 12 ++++---- drivers/net/ixp2000/ixpdev.c | 4 +-- drivers/net/jme.h | 6 ++-- drivers/net/korina.c | 4 +-- drivers/net/macb.c | 10 +++---- drivers/net/mlx4/en_rx.c | 4 +-- drivers/net/myri10ge/myri10ge.c | 6 ++-- drivers/net/natsemi.c | 6 ++-- drivers/net/netxen/netxen_nic_main.c | 2 +- drivers/net/niu.c | 6 ++-- drivers/net/pasemi_mac.c | 6 ++-- drivers/net/pcnet32.c | 6 ++-- drivers/net/qla3xxx.c | 6 ++-- drivers/net/qlge/qlge_main.c | 6 ++-- drivers/net/r6040.c | 4 +-- drivers/net/r8169.c | 6 ++-- drivers/net/s2io.c | 8 +++--- drivers/net/sb1250-mac.c | 6 ++-- drivers/net/sfc/efx.c | 4 +-- drivers/net/sfc/efx.h | 2 +- drivers/net/skge.c | 6 ++-- drivers/net/smsc911x.c | 8 +++--- drivers/net/smsc9420.c | 4 +-- drivers/net/spider_net.c | 12 ++++---- 
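The hunks that follow apply the same mechanical rename throughout: netif_rx_schedule_prep() becomes napi_schedule_prep(), __netif_rx_schedule() becomes __napi_schedule(), netif_rx_complete() becomes napi_complete(), and so on, with no change in behaviour. A minimal sketch of the interrupt/poll pattern these drivers share, written with the new names, is shown below; the my_* structure and helper names are illustrative placeholders and are not taken from any driver in this series.

	#include <linux/interrupt.h>
	#include <linux/netdevice.h>

	struct my_priv {
		struct napi_struct napi;
		/* rings, registers, locks ... */
	};

	/* hardware-specific helpers, assumed to be defined elsewhere */
	static void my_disable_rx_irq(struct my_priv *priv);
	static void my_enable_rx_irq(struct my_priv *priv);
	static int my_clean_rx(struct my_priv *priv, int budget);

	/* IRQ handler: previously netif_rx_schedule_prep()/__netif_rx_schedule() */
	static irqreturn_t my_isr(int irq, void *dev_id)
	{
		struct my_priv *priv = dev_id;

		if (napi_schedule_prep(&priv->napi)) {
			my_disable_rx_irq(priv);        /* mask RX interrupts */
			__napi_schedule(&priv->napi);   /* poll handler runs in softirq */
		}
		return IRQ_HANDLED;
	}

	/* poll handler: previously netif_rx_complete() */
	static int my_poll(struct napi_struct *napi, int budget)
	{
		struct my_priv *priv = container_of(napi, struct my_priv, napi);
		int work_done = my_clean_rx(priv, budget);

		if (work_done < budget) {
			napi_complete(napi);            /* leave polling mode */
			my_enable_rx_irq(priv);         /* unmask RX interrupts */
		}
		return work_done;
	}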
drivers/net/starfire.c | 6 ++-- drivers/net/sungem.c | 6 ++-- drivers/net/tc35815.c | 6 ++-- drivers/net/tehuti.c | 6 ++-- drivers/net/tg3.c | 14 ++++----- drivers/net/tsi108_eth.c | 8 +++--- drivers/net/tulip/interrupt.c | 10 +++---- drivers/net/typhoon.c | 6 ++-- drivers/net/ucc_geth.c | 6 ++-- drivers/net/via-rhine.c | 4 +-- drivers/net/virtio_net.c | 12 ++++---- drivers/net/wan/hd64572.c | 4 +-- drivers/net/wan/ixp4xx_hss.c | 12 ++++---- drivers/net/xen-netfront.c | 8 +++--- include/linux/netdevice.h | 50 --------------------------------- 66 files changed, 227 insertions(+), 277 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 5d139db1b771..53df9de23423 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -2541,7 +2541,7 @@ static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic { struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); - netif_rx_schedule(&nesvnic->napi); + napi_schedule(&nesvnic->napi); } diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 57a47cf7e513..f5484ad1279b 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -111,7 +111,7 @@ static int nes_netdev_poll(struct napi_struct *napi, int budget) nes_nic_ce_handler(nesdev, nescq); if (nescq->cqes_pending == 0) { - netif_rx_complete(napi); + napi_complete(napi); /* clear out completed cqes and arm */ nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | nescq->cq_number | (nescq->cqe_allocs_pending << 16)); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index a1925810be3c..da6082739839 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -446,11 +446,11 @@ poll_more: if (dev->features & NETIF_F_LRO) lro_flush_all(&priv->lro.lro_mgr); - netif_rx_complete(napi); + napi_complete(napi); if (unlikely(ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)) && - netif_rx_reschedule(napi)) + napi_reschedule(napi)) goto poll_more; } @@ -462,7 +462,7 @@ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) struct net_device *dev = dev_ptr; struct ipoib_dev_priv *priv = netdev_priv(dev); - netif_rx_schedule(&priv->napi); + napi_schedule(&priv->napi); } static void drain_tx_cq(struct net_device *dev) diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c index 4e19ae3ce6be..35517b06ec3f 100644 --- a/drivers/net/8139cp.c +++ b/drivers/net/8139cp.c @@ -604,7 +604,7 @@ rx_next: spin_lock_irqsave(&cp->lock, flags); cpw16_f(IntrMask, cp_intr_mask); - __netif_rx_complete(napi); + __napi_complete(napi); spin_unlock_irqrestore(&cp->lock, flags); } @@ -641,9 +641,9 @@ static irqreturn_t cp_interrupt (int irq, void *dev_instance) } if (status & (RxOK | RxErr | RxEmpty | RxFIFOOvr)) - if (netif_rx_schedule_prep(&cp->napi)) { + if (napi_schedule_prep(&cp->napi)) { cpw16_f(IntrMask, cp_norx_intr_mask); - __netif_rx_schedule(&cp->napi); + __napi_schedule(&cp->napi); } if (status & (TxOK | TxErr | TxEmpty | SWInt)) diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c index a5b24202d564..5341da604e84 100644 --- a/drivers/net/8139too.c +++ b/drivers/net/8139too.c @@ -2128,7 +2128,7 @@ static int rtl8139_poll(struct napi_struct *napi, int budget) */ spin_lock_irqsave(&tp->lock, flags); RTL_W16_F(IntrMask, rtl8139_intr_mask); - __netif_rx_complete(napi); + 
__napi_complete(napi); spin_unlock_irqrestore(&tp->lock, flags); } spin_unlock(&tp->rx_lock); @@ -2178,9 +2178,9 @@ static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance) /* Receive packets are processed by poll routine. If not running start it now. */ if (status & RxAckBits){ - if (netif_rx_schedule_prep(&tp->napi)) { + if (napi_schedule_prep(&tp->napi)) { RTL_W16_F (IntrMask, rtl8139_norx_intr_mask); - __netif_rx_schedule(&tp->napi); + __napi_schedule(&tp->napi); } } diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c index 7709992bb6bf..cb9c95d3ed0a 100644 --- a/drivers/net/amd8111e.c +++ b/drivers/net/amd8111e.c @@ -831,7 +831,7 @@ static int amd8111e_rx_poll(struct napi_struct *napi, int budget) if (rx_pkt_limit > 0) { /* Receive descriptor is empty now */ spin_lock_irqsave(&lp->lock, flags); - __netif_rx_complete(napi); + __napi_complete(napi); writel(VAL0|RINTEN0, mmio + INTEN0); writel(VAL2 | RDMD0, mmio + CMD0); spin_unlock_irqrestore(&lp->lock, flags); @@ -1170,11 +1170,11 @@ static irqreturn_t amd8111e_interrupt(int irq, void *dev_id) /* Check if Receive Interrupt has occurred. */ if (intr0 & RINT0) { - if (netif_rx_schedule_prep(&lp->napi)) { + if (napi_schedule_prep(&lp->napi)) { /* Disable receive interupts */ writel(RINTEN0, mmio + INTEN0); /* Schedule a polling routine */ - __netif_rx_schedule(&lp->napi); + __napi_schedule(&lp->napi); } else if (intren0 & RINTEN0) { printk("************Driver bug! \ interrupt while in poll\n"); diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c index 3ec20cc18b0c..cc7708775da0 100644 --- a/drivers/net/arm/ep93xx_eth.c +++ b/drivers/net/arm/ep93xx_eth.c @@ -298,7 +298,7 @@ poll_some_more: int more = 0; spin_lock_irq(&ep->rx_lock); - __netif_rx_complete(napi); + __napi_complete(napi); wrl(ep, REG_INTEN, REG_INTEN_TX | REG_INTEN_RX); if (ep93xx_have_more_rx(ep)) { wrl(ep, REG_INTEN, REG_INTEN_TX); @@ -307,7 +307,7 @@ poll_some_more: } spin_unlock_irq(&ep->rx_lock); - if (more && netif_rx_reschedule(napi)) + if (more && napi_reschedule(napi)) goto poll_some_more; } @@ -415,9 +415,9 @@ static irqreturn_t ep93xx_irq(int irq, void *dev_id) if (status & REG_INTSTS_RX) { spin_lock(&ep->rx_lock); - if (likely(netif_rx_schedule_prep(&ep->napi))) { + if (likely(napi_schedule_prep(&ep->napi))) { wrl(ep, REG_INTEN, REG_INTEN_TX); - __netif_rx_schedule(&ep->napi); + __napi_schedule(&ep->napi); } spin_unlock(&ep->rx_lock); } diff --git a/drivers/net/arm/ixp4xx_eth.c b/drivers/net/arm/ixp4xx_eth.c index 5fce1d5c1a1a..5fe17d5eaa54 100644 --- a/drivers/net/arm/ixp4xx_eth.c +++ b/drivers/net/arm/ixp4xx_eth.c @@ -473,7 +473,7 @@ static void eth_rx_irq(void *pdev) printk(KERN_DEBUG "%s: eth_rx_irq\n", dev->name); #endif qmgr_disable_irq(port->plat->rxq); - netif_rx_schedule(&port->napi); + napi_schedule(&port->napi); } static int eth_poll(struct napi_struct *napi, int budget) @@ -498,16 +498,16 @@ static int eth_poll(struct napi_struct *napi, int budget) if ((n = queue_get_desc(rxq, port, 0)) < 0) { #if DEBUG_RX - printk(KERN_DEBUG "%s: eth_poll netif_rx_complete\n", + printk(KERN_DEBUG "%s: eth_poll napi_complete\n", dev->name); #endif - netif_rx_complete(napi); + napi_complete(napi); qmgr_enable_irq(rxq); if (!qmgr_stat_empty(rxq) && - netif_rx_reschedule(napi)) { + napi_reschedule(napi)) { #if DEBUG_RX printk(KERN_DEBUG "%s: eth_poll" - " netif_rx_reschedule successed\n", + " napi_reschedule successed\n", dev->name); #endif qmgr_disable_irq(rxq); @@ -1036,7 +1036,7 @@ static int eth_open(struct net_device *dev) } 
ports_open++; /* we may already have RX data, enables IRQ */ - netif_rx_schedule(&port->napi); + napi_schedule(&port->napi); return 0; } diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c index bb9094d4cbc9..c758884728a5 100644 --- a/drivers/net/atl1e/atl1e_main.c +++ b/drivers/net/atl1e/atl1e_main.c @@ -1326,9 +1326,9 @@ static irqreturn_t atl1e_intr(int irq, void *data) AT_WRITE_REG(hw, REG_IMR, IMR_NORMAL_MASK & ~ISR_RX_EVENT); AT_WRITE_FLUSH(hw); - if (likely(netif_rx_schedule_prep( + if (likely(napi_schedule_prep( &adapter->napi))) - __netif_rx_schedule(&adapter->napi); + __napi_schedule(&adapter->napi); } } while (--max_ints > 0); /* re-enable Interrupt*/ @@ -1514,7 +1514,7 @@ static int atl1e_clean(struct napi_struct *napi, int budget) /* If no Tx and not enough Rx work done, exit the polling mode */ if (work_done < budget) { quit_polling: - netif_rx_complete(napi); + napi_complete(napi); imr_data = AT_READ_REG(&adapter->hw, REG_IMR); AT_WRITE_REG(&adapter->hw, REG_IMR, imr_data | ISR_RX_EVENT); /* test debug */ diff --git a/drivers/net/b44.c b/drivers/net/b44.c index c38512ebcea6..92aaaa1ee9f1 100644 --- a/drivers/net/b44.c +++ b/drivers/net/b44.c @@ -874,7 +874,7 @@ static int b44_poll(struct napi_struct *napi, int budget) } if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); b44_enable_ints(bp); } @@ -906,13 +906,13 @@ static irqreturn_t b44_interrupt(int irq, void *dev_id) goto irq_ack; } - if (netif_rx_schedule_prep(&bp->napi)) { + if (napi_schedule_prep(&bp->napi)) { /* NOTE: These writes are posted by the readback of * the ISTAT register below. */ bp->istat = istat; __b44_disable_ints(bp); - __netif_rx_schedule(&bp->napi); + __napi_schedule(&bp->napi); } else { printk(KERN_ERR PFX "%s: Error, poll already scheduled\n", dev->name); diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index d4a3dac21dcf..e817802b2483 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -3053,7 +3053,7 @@ bnx2_msi(int irq, void *dev_instance) if (unlikely(atomic_read(&bp->intr_sem) != 0)) return IRQ_HANDLED; - netif_rx_schedule(&bnapi->napi); + napi_schedule(&bnapi->napi); return IRQ_HANDLED; } @@ -3070,7 +3070,7 @@ bnx2_msi_1shot(int irq, void *dev_instance) if (unlikely(atomic_read(&bp->intr_sem) != 0)) return IRQ_HANDLED; - netif_rx_schedule(&bnapi->napi); + napi_schedule(&bnapi->napi); return IRQ_HANDLED; } @@ -3106,9 +3106,9 @@ bnx2_interrupt(int irq, void *dev_instance) if (unlikely(atomic_read(&bp->intr_sem) != 0)) return IRQ_HANDLED; - if (netif_rx_schedule_prep(&bnapi->napi)) { + if (napi_schedule_prep(&bnapi->napi)) { bnapi->last_status_idx = sblk->status_idx; - __netif_rx_schedule(&bnapi->napi); + __napi_schedule(&bnapi->napi); } return IRQ_HANDLED; @@ -3218,7 +3218,7 @@ static int bnx2_poll_msix(struct napi_struct *napi, int budget) rmb(); if (likely(!bnx2_has_fast_work(bnapi))) { - netif_rx_complete(napi); + napi_complete(napi); REG_WR(bp, BNX2_PCICFG_INT_ACK_CMD, bnapi->int_num | BNX2_PCICFG_INT_ACK_CMD_INDEX_VALID | bnapi->last_status_idx); @@ -3251,7 +3251,7 @@ static int bnx2_poll(struct napi_struct *napi, int budget) rmb(); if (likely(!bnx2_has_work(bnapi))) { - netif_rx_complete(napi); + napi_complete(napi); if (likely(bp->flags & BNX2_FLAG_USING_MSI_OR_MSIX)) { REG_WR(bp, BNX2_PCICFG_INT_ACK_CMD, BNX2_PCICFG_INT_ACK_CMD_INDEX_VALID | diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c index 074374ff93f3..21764bfc048e 100644 --- a/drivers/net/bnx2x_main.c +++ b/drivers/net/bnx2x_main.c @@ -1647,7 +1647,7 @@ 
static irqreturn_t bnx2x_msix_fp_int(int irq, void *fp_cookie) prefetch(&fp->status_blk->c_status_block.status_block_index); prefetch(&fp->status_blk->u_status_block.status_block_index); - netif_rx_schedule(&bnx2x_fp(bp, index, napi)); + napi_schedule(&bnx2x_fp(bp, index, napi)); return IRQ_HANDLED; } @@ -1686,7 +1686,7 @@ static irqreturn_t bnx2x_interrupt(int irq, void *dev_instance) prefetch(&fp->status_blk->c_status_block.status_block_index); prefetch(&fp->status_blk->u_status_block.status_block_index); - netif_rx_schedule(&bnx2x_fp(bp, 0, napi)); + napi_schedule(&bnx2x_fp(bp, 0, napi)); status &= ~mask; } @@ -9339,7 +9339,7 @@ static int bnx2x_poll(struct napi_struct *napi, int budget) #ifdef BNX2X_STOP_ON_ERROR poll_panic: #endif - netif_rx_complete(napi); + napi_complete(napi); bnx2x_ack_sb(bp, FP_SB_ID(fp), USTORM_ID, le16_to_cpu(fp->fp_u_idx), IGU_INT_NOP, 1); diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c index 840b3d1a22f5..bb46be275339 100644 --- a/drivers/net/cassini.c +++ b/drivers/net/cassini.c @@ -2506,7 +2506,7 @@ static irqreturn_t cas_interruptN(int irq, void *dev_id) if (status & INTR_RX_DONE_ALT) { /* handle rx separately */ #ifdef USE_NAPI cas_mask_intr(cp); - netif_rx_schedule(&cp->napi); + napi_schedule(&cp->napi); #else cas_rx_ringN(cp, ring, 0); #endif @@ -2557,7 +2557,7 @@ static irqreturn_t cas_interrupt1(int irq, void *dev_id) if (status & INTR_RX_DONE_ALT) { /* handle rx separately */ #ifdef USE_NAPI cas_mask_intr(cp); - netif_rx_schedule(&cp->napi); + napi_schedule(&cp->napi); #else cas_rx_ringN(cp, 1, 0); #endif @@ -2613,7 +2613,7 @@ static irqreturn_t cas_interrupt(int irq, void *dev_id) if (status & INTR_RX_DONE) { #ifdef USE_NAPI cas_mask_intr(cp); - netif_rx_schedule(&cp->napi); + napi_schedule(&cp->napi); #else cas_rx_ringN(cp, 0, 0); #endif @@ -2691,7 +2691,7 @@ rx_comp: #endif spin_unlock_irqrestore(&cp->lock, flags); if (enable_intr) { - netif_rx_complete(napi); + napi_complete(napi); cas_unmask_intr(cp); } return credits; diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c index d984b7995763..840da83fb3cf 100644 --- a/drivers/net/chelsio/sge.c +++ b/drivers/net/chelsio/sge.c @@ -1612,7 +1612,7 @@ int t1_poll(struct napi_struct *napi, int budget) int work_done = process_responses(adapter, budget); if (likely(work_done < budget)) { - netif_rx_complete(napi); + napi_complete(napi); writel(adapter->sge->respQ.cidx, adapter->regs + A_SG_SLEEPING); } @@ -1630,7 +1630,7 @@ irqreturn_t t1_interrupt(int irq, void *data) if (napi_schedule_prep(&adapter->napi)) { if (process_pure_responses(adapter)) - __netif_rx_schedule(&adapter->napi); + __napi_schedule(&adapter->napi); else { /* no data, no NAPI needed */ writel(sge->respQ.cidx, adapter->regs + A_SG_SLEEPING); diff --git a/drivers/net/cpmac.c b/drivers/net/cpmac.c index f66548751c38..4dad04e91f6d 100644 --- a/drivers/net/cpmac.c +++ b/drivers/net/cpmac.c @@ -428,7 +428,7 @@ static int cpmac_poll(struct napi_struct *napi, int budget) printk(KERN_WARNING "%s: rx: polling, but no queue\n", priv->dev->name); spin_unlock(&priv->rx_lock); - netif_rx_complete(napi); + napi_complete(napi); return 0; } @@ -514,7 +514,7 @@ static int cpmac_poll(struct napi_struct *napi, int budget) if (processed == 0) { /* we ran out of packets to read, * revert to interrupt-driven mode */ - netif_rx_complete(napi); + napi_complete(napi); cpmac_write(priv->regs, CPMAC_RX_INT_ENABLE, 1); return 0; } @@ -536,7 +536,7 @@ fatal_error: } spin_unlock(&priv->rx_lock); - netif_rx_complete(napi); + 
napi_complete(napi); netif_tx_stop_all_queues(priv->dev); napi_disable(&priv->napi); @@ -802,9 +802,9 @@ static irqreturn_t cpmac_irq(int irq, void *dev_id) if (status & MAC_INT_RX) { queue = (status >> 8) & 7; - if (netif_rx_schedule_prep(&priv->napi)) { + if (napi_schedule_prep(&priv->napi)) { cpmac_write(priv->regs, CPMAC_RX_INT_CLEAR, 1 << queue); - __netif_rx_schedule(&priv->napi); + __napi_schedule(&priv->napi); } } diff --git a/drivers/net/e100.c b/drivers/net/e100.c index 86bb876fb123..861d2eeaa43c 100644 --- a/drivers/net/e100.c +++ b/drivers/net/e100.c @@ -1944,9 +1944,9 @@ static irqreturn_t e100_intr(int irq, void *dev_id) if (stat_ack & stat_ack_rnr) nic->ru_running = RU_SUSPENDED; - if (likely(netif_rx_schedule_prep(&nic->napi))) { + if (likely(napi_schedule_prep(&nic->napi))) { e100_disable_irq(nic); - __netif_rx_schedule(&nic->napi); + __napi_schedule(&nic->napi); } return IRQ_HANDLED; @@ -1962,7 +1962,7 @@ static int e100_poll(struct napi_struct *napi, int budget) /* If budget not fully consumed, exit the polling mode */ if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); e100_enable_irq(nic); } diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 26474c92193f..ffe466e0afb9 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -3687,12 +3687,12 @@ static irqreturn_t e1000_intr_msi(int irq, void *data) mod_timer(&adapter->watchdog_timer, jiffies + 1); } - if (likely(netif_rx_schedule_prep(&adapter->napi))) { + if (likely(napi_schedule_prep(&adapter->napi))) { adapter->total_tx_bytes = 0; adapter->total_tx_packets = 0; adapter->total_rx_bytes = 0; adapter->total_rx_packets = 0; - __netif_rx_schedule(&adapter->napi); + __napi_schedule(&adapter->napi); } else e1000_irq_enable(adapter); @@ -3747,12 +3747,12 @@ static irqreturn_t e1000_intr(int irq, void *data) ew32(IMC, ~0); E1000_WRITE_FLUSH(); } - if (likely(netif_rx_schedule_prep(&adapter->napi))) { + if (likely(napi_schedule_prep(&adapter->napi))) { adapter->total_tx_bytes = 0; adapter->total_tx_packets = 0; adapter->total_rx_bytes = 0; adapter->total_rx_packets = 0; - __netif_rx_schedule(&adapter->napi); + __napi_schedule(&adapter->napi); } else /* this really should not happen! 
if it does it is basically a * bug, but not a hard error, so enable ints and continue */ @@ -3793,7 +3793,7 @@ static int e1000_clean(struct napi_struct *napi, int budget) if (work_done < budget) { if (likely(adapter->itr_setting & 3)) e1000_set_itr(adapter); - netif_rx_complete(napi); + napi_complete(napi); e1000_irq_enable(adapter); } diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c index 91817d0afcaf..ff5b66adfc42 100644 --- a/drivers/net/e1000e/netdev.c +++ b/drivers/net/e1000e/netdev.c @@ -1179,12 +1179,12 @@ static irqreturn_t e1000_intr_msi(int irq, void *data) mod_timer(&adapter->watchdog_timer, jiffies + 1); } - if (netif_rx_schedule_prep(&adapter->napi)) { + if (napi_schedule_prep(&adapter->napi)) { adapter->total_tx_bytes = 0; adapter->total_tx_packets = 0; adapter->total_rx_bytes = 0; adapter->total_rx_packets = 0; - __netif_rx_schedule(&adapter->napi); + __napi_schedule(&adapter->napi); } return IRQ_HANDLED; @@ -1246,12 +1246,12 @@ static irqreturn_t e1000_intr(int irq, void *data) mod_timer(&adapter->watchdog_timer, jiffies + 1); } - if (netif_rx_schedule_prep(&adapter->napi)) { + if (napi_schedule_prep(&adapter->napi)) { adapter->total_tx_bytes = 0; adapter->total_tx_packets = 0; adapter->total_rx_bytes = 0; adapter->total_rx_packets = 0; - __netif_rx_schedule(&adapter->napi); + __napi_schedule(&adapter->napi); } return IRQ_HANDLED; @@ -1320,10 +1320,10 @@ static irqreturn_t e1000_intr_msix_rx(int irq, void *data) adapter->rx_ring->set_itr = 0; } - if (netif_rx_schedule_prep(&adapter->napi)) { + if (napi_schedule_prep(&adapter->napi)) { adapter->total_rx_bytes = 0; adapter->total_rx_packets = 0; - __netif_rx_schedule(&adapter->napi); + __napi_schedule(&adapter->napi); } return IRQ_HANDLED; } @@ -2028,7 +2028,7 @@ clean_rx: if (work_done < budget) { if (adapter->itr_setting & 3) e1000_set_itr(adapter); - netif_rx_complete(napi); + napi_complete(napi); if (adapter->msix_entries) ew32(IMS, adapter->rx_ring->ims_val); else diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index dfe92264e825..8dc2047da5c0 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -830,7 +830,7 @@ static int ehea_poll(struct napi_struct *napi, int budget) while ((rx != budget) || force_irq) { pr->poll_counter = 0; force_irq = 0; - netif_rx_complete(napi); + napi_complete(napi); ehea_reset_cq_ep(pr->recv_cq); ehea_reset_cq_ep(pr->send_cq); ehea_reset_cq_n1(pr->recv_cq); @@ -841,7 +841,7 @@ static int ehea_poll(struct napi_struct *napi, int budget) if (!cqe && !cqe_skb) return rx; - if (!netif_rx_reschedule(napi)) + if (!napi_reschedule(napi)) return rx; cqe_skb = ehea_proc_cqes(pr, EHEA_POLL_MAX_CQES); @@ -859,7 +859,7 @@ static void ehea_netpoll(struct net_device *dev) int i; for (i = 0; i < port->num_def_qps; i++) - netif_rx_schedule(&port->port_res[i].napi); + napi_schedule(&port->port_res[i].napi); } #endif @@ -867,7 +867,7 @@ static irqreturn_t ehea_recv_irq_handler(int irq, void *param) { struct ehea_port_res *pr = param; - netif_rx_schedule(&pr->napi); + napi_schedule(&pr->napi); return IRQ_HANDLED; } diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c index 7d60551d538f..4617956821cd 100644 --- a/drivers/net/enic/enic_main.c +++ b/drivers/net/enic/enic_main.c @@ -411,8 +411,8 @@ static irqreturn_t enic_isr_legacy(int irq, void *data) } if (ENIC_TEST_INTR(pba, ENIC_INTX_WQ_RQ)) { - if (netif_rx_schedule_prep(&enic->napi)) - __netif_rx_schedule(&enic->napi); + if (napi_schedule_prep(&enic->napi)) + 
__napi_schedule(&enic->napi); } else { vnic_intr_unmask(&enic->intr[ENIC_INTX_WQ_RQ]); } @@ -440,7 +440,7 @@ static irqreturn_t enic_isr_msi(int irq, void *data) * writes). */ - netif_rx_schedule(&enic->napi); + napi_schedule(&enic->napi); return IRQ_HANDLED; } @@ -450,7 +450,7 @@ static irqreturn_t enic_isr_msix_rq(int irq, void *data) struct enic *enic = data; /* schedule NAPI polling for RQ cleanup */ - netif_rx_schedule(&enic->napi); + napi_schedule(&enic->napi); return IRQ_HANDLED; } @@ -1068,7 +1068,7 @@ static int enic_poll(struct napi_struct *napi, int budget) if (netdev->features & NETIF_F_LRO) lro_flush_all(&enic->lro_mgr); - netif_rx_complete(napi); + napi_complete(napi); vnic_intr_unmask(&enic->intr[ENIC_MSIX_RQ]); } @@ -1112,7 +1112,7 @@ static int enic_poll_msix(struct napi_struct *napi, int budget) if (netdev->features & NETIF_F_LRO) lro_flush_all(&enic->lro_mgr); - netif_rx_complete(napi); + napi_complete(napi); vnic_intr_unmask(&enic->intr[ENIC_MSIX_RQ]); } diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c index a539bc3163cf..b60e27dfcfa7 100644 --- a/drivers/net/epic100.c +++ b/drivers/net/epic100.c @@ -1114,9 +1114,9 @@ static irqreturn_t epic_interrupt(int irq, void *dev_instance) if ((status & EpicNapiEvent) && !ep->reschedule_in_poll) { spin_lock(&ep->napi_lock); - if (netif_rx_schedule_prep(&ep->napi)) { + if (napi_schedule_prep(&ep->napi)) { epic_napi_irq_off(dev, ep); - __netif_rx_schedule(&ep->napi); + __napi_schedule(&ep->napi); } else ep->reschedule_in_poll++; spin_unlock(&ep->napi_lock); @@ -1293,7 +1293,7 @@ rx_action: more = ep->reschedule_in_poll; if (!more) { - __netif_rx_complete(napi); + __napi_complete(napi); outl(EpicNapiEvent, ioaddr + INTSTAT); epic_napi_irq_on(dev, ep); } else diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 5b910cf63740..875509d7d86b 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -1760,7 +1760,7 @@ static void nv_do_rx_refill(unsigned long data) struct fe_priv *np = netdev_priv(dev); /* Just reschedule NAPI rx processing */ - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); } #else static void nv_do_rx_refill(unsigned long data) @@ -3406,7 +3406,7 @@ static irqreturn_t nv_nic_irq(int foo, void *data) #ifdef CONFIG_FORCEDETH_NAPI if (events & NVREG_IRQ_RX_ALL) { spin_lock(&np->lock); - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); /* Disable furthur receive irq's */ np->irqmask &= ~NVREG_IRQ_RX_ALL; @@ -3523,7 +3523,7 @@ static irqreturn_t nv_nic_irq_optimized(int foo, void *data) #ifdef CONFIG_FORCEDETH_NAPI if (events & NVREG_IRQ_RX_ALL) { spin_lock(&np->lock); - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); /* Disable furthur receive irq's */ np->irqmask &= ~NVREG_IRQ_RX_ALL; @@ -3680,7 +3680,7 @@ static int nv_napi_poll(struct napi_struct *napi, int budget) /* re-enable receive interrupts */ spin_lock_irqsave(&np->lock, flags); - __netif_rx_complete(napi); + __napi_complete(napi); np->irqmask |= NVREG_IRQ_RX_ALL; if (np->msi_flags & NV_MSI_X_ENABLED) @@ -3706,7 +3706,7 @@ static irqreturn_t nv_nic_irq_rx(int foo, void *data) writel(NVREG_IRQ_RX_ALL, base + NvRegMSIXIrqStatus); if (events) { - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); /* disable receive interrupts on the nic */ writel(NVREG_IRQ_RX_ALL, base + NvRegIrqMask); pci_push(base); diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c index ce900e54d8d1..b037ce9857bf 100644 --- a/drivers/net/fs_enet/fs_enet-main.c +++ 
b/drivers/net/fs_enet/fs_enet-main.c @@ -209,7 +209,7 @@ static int fs_enet_rx_napi(struct napi_struct *napi, int budget) if (received < budget) { /* done */ - netif_rx_complete(napi); + napi_complete(napi); (*fep->ops->napi_enable_rx)(dev); } return received; @@ -478,7 +478,7 @@ fs_enet_interrupt(int irq, void *dev_id) /* NOTE: it is possible for FCCs in NAPI mode */ /* to submit a spurious interrupt while in poll */ if (napi_ok) - __netif_rx_schedule(&fep->napi); + __napi_schedule(&fep->napi); } } diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index ea530673236e..2e76699f8104 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -1627,9 +1627,9 @@ static void gfar_schedule_cleanup(struct net_device *dev) spin_lock_irqsave(&priv->txlock, flags); spin_lock(&priv->rxlock); - if (netif_rx_schedule_prep(&priv->napi)) { + if (napi_schedule_prep(&priv->napi)) { gfar_write(&priv->regs->imask, IMASK_RTX_DISABLED); - __netif_rx_schedule(&priv->napi); + __napi_schedule(&priv->napi); } spin_unlock(&priv->rxlock); @@ -1886,7 +1886,7 @@ static int gfar_poll(struct napi_struct *napi, int budget) return budget; if (rx_cleaned < budget) { - netif_rx_complete(napi); + napi_complete(napi); /* Clear the halt bit in RSTAT */ gfar_write(&priv->regs->rstat, RSTAT_CLEAR_RHALT); diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c index dfa6348ac1dc..5c6315df86b9 100644 --- a/drivers/net/ibmveth.c +++ b/drivers/net/ibmveth.c @@ -1028,10 +1028,10 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) ibmveth_assert(lpar_rc == H_SUCCESS); - netif_rx_complete(napi); + napi_complete(napi); if (ibmveth_rxq_pending_buffer(adapter) && - netif_rx_reschedule(napi)) { + napi_reschedule(napi)) { lpar_rc = h_vio_signal(adapter->vdev->unit_address, VIO_IRQ_DISABLE); goto restart_poll; @@ -1047,11 +1047,11 @@ static irqreturn_t ibmveth_interrupt(int irq, void *dev_instance) struct ibmveth_adapter *adapter = netdev_priv(netdev); unsigned long lpar_rc; - if (netif_rx_schedule_prep(&adapter->napi)) { + if (napi_schedule_prep(&adapter->napi)) { lpar_rc = h_vio_signal(adapter->vdev->unit_address, VIO_IRQ_DISABLE); ibmveth_assert(lpar_rc == H_SUCCESS); - __netif_rx_schedule(&adapter->napi); + __napi_schedule(&adapter->napi); } return IRQ_HANDLED; } diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c index b82b0fb2056c..3806bb9d8bfa 100644 --- a/drivers/net/igb/igb_main.c +++ b/drivers/net/igb/igb_main.c @@ -3386,8 +3386,8 @@ static irqreturn_t igb_msix_rx(int irq, void *data) igb_write_itr(rx_ring); - if (netif_rx_schedule_prep(&rx_ring->napi)) - __netif_rx_schedule(&rx_ring->napi); + if (napi_schedule_prep(&rx_ring->napi)) + __napi_schedule(&rx_ring->napi); #ifdef CONFIG_IGB_DCA if (rx_ring->adapter->flags & IGB_FLAG_DCA_ENABLED) @@ -3539,7 +3539,7 @@ static irqreturn_t igb_intr_msi(int irq, void *data) mod_timer(&adapter->watchdog_timer, jiffies + 1); } - netif_rx_schedule(&adapter->rx_ring[0].napi); + napi_schedule(&adapter->rx_ring[0].napi); return IRQ_HANDLED; } @@ -3577,7 +3577,7 @@ static irqreturn_t igb_intr(int irq, void *data) mod_timer(&adapter->watchdog_timer, jiffies + 1); } - netif_rx_schedule(&adapter->rx_ring[0].napi); + napi_schedule(&adapter->rx_ring[0].napi); return IRQ_HANDLED; } @@ -3612,7 +3612,7 @@ static int igb_poll(struct napi_struct *napi, int budget) !netif_running(netdev)) { if (adapter->itr_setting & 3) igb_set_itr(adapter); - netif_rx_complete(napi); + napi_complete(napi); if (!test_bit(__IGB_DOWN, &adapter->state)) igb_irq_enable(adapter); 
return 0; @@ -3638,7 +3638,7 @@ static int igb_clean_rx_ring_msix(struct napi_struct *napi, int budget) /* If not enough Rx work done, exit the polling mode */ if ((work_done == 0) || !netif_running(netdev)) { - netif_rx_complete(napi); + napi_complete(napi); if (adapter->itr_setting & 3) { if (adapter->num_rx_queues == 1) diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c index eee28d395682..e2ef16b29700 100644 --- a/drivers/net/ixgb/ixgb_main.c +++ b/drivers/net/ixgb/ixgb_main.c @@ -1721,14 +1721,14 @@ ixgb_intr(int irq, void *data) if (!test_bit(__IXGB_DOWN, &adapter->flags)) mod_timer(&adapter->watchdog_timer, jiffies); - if (netif_rx_schedule_prep(&adapter->napi)) { + if (napi_schedule_prep(&adapter->napi)) { /* Disable interrupts and register for poll. The flush of the posted write is intentionally left out. */ IXGB_WRITE_REG(&adapter->hw, IMC, ~0); - __netif_rx_schedule(&adapter->napi); + __napi_schedule(&adapter->napi); } return IRQ_HANDLED; } @@ -1749,7 +1749,7 @@ ixgb_clean(struct napi_struct *napi, int budget) /* If budget not fully consumed, exit the polling mode */ if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); if (!test_bit(__IXGB_DOWN, &adapter->flags)) ixgb_irq_enable(adapter); } diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index d2f4d5f508b7..7489094bbbc8 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -1015,7 +1015,7 @@ static irqreturn_t ixgbe_msix_clean_rx(int irq, void *data) rx_ring = &(adapter->rx_ring[r_idx]); /* disable interrupts on this vector only */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, rx_ring->v_idx); - netif_rx_schedule(&q_vector->napi); + napi_schedule(&q_vector->napi); return IRQ_HANDLED; } @@ -1056,7 +1056,7 @@ static int ixgbe_clean_rxonly(struct napi_struct *napi, int budget) /* If all Rx work done, exit the polling mode */ if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); if (adapter->itr_setting & 3) ixgbe_set_itr_msix(q_vector); if (!test_bit(__IXGBE_DOWN, &adapter->state)) @@ -1105,7 +1105,7 @@ static int ixgbe_clean_rxonly_many(struct napi_struct *napi, int budget) rx_ring = &(adapter->rx_ring[r_idx]); /* If all Rx work done, exit the polling mode */ if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); if (adapter->itr_setting & 3) ixgbe_set_itr_msix(q_vector); if (!test_bit(__IXGBE_DOWN, &adapter->state)) @@ -1381,13 +1381,13 @@ static irqreturn_t ixgbe_intr(int irq, void *data) ixgbe_check_fan_failure(adapter, eicr); - if (netif_rx_schedule_prep(&adapter->q_vector[0].napi)) { + if (napi_schedule_prep(&adapter->q_vector[0].napi)) { adapter->tx_ring[0].total_packets = 0; adapter->tx_ring[0].total_bytes = 0; adapter->rx_ring[0].total_packets = 0; adapter->rx_ring[0].total_bytes = 0; /* would disable interrupts here but EIAM disabled it */ - __netif_rx_schedule(&adapter->q_vector[0].napi); + __napi_schedule(&adapter->q_vector[0].napi); } return IRQ_HANDLED; @@ -2317,7 +2317,7 @@ static int ixgbe_poll(struct napi_struct *napi, int budget) /* If budget not fully consumed, exit the polling mode */ if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); if (adapter->itr_setting & 3) ixgbe_set_itr(adapter); if (!test_bit(__IXGBE_DOWN, &adapter->state)) diff --git a/drivers/net/ixp2000/ixpdev.c b/drivers/net/ixp2000/ixpdev.c index 014745720560..d3bf2f017cc2 100644 --- a/drivers/net/ixp2000/ixpdev.c +++ b/drivers/net/ixp2000/ixpdev.c @@ -141,7 +141,7 @@ static int 
ixpdev_poll(struct napi_struct *napi, int budget) break; } while (ixp2000_reg_read(IXP2000_IRQ_THD_RAW_STATUS_A_0) & 0x00ff); - netif_rx_complete(napi); + napi_complete(napi); ixp2000_reg_write(IXP2000_IRQ_THD_ENABLE_SET_A_0, 0x00ff); return rx; @@ -204,7 +204,7 @@ static irqreturn_t ixpdev_interrupt(int irq, void *dev_id) ixp2000_reg_wrb(IXP2000_IRQ_THD_ENABLE_CLEAR_A_0, 0x00ff); if (likely(napi_schedule_prep(&ip->napi))) { - __netif_rx_schedule(&ip->napi); + __napi_schedule(&ip->napi); } else { printk(KERN_CRIT "ixp2000: irq while polling!!\n"); } diff --git a/drivers/net/jme.h b/drivers/net/jme.h index 5154411b5e6b..e321c678b11c 100644 --- a/drivers/net/jme.h +++ b/drivers/net/jme.h @@ -398,15 +398,15 @@ struct jme_ring { #define JME_NAPI_WEIGHT(w) int w #define JME_NAPI_WEIGHT_VAL(w) w #define JME_NAPI_WEIGHT_SET(w, r) -#define JME_RX_COMPLETE(dev, napis) netif_rx_complete(napis) +#define JME_RX_COMPLETE(dev, napis) napi_complete(napis) #define JME_NAPI_ENABLE(priv) napi_enable(&priv->napi); #define JME_NAPI_DISABLE(priv) \ if (!napi_disable_pending(&priv->napi)) \ napi_disable(&priv->napi); #define JME_RX_SCHEDULE_PREP(priv) \ - netif_rx_schedule_prep(&priv->napi) + napi_schedule_prep(&priv->napi) #define JME_RX_SCHEDULE(priv) \ - __netif_rx_schedule(&priv->napi); + __napi_schedule(&priv->napi); /* * Jmac Adapter Private data diff --git a/drivers/net/korina.c b/drivers/net/korina.c index 75010cac76ac..38d6649a29c4 100644 --- a/drivers/net/korina.c +++ b/drivers/net/korina.c @@ -334,7 +334,7 @@ static irqreturn_t korina_rx_dma_interrupt(int irq, void *dev_id) DMA_STAT_HALT | DMA_STAT_ERR), &lp->rx_dma_regs->dmasm); - netif_rx_schedule(&lp->napi); + napi_schedule(&lp->napi); if (dmas & DMA_STAT_ERR) printk(KERN_ERR DRV_NAME "%s: DMA error\n", dev->name); @@ -468,7 +468,7 @@ static int korina_poll(struct napi_struct *napi, int budget) work_done = korina_rx(dev, budget); if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); writel(readl(&lp->rx_dma_regs->dmasm) & ~(DMA_STAT_DONE | DMA_STAT_HALT | DMA_STAT_ERR), diff --git a/drivers/net/macb.c b/drivers/net/macb.c index f6c4936e2fa8..dc33d51213d7 100644 --- a/drivers/net/macb.c +++ b/drivers/net/macb.c @@ -527,7 +527,7 @@ static int macb_poll(struct napi_struct *napi, int budget) * this function was called last time, and no packets * have been received since. */ - netif_rx_complete(napi); + napi_complete(napi); goto out; } @@ -538,13 +538,13 @@ static int macb_poll(struct napi_struct *napi, int budget) dev_warn(&bp->pdev->dev, "No RX buffers complete, status = %02lx\n", (unsigned long)status); - netif_rx_complete(napi); + napi_complete(napi); goto out; } work_done = macb_rx(bp, budget); if (work_done < budget) - netif_rx_complete(napi); + napi_complete(napi); /* * We've done what we can to clean the buffers. 
Make sure we @@ -579,7 +579,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id) } if (status & MACB_RX_INT_FLAGS) { - if (netif_rx_schedule_prep(&bp->napi)) { + if (napi_schedule_prep(&bp->napi)) { /* * There's no point taking any more interrupts * until we have processed the buffers @@ -587,7 +587,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id) macb_writel(bp, IDR, MACB_RX_INT_FLAGS); dev_dbg(&bp->pdev->dev, "scheduling RX softirq\n"); - __netif_rx_schedule(&bp->napi); + __napi_schedule(&bp->napi); } } diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c index c61b0bdca1a4..ac55ebd2f146 100644 --- a/drivers/net/mlx4/en_rx.c +++ b/drivers/net/mlx4/en_rx.c @@ -814,7 +814,7 @@ void mlx4_en_rx_irq(struct mlx4_cq *mcq) struct mlx4_en_priv *priv = netdev_priv(cq->dev); if (priv->port_up) - netif_rx_schedule(&cq->napi); + napi_schedule(&cq->napi); else mlx4_en_arm_cq(priv, cq); } @@ -834,7 +834,7 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) INC_PERF_COUNTER(priv->pstats.napi_quota); else { /* Done for now */ - netif_rx_complete(napi); + napi_complete(napi); mlx4_en_arm_cq(priv, cq); } return done; diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index e9c1296b267e..2dacb8852dc3 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1514,7 +1514,7 @@ static int myri10ge_poll(struct napi_struct *napi, int budget) work_done = myri10ge_clean_rx_done(ss, budget); if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); put_be32(htonl(3), ss->irq_claim); } return work_done; @@ -1532,7 +1532,7 @@ static irqreturn_t myri10ge_intr(int irq, void *arg) /* an interrupt on a non-zero receive-only slice is implicitly * valid since MSI-X irqs are not shared */ if ((mgp->dev->real_num_tx_queues == 1) && (ss != mgp->ss)) { - netif_rx_schedule(&ss->napi); + napi_schedule(&ss->napi); return (IRQ_HANDLED); } @@ -1543,7 +1543,7 @@ static irqreturn_t myri10ge_intr(int irq, void *arg) /* low bit indicates receives are present, so schedule * napi poll handler */ if (stats->valid & 1) - netif_rx_schedule(&ss->napi); + napi_schedule(&ss->napi); if (!mgp->msi_enabled && !mgp->msix_enabled) { put_be32(0, mgp->irq_deassert); diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c index c5dec54251bf..c23a58624a33 100644 --- a/drivers/net/natsemi.c +++ b/drivers/net/natsemi.c @@ -2198,10 +2198,10 @@ static irqreturn_t intr_handler(int irq, void *dev_instance) prefetch(&np->rx_skbuff[np->cur_rx % RX_RING_SIZE]); - if (netif_rx_schedule_prep(&np->napi)) { + if (napi_schedule_prep(&np->napi)) { /* Disable interrupts and register for poll */ natsemi_irq_disable(dev); - __netif_rx_schedule(&np->napi); + __napi_schedule(&np->napi); } else printk(KERN_WARNING "%s: Ignoring interrupt, status %#08x, mask %#08x.\n", @@ -2253,7 +2253,7 @@ static int natsemi_poll(struct napi_struct *napi, int budget) np->intr_status = readl(ioaddr + IntrStatus); } while (np->intr_status); - netif_rx_complete(napi); + napi_complete(napi); /* Reenable interrupts providing nothing is trying to shut * the chip down. 
*/ diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c index d854f07ef4d3..1139e637f5da 100644 --- a/drivers/net/netxen/netxen_nic_main.c +++ b/drivers/net/netxen/netxen_nic_main.c @@ -1631,7 +1631,7 @@ static int netxen_nic_poll(struct napi_struct *napi, int budget) } if ((work_done < budget) && tx_complete) { - netif_rx_complete(&adapter->napi); + napi_complete(&adapter->napi); netxen_nic_enable_int(adapter); } diff --git a/drivers/net/niu.c b/drivers/net/niu.c index 0c0b752315ca..4a5a089fa301 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -3669,7 +3669,7 @@ static int niu_poll(struct napi_struct *napi, int budget) work_done = niu_poll_core(np, lp, budget); if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); niu_ldg_rearm(np, lp, 1); } return work_done; @@ -4088,12 +4088,12 @@ static void __niu_fastpath_interrupt(struct niu *np, int ldg, u64 v0) static void niu_schedule_napi(struct niu *np, struct niu_ldg *lp, u64 v0, u64 v1, u64 v2) { - if (likely(netif_rx_schedule_prep(&lp->napi))) { + if (likely(napi_schedule_prep(&lp->napi))) { lp->v0 = v0; lp->v1 = v1; lp->v2 = v2; __niu_fastpath_interrupt(np, lp->ldg_num, v0); - __netif_rx_schedule(&lp->napi); + __napi_schedule(&lp->napi); } } diff --git a/drivers/net/pasemi_mac.c b/drivers/net/pasemi_mac.c index d0349e7d73ea..5eeb5a87b738 100644 --- a/drivers/net/pasemi_mac.c +++ b/drivers/net/pasemi_mac.c @@ -970,7 +970,7 @@ static irqreturn_t pasemi_mac_rx_intr(int irq, void *data) if (*chan->status & PAS_STATUS_ERROR) reg |= PAS_IOB_DMA_RXCH_RESET_DINTC; - netif_rx_schedule(&mac->napi); + napi_schedule(&mac->napi); write_iob_reg(PAS_IOB_DMA_RXCH_RESET(chan->chno), reg); @@ -1010,7 +1010,7 @@ static irqreturn_t pasemi_mac_tx_intr(int irq, void *data) mod_timer(&txring->clean_timer, jiffies + (TX_CLEAN_INTERVAL)*2); - netif_rx_schedule(&mac->napi); + napi_schedule(&mac->napi); if (reg) write_iob_reg(PAS_IOB_DMA_TXCH_RESET(chan->chno), reg); @@ -1639,7 +1639,7 @@ static int pasemi_mac_poll(struct napi_struct *napi, int budget) pkts = pasemi_mac_clean_rx(rx_ring(mac), budget); if (pkts < budget) { /* all done, no more packets present */ - netif_rx_complete(napi); + napi_complete(napi); pasemi_mac_restart_rx_intr(mac); pasemi_mac_restart_tx_intr(mac); diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 665a4286da39..80124fac65fa 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -1397,7 +1397,7 @@ static int pcnet32_poll(struct napi_struct *napi, int budget) if (work_done < budget) { spin_lock_irqsave(&lp->lock, flags); - __netif_rx_complete(napi); + __napi_complete(napi); /* clear interrupt masks */ val = lp->a.read_csr(ioaddr, CSR3); @@ -2592,14 +2592,14 @@ pcnet32_interrupt(int irq, void *dev_id) dev->name, csr0); /* unlike for the lance, there is no restart needed */ } - if (netif_rx_schedule_prep(&lp->napi)) { + if (napi_schedule_prep(&lp->napi)) { u16 val; /* set interrupt masks */ val = lp->a.read_csr(ioaddr, CSR3); val |= 0x5f00; lp->a.write_csr(ioaddr, CSR3, val); mmiowb(); - __netif_rx_schedule(&lp->napi); + __napi_schedule(&lp->napi); break; } csr0 = lp->a.read_csr(ioaddr, CSR0); diff --git a/drivers/net/qla3xxx.c b/drivers/net/qla3xxx.c index 189ec29ac7a4..8b2823c8dccf 100644 --- a/drivers/net/qla3xxx.c +++ b/drivers/net/qla3xxx.c @@ -2292,7 +2292,7 @@ static int ql_poll(struct napi_struct *napi, int budget) if (tx_cleaned + rx_cleaned != budget) { spin_lock_irqsave(&qdev->hw_lock, hw_flags); - __netif_rx_complete(napi); + 
__napi_complete(napi); ql_update_small_bufq_prod_index(qdev); ql_update_lrg_bufq_prod_index(qdev); writel(qdev->rsp_consumer_index, @@ -2351,8 +2351,8 @@ static irqreturn_t ql3xxx_isr(int irq, void *dev_id) spin_unlock(&qdev->adapter_lock); } else if (value & ISP_IMR_DISABLE_CMPL_INT) { ql_disable_interrupts(qdev); - if (likely(netif_rx_schedule_prep(&qdev->napi))) { - __netif_rx_schedule(&qdev->napi); + if (likely(napi_schedule_prep(&qdev->napi))) { + __napi_schedule(&qdev->napi); } } else { return IRQ_NONE; diff --git a/drivers/net/qlge/qlge_main.c b/drivers/net/qlge/qlge_main.c index 45421c8b6010..16eb9dd85286 100644 --- a/drivers/net/qlge/qlge_main.c +++ b/drivers/net/qlge/qlge_main.c @@ -1642,7 +1642,7 @@ static int ql_napi_poll_msix(struct napi_struct *napi, int budget) rx_ring->cq_id); if (work_done < budget) { - __netif_rx_complete(napi); + __napi_complete(napi); ql_enable_completion_interrupt(qdev, rx_ring->irq); } return work_done; @@ -1727,7 +1727,7 @@ static irqreturn_t qlge_msix_tx_isr(int irq, void *dev_id) static irqreturn_t qlge_msix_rx_isr(int irq, void *dev_id) { struct rx_ring *rx_ring = dev_id; - netif_rx_schedule(&rx_ring->napi); + napi_schedule(&rx_ring->napi); return IRQ_HANDLED; } @@ -1813,7 +1813,7 @@ static irqreturn_t qlge_isr(int irq, void *dev_id) &rx_ring->rx_work, 0); else - netif_rx_schedule(&rx_ring->napi); + napi_schedule(&rx_ring->napi); work_done++; } } diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c index 72fd9e97c190..cc0f886b0c29 100644 --- a/drivers/net/r6040.c +++ b/drivers/net/r6040.c @@ -677,7 +677,7 @@ static int r6040_poll(struct napi_struct *napi, int budget) work_done = r6040_rx(dev, budget); if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); /* Enable RX interrupt */ iowrite16(ioread16(ioaddr + MIER) | RX_INTS, ioaddr + MIER); } @@ -714,7 +714,7 @@ static irqreturn_t r6040_interrupt(int irq, void *dev_id) /* Mask off RX interrupt */ misr &= ~RX_INTS; - netif_rx_schedule(&lp->napi); + napi_schedule(&lp->napi); } /* TX interrupt request */ diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 2c73ca606b35..1c4a980253fe 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -3581,8 +3581,8 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance) RTL_W16(IntrMask, tp->intr_event & ~tp->napi_event); tp->intr_mask = ~tp->napi_event; - if (likely(netif_rx_schedule_prep(&tp->napi))) - __netif_rx_schedule(&tp->napi); + if (likely(napi_schedule_prep(&tp->napi))) + __napi_schedule(&tp->napi); else if (netif_msg_intr(tp)) { printk(KERN_INFO "%s: interrupt %04x in poll\n", dev->name, status); @@ -3603,7 +3603,7 @@ static int rtl8169_poll(struct napi_struct *napi, int budget) rtl8169_tx_interrupt(dev, tp, ioaddr); if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); tp->intr_mask = 0xffff; /* * 20040426: the barrier is not strictly required but the diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c index f5c57c059bca..2a96a10fd0cf 100644 --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -2852,7 +2852,7 @@ static int s2io_poll_msix(struct napi_struct *napi, int budget) s2io_chk_rx_buffers(nic, ring); if (pkts_processed < budget_org) { - netif_rx_complete(napi); + napi_complete(napi); /*Re Enable MSI-Rx Vector*/ addr = (u8 __iomem *)&bar0->xmsi_mask_reg; addr += 7 - ring->ring_no; @@ -2889,7 +2889,7 @@ static int s2io_poll_inta(struct napi_struct *napi, int budget) break; } if (pkts_processed < budget_org) { - netif_rx_complete(napi); + napi_complete(napi); /* Re enable the 
Rx interrupts for the ring */ writeq(0, &bar0->rx_traffic_mask); readl(&bar0->rx_traffic_mask); @@ -4342,7 +4342,7 @@ static irqreturn_t s2io_msix_ring_handle(int irq, void *dev_id) val8 = (ring->ring_no == 0) ? 0x7f : 0xff; writeb(val8, addr); val8 = readb(addr); - netif_rx_schedule(&ring->napi); + napi_schedule(&ring->napi); } else { rx_intr_handler(ring, 0); s2io_chk_rx_buffers(sp, ring); @@ -4789,7 +4789,7 @@ static irqreturn_t s2io_isr(int irq, void *dev_id) if (config->napi) { if (reason & GEN_INTR_RXTRAFFIC) { - netif_rx_schedule(&sp->napi); + napi_schedule(&sp->napi); writeq(S2IO_MINUS_ONE, &bar0->rx_traffic_mask); writeq(S2IO_MINUS_ONE, &bar0->rx_traffic_int); readl(&bar0->rx_traffic_int); diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c index 31e38fae017f..3e11c1d6d792 100644 --- a/drivers/net/sb1250-mac.c +++ b/drivers/net/sb1250-mac.c @@ -2039,9 +2039,9 @@ static irqreturn_t sbmac_intr(int irq,void *dev_instance) sbdma_tx_process(sc,&(sc->sbm_txdma), 0); if (isr & (M_MAC_INT_CHANNEL << S_MAC_RX_CH0)) { - if (netif_rx_schedule_prep(&sc->napi)) { + if (napi_schedule_prep(&sc->napi)) { __raw_writeq(0, sc->sbm_imr); - __netif_rx_schedule(&sc->napi); + __napi_schedule(&sc->napi); /* Depend on the exit from poll to reenable intr */ } else { @@ -2667,7 +2667,7 @@ static int sbmac_poll(struct napi_struct *napi, int budget) sbdma_tx_process(sc, &(sc->sbm_txdma), 1); if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); #ifdef CONFIG_SBMAC_COALESCE __raw_writeq(((M_MAC_INT_EOP_COUNT | M_MAC_INT_EOP_TIMER) << S_MAC_TX_CH0) | diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c index 7673fd92eaf5..77aca5d67b57 100644 --- a/drivers/net/sfc/efx.c +++ b/drivers/net/sfc/efx.c @@ -225,11 +225,11 @@ static int efx_poll(struct napi_struct *napi, int budget) if (rx_packets < budget) { /* There is no race here; although napi_disable() will - * only wait for netif_rx_complete(), this isn't a problem + * only wait for napi_complete(), this isn't a problem * since efx_channel_processed() will have no effect if * interrupts have already been disabled. 
*/ - netif_rx_complete(napi); + napi_complete(napi); efx_channel_processed(channel); } diff --git a/drivers/net/sfc/efx.h b/drivers/net/sfc/efx.h index 0dd7a532c78a..fb1ac0e63c0b 100644 --- a/drivers/net/sfc/efx.h +++ b/drivers/net/sfc/efx.h @@ -77,7 +77,7 @@ static inline void efx_schedule_channel(struct efx_channel *channel) channel->channel, raw_smp_processor_id()); channel->work_pending = true; - netif_rx_schedule(&channel->napi_str); + napi_schedule(&channel->napi_str); } #endif /* EFX_EFX_H */ diff --git a/drivers/net/skge.c b/drivers/net/skge.c index c9dbb06f8c94..952d37ffee51 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -3214,7 +3214,7 @@ static int skge_poll(struct napi_struct *napi, int to_do) unsigned long flags; spin_lock_irqsave(&hw->hw_lock, flags); - __netif_rx_complete(napi); + __napi_complete(napi); hw->intr_mask |= napimask[skge->port]; skge_write32(hw, B0_IMSK, hw->intr_mask); skge_read32(hw, B0_IMSK); @@ -3377,7 +3377,7 @@ static irqreturn_t skge_intr(int irq, void *dev_id) if (status & (IS_XA1_F|IS_R1_F)) { struct skge_port *skge = netdev_priv(hw->dev[0]); hw->intr_mask &= ~(IS_XA1_F|IS_R1_F); - netif_rx_schedule(&skge->napi); + napi_schedule(&skge->napi); } if (status & IS_PA_TO_TX1) @@ -3397,7 +3397,7 @@ static irqreturn_t skge_intr(int irq, void *dev_id) if (status & (IS_XA2_F|IS_R2_F)) { hw->intr_mask &= ~(IS_XA2_F|IS_R2_F); - netif_rx_schedule(&skge->napi); + napi_schedule(&skge->napi); } if (status & IS_PA_TO_RX2) { diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c index f513bdf1c887..d271ae39c6f3 100644 --- a/drivers/net/smsc911x.c +++ b/drivers/net/smsc911x.c @@ -984,7 +984,7 @@ static int smsc911x_poll(struct napi_struct *napi, int budget) /* We processed all packets available. Tell NAPI it can * stop polling then re-enable rx interrupts */ smsc911x_reg_write(pdata, INT_STS, INT_STS_RSFL_); - netif_rx_complete(napi); + napi_complete(napi); temp = smsc911x_reg_read(pdata, INT_EN); temp |= INT_EN_RSFL_EN_; smsc911x_reg_write(pdata, INT_EN, temp); @@ -1485,16 +1485,16 @@ static irqreturn_t smsc911x_irqhandler(int irq, void *dev_id) } if (likely(intsts & inten & INT_STS_RSFL_)) { - if (likely(netif_rx_schedule_prep(&pdata->napi))) { + if (likely(napi_schedule_prep(&pdata->napi))) { /* Disable Rx interrupts */ temp = smsc911x_reg_read(pdata, INT_EN); temp &= (~INT_EN_RSFL_EN_); smsc911x_reg_write(pdata, INT_EN, temp); /* Schedule a NAPI poll */ - __netif_rx_schedule(&pdata->napi); + __napi_schedule(&pdata->napi); } else { SMSC_WARNING(RX_ERR, - "netif_rx_schedule_prep failed"); + "napi_schedule_prep failed"); } serviced = IRQ_HANDLED; } diff --git a/drivers/net/smsc9420.c b/drivers/net/smsc9420.c index c14a4c6452c7..79f4c228b030 100644 --- a/drivers/net/smsc9420.c +++ b/drivers/net/smsc9420.c @@ -666,7 +666,7 @@ static irqreturn_t smsc9420_isr(int irq, void *dev_id) smsc9420_pci_flush_write(pd); ints_to_clear |= (DMAC_STS_RX_ | DMAC_STS_NIS_); - netif_rx_schedule(&pd->napi); + napi_schedule(&pd->napi); } if (ints_to_clear) @@ -889,7 +889,7 @@ static int smsc9420_rx_poll(struct napi_struct *napi, int budget) smsc9420_pci_flush_write(pd); if (work_done < budget) { - netif_rx_complete(&pd->napi); + napi_complete(&pd->napi); /* re-enable RX DMA interrupts */ dma_intr_ena = smsc9420_reg_read(pd, DMAC_INTR_ENA); diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c index 88d2c67788df..7f6b4a4052ee 100644 --- a/drivers/net/spider_net.c +++ b/drivers/net/spider_net.c @@ -1301,7 +1301,7 @@ static int spider_net_poll(struct napi_struct 
*napi, int budget) /* if all packets are in the stack, enable interrupts and return 0 */ /* if not, return 1 */ if (packets_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); spider_net_rx_irq_on(card); card->ignore_rx_ramfull = 0; } @@ -1528,7 +1528,7 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg, spider_net_refill_rx_chain(card); spider_net_enable_rxdmac(card); card->num_rx_ints ++; - netif_rx_schedule(&card->napi); + napi_schedule(&card->napi); } show_error = 0; break; @@ -1548,7 +1548,7 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg, spider_net_refill_rx_chain(card); spider_net_enable_rxdmac(card); card->num_rx_ints ++; - netif_rx_schedule(&card->napi); + napi_schedule(&card->napi); show_error = 0; break; @@ -1562,7 +1562,7 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg, spider_net_refill_rx_chain(card); spider_net_enable_rxdmac(card); card->num_rx_ints ++; - netif_rx_schedule(&card->napi); + napi_schedule(&card->napi); show_error = 0; break; @@ -1656,11 +1656,11 @@ spider_net_interrupt(int irq, void *ptr) if (status_reg & SPIDER_NET_RXINT ) { spider_net_rx_irq_off(card); - netif_rx_schedule(&card->napi); + napi_schedule(&card->napi); card->num_rx_ints ++; } if (status_reg & SPIDER_NET_TXINT) - netif_rx_schedule(&card->napi); + napi_schedule(&card->napi); if (status_reg & SPIDER_NET_LINKINT) spider_net_link_reset(netdev); diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c index da3a76b18eff..98fe79515bab 100644 --- a/drivers/net/starfire.c +++ b/drivers/net/starfire.c @@ -1342,8 +1342,8 @@ static irqreturn_t intr_handler(int irq, void *dev_instance) if (intr_status & (IntrRxDone | IntrRxEmpty)) { u32 enable; - if (likely(netif_rx_schedule_prep(&np->napi))) { - __netif_rx_schedule(&np->napi); + if (likely(napi_schedule_prep(&np->napi))) { + __napi_schedule(&np->napi); enable = readl(ioaddr + IntrEnable); enable &= ~(IntrRxDone | IntrRxEmpty); writel(enable, ioaddr + IntrEnable); @@ -1587,7 +1587,7 @@ static int netdev_poll(struct napi_struct *napi, int budget) intr_status = readl(ioaddr + IntrStatus); } while (intr_status & (IntrRxDone | IntrRxEmpty)); - netif_rx_complete(napi); + napi_complete(napi); intr_status = readl(ioaddr + IntrEnable); intr_status |= IntrRxDone | IntrRxEmpty; writel(intr_status, ioaddr + IntrEnable); diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index 86c765d83de1..4942059109f3 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -921,7 +921,7 @@ static int gem_poll(struct napi_struct *napi, int budget) gp->status = readl(gp->regs + GREG_STAT); } while (gp->status & GREG_STAT_NAPI); - __netif_rx_complete(napi); + __napi_complete(napi); gem_enable_ints(gp); spin_unlock_irqrestore(&gp->lock, flags); @@ -944,7 +944,7 @@ static irqreturn_t gem_interrupt(int irq, void *dev_id) spin_lock_irqsave(&gp->lock, flags); - if (netif_rx_schedule_prep(&gp->napi)) { + if (napi_schedule_prep(&gp->napi)) { u32 gem_status = readl(gp->regs + GREG_STAT); if (gem_status == 0) { @@ -954,7 +954,7 @@ static irqreturn_t gem_interrupt(int irq, void *dev_id) } gp->status = gem_status; gem_disable_ints(gp); - __netif_rx_schedule(&gp->napi); + __napi_schedule(&gp->napi); } spin_unlock_irqrestore(&gp->lock, flags); diff --git a/drivers/net/tc35815.c b/drivers/net/tc35815.c index bcd0e60cbda9..f42c67e93bf4 100644 --- a/drivers/net/tc35815.c +++ b/drivers/net/tc35815.c @@ -1609,8 +1609,8 @@ static irqreturn_t tc35815_interrupt(int irq, void *dev_id) if (!(dmactl 
& DMA_IntMask)) { /* disable interrupts */ tc_writel(dmactl | DMA_IntMask, &tr->DMA_Ctl); - if (netif_rx_schedule_prep(&lp->napi)) - __netif_rx_schedule(&lp->napi); + if (napi_schedule_prep(&lp->napi)) + __napi_schedule(&lp->napi); else { printk(KERN_ERR "%s: interrupt taken in poll\n", dev->name); @@ -1919,7 +1919,7 @@ static int tc35815_poll(struct napi_struct *napi, int budget) spin_unlock(&lp->lock); if (received < budget) { - netif_rx_complete(napi); + napi_complete(napi); /* enable interrupts */ tc_writel(tc_readl(&tr->DMA_Ctl) & ~DMA_IntMask, &tr->DMA_Ctl); } diff --git a/drivers/net/tehuti.c b/drivers/net/tehuti.c index a7a4dc4d6313..be9f38f8f0bf 100644 --- a/drivers/net/tehuti.c +++ b/drivers/net/tehuti.c @@ -265,8 +265,8 @@ static irqreturn_t bdx_isr_napi(int irq, void *dev) bdx_isr_extra(priv, isr); if (isr & (IR_RX_DESC_0 | IR_TX_FREE_0)) { - if (likely(netif_rx_schedule_prep(&priv->napi))) { - __netif_rx_schedule(&priv->napi); + if (likely(napi_schedule_prep(&priv->napi))) { + __napi_schedule(&priv->napi); RET(IRQ_HANDLED); } else { /* NOTE: we get here if intr has slipped into window @@ -302,7 +302,7 @@ static int bdx_poll(struct napi_struct *napi, int budget) * device lock and allow waiting tasks (eg rmmod) to advance) */ priv->napi_stop = 0; - netif_rx_complete(napi); + napi_complete(napi); bdx_enable_interrupts(priv); } return work_done; diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 8b3f84685387..5fa65acb68e5 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -4460,7 +4460,7 @@ static int tg3_poll(struct napi_struct *napi, int budget) sblk->status &= ~SD_STATUS_UPDATED; if (likely(!tg3_has_work(tp))) { - netif_rx_complete(napi); + napi_complete(napi); tg3_restart_ints(tp); break; } @@ -4470,7 +4470,7 @@ static int tg3_poll(struct napi_struct *napi, int budget) tx_recovery: /* work_done is guaranteed to be less than budget. */ - netif_rx_complete(napi); + napi_complete(napi); schedule_work(&tp->reset_task); return work_done; } @@ -4519,7 +4519,7 @@ static irqreturn_t tg3_msi_1shot(int irq, void *dev_id) prefetch(&tp->rx_rcb[tp->rx_rcb_ptr]); if (likely(!tg3_irq_sync(tp))) - netif_rx_schedule(&tp->napi); + napi_schedule(&tp->napi); return IRQ_HANDLED; } @@ -4544,7 +4544,7 @@ static irqreturn_t tg3_msi(int irq, void *dev_id) */ tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); if (likely(!tg3_irq_sync(tp))) - netif_rx_schedule(&tp->napi); + napi_schedule(&tp->napi); return IRQ_RETVAL(1); } @@ -4586,7 +4586,7 @@ static irqreturn_t tg3_interrupt(int irq, void *dev_id) sblk->status &= ~SD_STATUS_UPDATED; if (likely(tg3_has_work(tp))) { prefetch(&tp->rx_rcb[tp->rx_rcb_ptr]); - netif_rx_schedule(&tp->napi); + napi_schedule(&tp->napi); } else { /* No work, shared interrupt perhaps? re-enable * interrupts, and flush that PCI write @@ -4632,7 +4632,7 @@ static irqreturn_t tg3_interrupt_tagged(int irq, void *dev_id) tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); if (tg3_irq_sync(tp)) goto out; - if (netif_rx_schedule_prep(&tp->napi)) { + if (napi_schedule_prep(&tp->napi)) { prefetch(&tp->rx_rcb[tp->rx_rcb_ptr]); /* Update last_tag to mark that this status has been * seen. Because interrupt may be shared, we may be @@ -4640,7 +4640,7 @@ static irqreturn_t tg3_interrupt_tagged(int irq, void *dev_id) * if tg3_poll() is not scheduled. 
*/ tp->last_tag = sblk->status_tag; - __netif_rx_schedule(&tp->napi); + __napi_schedule(&tp->napi); } out: return IRQ_RETVAL(handled); diff --git a/drivers/net/tsi108_eth.c b/drivers/net/tsi108_eth.c index 75461dbd4876..1138782e5611 100644 --- a/drivers/net/tsi108_eth.c +++ b/drivers/net/tsi108_eth.c @@ -888,7 +888,7 @@ static int tsi108_poll(struct napi_struct *napi, int budget) if (num_received < budget) { data->rxpending = 0; - netif_rx_complete(napi); + napi_complete(napi); TSI_WRITE(TSI108_EC_INTMASK, TSI_READ(TSI108_EC_INTMASK) @@ -915,11 +915,11 @@ static void tsi108_rx_int(struct net_device *dev) * * This can happen if this code races with tsi108_poll(), which masks * the interrupts after tsi108_irq_one() read the mask, but before - * netif_rx_schedule is called. It could also happen due to calls + * napi_schedule is called. It could also happen due to calls * from tsi108_check_rxring(). */ - if (netif_rx_schedule_prep(&data->napi)) { + if (napi_schedule_prep(&data->napi)) { /* Mask, rather than ack, the receive interrupts. The ack * will happen in tsi108_poll(). */ @@ -930,7 +930,7 @@ static void tsi108_rx_int(struct net_device *dev) | TSI108_INT_RXTHRESH | TSI108_INT_RXOVERRUN | TSI108_INT_RXERROR | TSI108_INT_RXWAIT); - __netif_rx_schedule(&data->napi); + __napi_schedule(&data->napi); } else { if (!netif_running(dev)) { /* This can happen if an interrupt occurs while the diff --git a/drivers/net/tulip/interrupt.c b/drivers/net/tulip/interrupt.c index 6c3428a37c0b..9f946d421088 100644 --- a/drivers/net/tulip/interrupt.c +++ b/drivers/net/tulip/interrupt.c @@ -103,7 +103,7 @@ void oom_timer(unsigned long data) { struct net_device *dev = (struct net_device *)data; struct tulip_private *tp = netdev_priv(dev); - netif_rx_schedule(&tp->napi); + napi_schedule(&tp->napi); } int tulip_poll(struct napi_struct *napi, int budget) @@ -300,7 +300,7 @@ int tulip_poll(struct napi_struct *napi, int budget) /* Remove us from polling list and enable RX intr. */ - netif_rx_complete(napi); + napi_complete(napi); iowrite32(tulip_tbl[tp->chip_id].valid_intrs, tp->base_addr+CSR7); /* The last op happens after poll completion. Which means the following: @@ -333,10 +333,10 @@ int tulip_poll(struct napi_struct *napi, int budget) /* Think: timer_pending() was an explicit signature of bug. * Timer can be pending now but fired and completed - * before we did netif_rx_complete(). See? We would lose it. */ + * before we did napi_complete(). See? We would lose it. */ /* remove ourselves from the polling list */ - netif_rx_complete(napi); + napi_complete(napi); return work_done; } @@ -519,7 +519,7 @@ irqreturn_t tulip_interrupt(int irq, void *dev_instance) rxd++; /* Mask RX intrs and add the device to poll list. 
*/ iowrite32(tulip_tbl[tp->chip_id].valid_intrs&~RxPollInt, ioaddr + CSR7); - netif_rx_schedule(&tp->napi); + napi_schedule(&tp->napi); if (!(csr5&~(AbnormalIntr|NormalIntr|RxPollInt|TPLnkPass))) break; diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c index 3af9a9516ccb..dcff5ade6d08 100644 --- a/drivers/net/typhoon.c +++ b/drivers/net/typhoon.c @@ -1783,7 +1783,7 @@ typhoon_poll(struct napi_struct *napi, int budget) } if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); iowrite32(TYPHOON_INTR_NONE, tp->ioaddr + TYPHOON_REG_INTR_MASK); typhoon_post_pci_writes(tp->ioaddr); @@ -1806,10 +1806,10 @@ typhoon_interrupt(int irq, void *dev_instance) iowrite32(intr_status, ioaddr + TYPHOON_REG_INTR_STATUS); - if (netif_rx_schedule_prep(&tp->napi)) { + if (napi_schedule_prep(&tp->napi)) { iowrite32(TYPHOON_INTR_ALL, ioaddr + TYPHOON_REG_INTR_MASK); typhoon_post_pci_writes(ioaddr); - __netif_rx_schedule(&tp->napi); + __napi_schedule(&tp->napi); } else { printk(KERN_ERR "%s: Error, poll already scheduled\n", dev->name); diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c index 11441225bf41..6def6f826a54 100644 --- a/drivers/net/ucc_geth.c +++ b/drivers/net/ucc_geth.c @@ -3251,7 +3251,7 @@ static int ucc_geth_poll(struct napi_struct *napi, int budget) howmany += ucc_geth_rx(ugeth, i, budget - howmany); if (howmany < budget) { - netif_rx_complete(napi); + napi_complete(napi); setbits32(ugeth->uccf->p_uccm, UCCE_RX_EVENTS); } @@ -3282,10 +3282,10 @@ static irqreturn_t ucc_geth_irq_handler(int irq, void *info) /* check for receive events that require processing */ if (ucce & UCCE_RX_EVENTS) { - if (netif_rx_schedule_prep(&ugeth->napi)) { + if (napi_schedule_prep(&ugeth->napi)) { uccm &= ~UCCE_RX_EVENTS; out_be32(uccf->p_uccm, uccm); - __netif_rx_schedule(&ugeth->napi); + __napi_schedule(&ugeth->napi); } } diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c index 3b8e63254277..4671436ecf0e 100644 --- a/drivers/net/via-rhine.c +++ b/drivers/net/via-rhine.c @@ -589,7 +589,7 @@ static int rhine_napipoll(struct napi_struct *napi, int budget) work_done = rhine_rx(dev, budget); if (work_done < budget) { - netif_rx_complete(napi); + napi_complete(napi); iowrite16(IntrRxDone | IntrRxErr | IntrRxEmpty| IntrRxOverflow | IntrRxDropped | IntrRxNoBuf | IntrTxAborted | @@ -1319,7 +1319,7 @@ static irqreturn_t rhine_interrupt(int irq, void *dev_instance) IntrPCIErr | IntrStatsMax | IntrLinkChange, ioaddr + IntrEnable); - netif_rx_schedule(&rp->napi); + napi_schedule(&rp->napi); } if (intr_status & (IntrTxErrSummary | IntrTxDone)) { diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 43f6523c40be..30ae6d9a12af 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -374,9 +374,9 @@ static void skb_recv_done(struct virtqueue *rvq) { struct virtnet_info *vi = rvq->vdev->priv; /* Schedule NAPI, Suppress further interrupts if successful. */ - if (netif_rx_schedule_prep(&vi->napi)) { + if (napi_schedule_prep(&vi->napi)) { rvq->vq_ops->disable_cb(rvq); - __netif_rx_schedule(&vi->napi); + __napi_schedule(&vi->napi); } } @@ -402,11 +402,11 @@ again: /* Out of packets? 
*/ if (received < budget) { - netif_rx_complete(napi); + napi_complete(napi); if (unlikely(!vi->rvq->vq_ops->enable_cb(vi->rvq)) && napi_schedule_prep(napi)) { vi->rvq->vq_ops->disable_cb(vi->rvq); - __netif_rx_schedule(napi); + __napi_schedule(napi); goto again; } } @@ -580,9 +580,9 @@ static int virtnet_open(struct net_device *dev) * won't get another interrupt, so process any outstanding packets * now. virtnet_poll wants re-enable the queue, so we disable here. * We synchronize against interrupts via NAPI_STATE_SCHED */ - if (netif_rx_schedule_prep(&vi->napi)) { + if (napi_schedule_prep(&vi->napi)) { vi->rvq->vq_ops->disable_cb(vi->rvq); - __netif_rx_schedule(&vi->napi); + __napi_schedule(&vi->napi); } return 0; } diff --git a/drivers/net/wan/hd64572.c b/drivers/net/wan/hd64572.c index 08b3536944fe..497b003d7239 100644 --- a/drivers/net/wan/hd64572.c +++ b/drivers/net/wan/hd64572.c @@ -341,7 +341,7 @@ static int sca_poll(struct napi_struct *napi, int budget) received = sca_rx_done(port, budget); if (received < budget) { - netif_rx_complete(napi); + napi_complete(napi); enable_intr(port); } @@ -359,7 +359,7 @@ static irqreturn_t sca_intr(int irq, void *dev_id) if (port && (isr0 & (i ? 0x08002200 : 0x00080022))) { handled = 1; disable_intr(port); - netif_rx_schedule(&port->napi); + napi_schedule(&port->napi); } } diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c index 7e8bbba2cc1b..3bf7d3f447db 100644 --- a/drivers/net/wan/ixp4xx_hss.c +++ b/drivers/net/wan/ixp4xx_hss.c @@ -622,7 +622,7 @@ static void hss_hdlc_rx_irq(void *pdev) printk(KERN_DEBUG "%s: hss_hdlc_rx_irq\n", dev->name); #endif qmgr_disable_irq(queue_ids[port->id].rx); - netif_rx_schedule(&port->napi); + napi_schedule(&port->napi); } static int hss_hdlc_poll(struct napi_struct *napi, int budget) @@ -649,15 +649,15 @@ static int hss_hdlc_poll(struct napi_struct *napi, int budget) if ((n = queue_get_desc(rxq, port, 0)) < 0) { #if DEBUG_RX printk(KERN_DEBUG "%s: hss_hdlc_poll" - " netif_rx_complete\n", dev->name); + " napi_complete\n", dev->name); #endif - netif_rx_complete(napi); + napi_complete(napi); qmgr_enable_irq(rxq); if (!qmgr_stat_empty(rxq) && - netif_rx_reschedule(napi)) { + napi_reschedule(napi)) { #if DEBUG_RX printk(KERN_DEBUG "%s: hss_hdlc_poll" - " netif_rx_reschedule succeeded\n", + " napi_reschedule succeeded\n", dev->name); #endif qmgr_disable_irq(rxq); @@ -1069,7 +1069,7 @@ static int hss_hdlc_open(struct net_device *dev) hss_start_hdlc(port); /* we may already have RX data, enables IRQ */ - netif_rx_schedule(&port->napi); + napi_schedule(&port->napi); return 0; err_unlock: diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index cd6184ee08ee..9f102a6535c4 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -196,7 +196,7 @@ static void rx_refill_timeout(unsigned long data) { struct net_device *dev = (struct net_device *)data; struct netfront_info *np = netdev_priv(dev); - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); } static int netfront_tx_slot_available(struct netfront_info *np) @@ -328,7 +328,7 @@ static int xennet_open(struct net_device *dev) xennet_alloc_rx_buffers(dev); np->rx.sring->rsp_event = np->rx.rsp_cons + 1; if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); } spin_unlock_bh(&np->rx_lock); @@ -979,7 +979,7 @@ err: RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do); if (!more_to_do) - __netif_rx_complete(napi); + __napi_complete(napi); local_irq_restore(flags); } @@ 
-1317,7 +1317,7 @@ static irqreturn_t xennet_interrupt(int irq, void *dev_id) xennet_tx_buf_gc(dev); /* Under tx_lock: protects access to rx shared-ring indexes. */ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); } spin_unlock_irqrestore(&np->tx_lock, flags); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ec54785d34f9..dd8a35b3e8b2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1574,56 +1574,6 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) return (1 << debug_value) - 1; } -/* Test if receive needs to be scheduled but only if up */ -static inline int netif_rx_schedule_prep(struct napi_struct *napi) -{ - return napi_schedule_prep(napi); -} - -/* Add interface to tail of rx poll list. This assumes that _prep has - * already been called and returned 1. - */ -static inline void __netif_rx_schedule(struct napi_struct *napi) -{ - __napi_schedule(napi); -} - -/* Try to reschedule poll. Called by irq handler. */ - -static inline void netif_rx_schedule(struct napi_struct *napi) -{ - if (netif_rx_schedule_prep(napi)) - __netif_rx_schedule(napi); -} - -/* Try to reschedule poll. Called by dev->poll() after netif_rx_complete(). */ -static inline int netif_rx_reschedule(struct napi_struct *napi) -{ - if (napi_schedule_prep(napi)) { - __netif_rx_schedule(napi); - return 1; - } - return 0; -} - -/* same as netif_rx_complete, except that local_irq_save(flags) - * has already been issued - */ -static inline void __netif_rx_complete(struct napi_struct *napi) -{ - __napi_complete(napi); -} - -/* Remove interface from poll list: it must be in the poll list - * on current cpu. This primitive is called by dev->poll(), when - * it completes the work. The device cannot be out of poll list at this - * moment, it is BUG(). - */ -static inline void netif_rx_complete(struct napi_struct *napi) -{ - napi_complete(napi); -} - static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); -- cgit v1.2.3-71-gd317 From f90f92eed74251034f251e3cdf4fa5c4c1f09df0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Fri, 16 Jan 2009 23:36:30 +0000 Subject: dccp: Initialisation framework for feature negotiation This initialises feature negotiation from two tables, which are in turn are initialised from sysctls. As a novel feature, specifics of the implementation (e.g. that short seqnos and ECN are not yet available) are advertised for robustness. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- include/linux/dccp.h | 19 --------------- net/dccp/feat.c | 65 ++++++++++++++++++++++++++++++++++++++++++++-------- net/dccp/feat.h | 2 +- 3 files changed, 57 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 61734e27abb7..990e97fa1f07 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -369,28 +369,9 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. 
* * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_pending - List of features being negotiated - * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; - struct list_head dccpms_pending; - struct list_head dccpms_conf; -}; - -struct dccp_opt_conf { - __u8 *dccpoc_val; - __u8 dccpoc_len; -}; - -struct dccp_opt_pend { - struct list_head dccpop_node; - __u8 dccpop_type; - __u8 dccpop_feat; - __u8 *dccpop_val; - __u8 dccpop_len; - int dccpop_conf; - struct dccp_opt_conf *dccpop_sc; }; extern void dccp_minisock_init(struct dccp_minisock *dmsk); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 4152308958ab..67ffac9905f8 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1115,23 +1115,70 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, return 0; /* ignore FN options in all other states */ } +/** + * dccp_feat_init - Seed feature negotiation with host-specific defaults + * This initialises global defaults, depending on the value of the sysctls. + * These can later be overridden by registering changes via setsockopt calls. + * The last link in the chain is finalise_settings, to make sure that between + * here and the start of actual feature negotiation no inconsistencies enter. + * + * All features not appearing below use either defaults or are otherwise + * later adjusted through dccp_feat_finalise_settings(). + */ int dccp_feat_init(struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; + u8 on = 1, off = 0; int rc; + struct { + u8 *val; + u8 len; + } tx, rx; + + /* Non-negotiable (NN) features */ + rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, + sysctl_dccp_feat_sequence_window); + if (rc) + return rc; + + /* Server-priority (SP) features */ + + /* Advertise that short seqnos are not supported (7.6.1) */ + rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); + if (rc) + return rc; - INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ - INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ + /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ + rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); + if (rc) + return rc; + + /* + * We advertise the available list of CCIDs and reorder according to + * preferences, to avoid failure resulting from negotiating different + * singleton values (which always leads to failure). + * These settings can still (later) be overridden via sockopts. 
+ */ + if (ccid_get_builtin_ccids(&tx.val, &tx.len) || + ccid_get_builtin_ccids(&rx.val, &rx.len)) + return -ENOBUFS; - /* Ack ratio */ - rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, - dp->dccps_l_ack_ratio); + if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) || + !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len)) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); + if (rc) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); + +free_ccid_lists: + kfree(tx.val); + kfree(rx.val); return rc; } -EXPORT_SYMBOL_GPL(dccp_feat_init); - int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) { struct dccp_sock *dp = dccp_sk(sk); diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 9b46e2a7866e..5e7b8481cd04 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -113,13 +113,13 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) #define dccp_feat_debug(type, feat, val) #endif /* CONFIG_IP_DCCP_DEBUG */ +extern int dccp_feat_init(struct sock *sk); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, u8 mand, u8 opt, u8 feat, u8 *val, u8 len); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); -extern int dccp_feat_init(struct sock *sk); /* * Encoding variable-length options and their maximum length. -- cgit v1.2.3-71-gd317 From 792b48780e8b6435d017cef4b5c304876a48653e Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Fri, 16 Jan 2009 23:36:31 +0000 Subject: dccp: Implement both feature-local and feature-remote Sequence Window feature This adds full support for local/remote Sequence Window feature, from which the * sequence-number-validity (W) and * acknowledgment-number-validity (W') windows derive as specified in RFC 4340, 7.5.3. Specifically, the following is contained in this patch: * integrated new socket fields into dccp_sk; * updated the update_gsr/gss routines with regard to these fields; * updated handler code: the Sequence Window feature is located at the TX side, so the local feature is meant if the handler-rx flag is false; * the initialisation of `rcv_wnd' in reqsk is removed, since - rcv_wnd is not used by the code anywhere; - sequence number checks are not done in the LISTEN state (cf. 7.5.3); - dccp_check_req checks the Ack number validity more rigorously; * the `struct dccp_minisock' became empty and is now removed. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- Documentation/networking/dccp.txt | 3 ++- include/linux/dccp.h | 24 ++++-------------------- net/dccp/dccp.h | 16 +++++++--------- net/dccp/feat.c | 13 +++++++++++-- net/dccp/minisocks.c | 11 ----------- net/dccp/proto.c | 2 -- 6 files changed, 24 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 7a3bb1abb830..b132e4a3cf0f 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -141,7 +141,8 @@ rx_ccid = 2 Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 - The initial sequence window (sec. 7.5.2). + The initial sequence window (sec. 7.5.2) of the sender. 
This influences + the local ackno validity and the remote seqno validity windows (7.5.1). tx_qlen = 5 The size of the transmit buffer in packets. A value of 0 corresponds diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 990e97fa1f07..7a0502ab383a 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -363,19 +363,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 -/** - * struct dccp_minisock - Minimal DCCP connection representation - * - * Will be used to pass the state from dccp_request_sock to dccp_sock. - * - * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - */ -struct dccp_minisock { - __u64 dccpms_sequence_window; -}; - -extern void dccp_minisock_init(struct dccp_minisock *dmsk); - /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from @@ -464,13 +451,14 @@ struct dccp_ackvec; * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio + * @dccps_l_seq_win - local Sequence Window (influences ack number validity) + * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_minisock - associated minisock (accessed via dccp_msk) * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) @@ -504,12 +492,13 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; + __u64 dccps_l_seq_win:48; + __u64 dccps_r_seq_win:48; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; - struct dccp_minisock dccps_minisock; struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; @@ -527,11 +516,6 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) return (struct dccp_sock *)sk; } -static inline struct dccp_minisock *dccp_msk(const struct sock *sk) -{ - return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; -} - static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index f2230fc168e1..04ae91898a68 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -409,23 +409,21 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_minisock *dmsk = dccp_msk(sk); dp->dccps_gsr = seq; - dccp_set_seqno(&dp->dccps_swl, - dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); - dccp_set_seqno(&dp->dccps_swh, - dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); + /* Sequence validity window depends on remote Sequence Window (7.5.1) */ + dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), 
dp->dccps_r_seq_win / 4); + dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - dp->dccps_awh = dp->dccps_gss = seq; - dccp_set_seqno(&dp->dccps_awl, - (dp->dccps_gss - - dccp_msk(sk)->dccpms_sequence_window + 1)); + dp->dccps_gss = seq; + /* Ack validity window depends on local Sequence Window value (7.5.1) */ + dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + dp->dccps_awh = dp->dccps_gss; } static inline int dccp_ack_pending(const struct sock *sk) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 67ffac9905f8..7303f79705d2 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -51,8 +51,17 @@ static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) { - if (!rx) - dccp_msk(sk)->dccpms_sequence_window = seq_win; + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) { + dp->dccps_r_seq_win = seq_win; + /* propagate changes to update SWL/SWH */ + dccp_update_gsr(sk, dp->dccps_gsr); + } else { + dp->dccps_l_seq_win = seq_win; + /* propagate changes to update AWL */ + dccp_update_gss(sk, dp->dccps_gss); + } return 0; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 6821ae33dd37..5ca49cec95f5 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -42,11 +42,6 @@ struct inet_timewait_death_row dccp_death_row = { EXPORT_SYMBOL_GPL(dccp_death_row); -void dccp_minisock_init(struct dccp_minisock *dmsk) -{ - dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; -} - void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -110,7 +105,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); - struct dccp_minisock *newdmsk = dccp_msk(newsk); newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; @@ -128,10 +122,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * Initialize S.GAR := S.ISS * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies */ - - /* See dccp_v4_conn_request */ - newdmsk->dccpms_sequence_window = req->rcv_wnd; - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; dccp_update_gss(newsk, dreq->dreq_iss); @@ -290,7 +280,6 @@ int dccp_reqsk_init(struct request_sock *req, inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; inet_rsk(req)->loc_port = dccp_hdr(skb)->dccph_dport; inet_rsk(req)->acked = 0; - req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; /* inherit feature negotiation options from listening socket */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 945b4d5d23b3..314a1b5c033c 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -174,8 +174,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - dccp_minisock_init(&dp->dccps_minisock); - icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; -- cgit v1.2.3-71-gd317 From 883ca833e5fb814fb03426c9d35e5489ce43e8da Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Fri, 16 Jan 2009 23:36:32 +0000 Subject: dccp: Initialisation and type-checking of feature sysctls This patch takes care of initialising and type-checking sysctls related to feature 
negotiation. Type checking is important since some of the sysctls now directly impact the feature-negotiation process. The sysctls are initialised with the known default values for each feature. For the type-checking the value constraints from RFC 4340 are used: * Sequence Window uses the specified Wmin=32, the maximum is ulong (4 bytes), tested and confirmed that it works up to 4294967295 - for Gbps speed; * Ack Ratio is between 0 .. 0xffff (2-byte unsigned integer); * CCIDs are between 0 .. 255; * request_retries, retries1, retries2 also between 0..255 for good measure; * tx_qlen is checked to be non-negative; * sync_ratelimit remains as before. Notes: ------ 1. Die s@sysctl_dccp_feat@sysctl_dccp@g since the sysctls are now in feat.c. 2. As pointed out by Arnaldo, the pattern of type-checking repeats itself in other places, sometimes with exactly the same kind of definitions (e.g. "static int zero;"). It may be a good idea (kernel janitors?) to consolidate type checking. For the sake of keeping the changeset small and in order not to affect other subsystems, I have not strived to generalise here. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- include/linux/dccp.h | 8 -------- net/dccp/dccp.h | 3 --- net/dccp/feat.c | 11 ++++++++--- net/dccp/feat.h | 8 ++++++++ net/dccp/options.c | 4 ---- net/dccp/sysctl.c | 43 ++++++++++++++++++++++++++++++------------- 6 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 7a0502ab383a..7434a8353e23 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -355,14 +355,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) return __dccp_hdr_len(dccp_hdr(skb)); } - -/* initial values for each feature */ -#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 -#define DCCPF_INITIAL_ACK_RATIO 2 -#define DCCPF_INITIAL_CCID DCCPC_CCID2 -/* FIXME: for now we're default to 1 but it should really be 0 */ -#define DCCPF_INITIAL_SEND_NDP_COUNT 1 - /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 04ae91898a68..44a5bc6f6785 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -95,9 +95,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; extern int sysctl_dccp_retries2; -extern int sysctl_dccp_feat_sequence_window; -extern int sysctl_dccp_feat_rx_ccid; -extern int sysctl_dccp_feat_tx_ccid; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 7303f79705d2..12006e9b2472 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -25,6 +25,11 @@ #include "ccid.h" #include "feat.h" +/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ +unsigned long sysctl_dccp_sequence_window __read_mostly = 100; +int sysctl_dccp_rx_ccid __read_mostly = 2, + sysctl_dccp_tx_ccid __read_mostly = 2; + /* * Feature activation handlers. 
* @@ -1146,7 +1151,7 @@ int dccp_feat_init(struct sock *sk) /* Non-negotiable (NN) features */ rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_feat_sequence_window); + sysctl_dccp_sequence_window); if (rc) return rc; @@ -1172,8 +1177,8 @@ int dccp_feat_init(struct sock *sk) ccid_get_builtin_ccids(&rx.val, &rx.len)) return -ENOBUFS; - if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len)) + if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || + !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) goto free_ccid_lists; rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 5e7b8481cd04..40aa7a10bd5f 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -100,6 +100,13 @@ struct ccid_dependency { u8 val; }; +/* + * Sysctls to seed defaults for feature negotiation + */ +extern unsigned long sysctl_dccp_sequence_window; +extern int sysctl_dccp_rx_ccid; +extern int sysctl_dccp_tx_ccid; + #ifdef CONFIG_IP_DCCP_DEBUG extern const char *dccp_feat_typename(const u8 type); extern const char *dccp_feat_name(const u8 feat); @@ -114,6 +121,7 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) #endif /* CONFIG_IP_DCCP_DEBUG */ extern int dccp_feat_init(struct sock *sk); +extern void dccp_feat_initialise_sysctls(void); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); diff --git a/net/dccp/options.c b/net/dccp/options.c index 7b1165c21f51..3e2726c7182d 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -23,10 +23,6 @@ #include "dccp.h" #include "feat.h" -int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; -int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; - u64 dccp_decode_value_var(const u8 *bf, const u8 len) { u64 value = 0; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 018e210875e1..a5a1856234e7 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -18,55 +18,72 @@ #error This file should not be compiled without CONFIG_SYSCTL defined #endif +/* Boundary values */ +static int zero = 0, + u8_max = 0xFF; +static unsigned long seqw_min = 32; + static struct ctl_table dccp_default_table[] = { { .procname = "seq_window", - .data = &sysctl_dccp_feat_sequence_window, - .maxlen = sizeof(sysctl_dccp_feat_sequence_window), + .data = &sysctl_dccp_sequence_window, + .maxlen = sizeof(sysctl_dccp_sequence_window), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ }, { .procname = "rx_ccid", - .data = &sysctl_dccp_feat_rx_ccid, - .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), + .data = &sysctl_dccp_rx_ccid, + .maxlen = sizeof(sysctl_dccp_rx_ccid), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. */ }, { .procname = "tx_ccid", - .data = &sysctl_dccp_feat_tx_ccid, - .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), + .data = &sysctl_dccp_tx_ccid, + .maxlen = sizeof(sysctl_dccp_tx_ccid), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. 
*/ }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "retries1", .data = &sysctl_dccp_retries1, .maxlen = sizeof(sysctl_dccp_retries1), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "retries2", .data = &sysctl_dccp_retries2, .maxlen = sizeof(sysctl_dccp_retries2), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "tx_qlen", .data = &sysctl_dccp_tx_qlen, .maxlen = sizeof(sysctl_dccp_tx_qlen), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "sync_ratelimit", -- cgit v1.2.3-71-gd317 From 9f4d26d0f3016cf8813977d624751b94465fa317 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 19 Jan 2009 17:09:49 -0800 Subject: virtio_net: add link status handling Allow the host to inform us that the link is down by adding a VIRTIO_NET_F_STATUS which indicates that device status is available in virtio_net config. This is currently useful for simulating link down conditions (e.g. using proposed qemu 'set_link' monitor command) but would also be needed if we were to support device assignment via virtio. Signed-off-by: Mark McLoughlin Signed-off-by: Rusty Russell (added future masking) Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 43 ++++++++++++++++++++++++++++++++++++++++++- include/linux/virtio_net.h | 5 +++++ 2 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 30ae6d9a12af..9b33d6ebf542 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -42,6 +42,7 @@ struct virtnet_info struct virtqueue *rvq, *svq; struct net_device *dev; struct napi_struct napi; + unsigned int status; /* The skb we couldn't send because buffers were full. 
*/ struct sk_buff *last_xmit_skb; @@ -611,6 +612,7 @@ static struct ethtool_ops virtnet_ethtool_ops = { .set_tx_csum = virtnet_set_tx_csum, .set_sg = ethtool_op_set_sg, .set_tso = ethtool_op_set_tso, + .get_link = ethtool_op_get_link, }; #define MIN_MTU 68 @@ -636,6 +638,41 @@ static const struct net_device_ops virtnet_netdev = { #endif }; +static void virtnet_update_status(struct virtnet_info *vi) +{ + u16 v; + + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) + return; + + vi->vdev->config->get(vi->vdev, + offsetof(struct virtio_net_config, status), + &v, sizeof(v)); + + /* Ignore unknown (future) status bits */ + v &= VIRTIO_NET_S_LINK_UP; + + if (vi->status == v) + return; + + vi->status = v; + + if (vi->status & VIRTIO_NET_S_LINK_UP) { + netif_carrier_on(vi->dev); + netif_wake_queue(vi->dev); + } else { + netif_carrier_off(vi->dev); + netif_stop_queue(vi->dev); + } +} + +static void virtnet_config_changed(struct virtio_device *vdev) +{ + struct virtnet_info *vi = vdev->priv; + + virtnet_update_status(vi); +} + static int virtnet_probe(struct virtio_device *vdev) { int err; @@ -738,6 +775,9 @@ static int virtnet_probe(struct virtio_device *vdev) goto unregister; } + vi->status = VIRTIO_NET_S_LINK_UP; + virtnet_update_status(vi); + pr_debug("virtnet: registered device %s\n", dev->name); return 0; @@ -793,7 +833,7 @@ static unsigned int features[] = { VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_ECN, /* We don't yet handle UFO input. */ - VIRTIO_NET_F_MRG_RXBUF, + VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_F_NOTIFY_ON_EMPTY, }; @@ -805,6 +845,7 @@ static struct virtio_driver virtio_net = { .id_table = id_table, .probe = virtnet_probe, .remove = __devexit_p(virtnet_remove), + .config_changed = virtnet_config_changed, }; static int __init init(void) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 5cdd0aa8bde9..f76bd4a753ef 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -21,11 +21,16 @@ #define VIRTIO_NET_F_HOST_ECN 13 /* Host can handle TSO[6] w/ ECN in. */ #define VIRTIO_NET_F_HOST_UFO 14 /* Host can handle UFO in. */ #define VIRTIO_NET_F_MRG_RXBUF 15 /* Host can merge receive buffers. */ +#define VIRTIO_NET_F_STATUS 16 /* virtio_net_config.status available */ + +#define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ __u8 mac[6]; + /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ + __u16 status; } __attribute__((packed)); /* This is the first element of the scatter-gather list. If you don't -- cgit v1.2.3-71-gd317 From 273ec51dd7ceaa76e038875d85061ec856d8905e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 21 Jan 2009 15:55:35 -0800 Subject: net: ppp_generic - introduce net-namespace functionality v2 - Each namespace contains ppp channels and units separately with appropriate locks Signed-off-by: Cyrill Gorcunov Signed-off-by: David S. 
Miller --- drivers/net/ppp_generic.c | 275 +++++++++++++++++++++++++++++++------------- include/linux/ppp_channel.h | 4 + 2 files changed, 202 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 7b2728b8f1b7..4405a76ed3da 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -49,6 +49,10 @@ #include #include +#include +#include +#include + #define PPP_VERSION "2.4.2" /* @@ -131,6 +135,7 @@ struct ppp { struct sock_filter *active_filter;/* filter for pkts to reset idle */ unsigned pass_len, active_len; #endif /* CONFIG_PPP_FILTER */ + struct net *ppp_net; /* the net we belong to */ }; /* @@ -155,6 +160,7 @@ struct channel { struct rw_semaphore chan_sem; /* protects `chan' during chan ioctl */ spinlock_t downl; /* protects `chan', file.xq dequeue */ struct ppp *ppp; /* ppp unit we're connected to */ + struct net *chan_net; /* the net channel belongs to */ struct list_head clist; /* link in list of channels per unit */ rwlock_t upl; /* protects `ppp' */ #ifdef CONFIG_PPP_MULTILINK @@ -173,26 +179,35 @@ struct channel { * channel.downl. */ -/* - * all_ppp_mutex protects the all_ppp_units mapping. - * It also ensures that finding a ppp unit in the all_ppp_units map - * and updating its file.refcnt field is atomic. - */ -static DEFINE_MUTEX(all_ppp_mutex); static atomic_t ppp_unit_count = ATOMIC_INIT(0); -static DEFINE_IDR(ppp_units_idr); - -/* - * all_channels_lock protects all_channels and last_channel_index, - * and the atomicity of find a channel and updating its file.refcnt - * field. - */ -static DEFINE_SPINLOCK(all_channels_lock); -static LIST_HEAD(all_channels); -static LIST_HEAD(new_channels); -static int last_channel_index; static atomic_t channel_count = ATOMIC_INIT(0); +/* per-net private data for this module */ +static unsigned int ppp_net_id; +struct ppp_net { + /* units to ppp mapping */ + struct idr units_idr; + + /* + * all_ppp_mutex protects the units_idr mapping. + * It also ensures that finding a ppp unit in the units_idr + * map and updating its file.refcnt field is atomic. + */ + struct mutex all_ppp_mutex; + + /* channels */ + struct list_head all_channels; + struct list_head new_channels; + int last_channel_index; + + /* + * all_channels_lock protects all_channels and + * last_channel_index, and the atomicity of find + * a channel and updating its file.refcnt field. + */ + spinlock_t all_channels_lock; +}; + /* Get the PPP protocol number from a skb */ #define PPP_PROTO(skb) (((skb)->data[0] << 8) + (skb)->data[1]) @@ -216,8 +231,8 @@ static atomic_t channel_count = ATOMIC_INIT(0); #define seq_after(a, b) ((s32)((a) - (b)) > 0) /* Prototypes. 
*/ -static int ppp_unattached_ioctl(struct ppp_file *pf, struct file *file, - unsigned int cmd, unsigned long arg); +static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf, + struct file *file, unsigned int cmd, unsigned long arg); static void ppp_xmit_process(struct ppp *ppp); static void ppp_send_frame(struct ppp *ppp, struct sk_buff *skb); static void ppp_push(struct ppp *ppp); @@ -240,12 +255,12 @@ static void ppp_ccp_peek(struct ppp *ppp, struct sk_buff *skb, int inbound); static void ppp_ccp_closed(struct ppp *ppp); static struct compressor *find_compressor(int type); static void ppp_get_stats(struct ppp *ppp, struct ppp_stats *st); -static struct ppp *ppp_create_interface(int unit, int *retp); +static struct ppp *ppp_create_interface(struct net *net, int unit, int *retp); static void init_ppp_file(struct ppp_file *pf, int kind); static void ppp_shutdown_interface(struct ppp *ppp); static void ppp_destroy_interface(struct ppp *ppp); -static struct ppp *ppp_find_unit(int unit); -static struct channel *ppp_find_channel(int unit); +static struct ppp *ppp_find_unit(struct ppp_net *pn, int unit); +static struct channel *ppp_find_channel(struct ppp_net *pn, int unit); static int ppp_connect_channel(struct channel *pch, int unit); static int ppp_disconnect_channel(struct channel *pch); static void ppp_destroy_channel(struct channel *pch); @@ -256,6 +271,14 @@ static void *unit_find(struct idr *p, int n); static struct class *ppp_class; +/* per net-namespace data */ +static inline struct ppp_net *ppp_pernet(struct net *net) +{ + BUG_ON(!net); + + return net_generic(net, ppp_net_id); +} + /* Translates a PPP protocol number to a NP index (NP == network protocol) */ static inline int proto_to_npindex(int proto) { @@ -544,7 +567,8 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) int __user *p = argp; if (!pf) - return ppp_unattached_ioctl(pf, file, cmd, arg); + return ppp_unattached_ioctl(current->nsproxy->net_ns, + pf, file, cmd, arg); if (cmd == PPPIOCDETACH) { /* @@ -763,12 +787,13 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -static int ppp_unattached_ioctl(struct ppp_file *pf, struct file *file, - unsigned int cmd, unsigned long arg) +static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf, + struct file *file, unsigned int cmd, unsigned long arg) { int unit, err = -EFAULT; struct ppp *ppp; struct channel *chan; + struct ppp_net *pn; int __user *p = (int __user *)arg; lock_kernel(); @@ -777,7 +802,7 @@ static int ppp_unattached_ioctl(struct ppp_file *pf, struct file *file, /* Create a new ppp unit */ if (get_user(unit, p)) break; - ppp = ppp_create_interface(unit, &err); + ppp = ppp_create_interface(net, unit, &err); if (!ppp) break; file->private_data = &ppp->file; @@ -792,29 +817,31 @@ static int ppp_unattached_ioctl(struct ppp_file *pf, struct file *file, /* Attach to an existing ppp unit */ if (get_user(unit, p)) break; - mutex_lock(&all_ppp_mutex); err = -ENXIO; - ppp = ppp_find_unit(unit); + pn = ppp_pernet(net); + mutex_lock(&pn->all_ppp_mutex); + ppp = ppp_find_unit(pn, unit); if (ppp) { atomic_inc(&ppp->file.refcnt); file->private_data = &ppp->file; err = 0; } - mutex_unlock(&all_ppp_mutex); + mutex_unlock(&pn->all_ppp_mutex); break; case PPPIOCATTCHAN: if (get_user(unit, p)) break; - spin_lock_bh(&all_channels_lock); err = -ENXIO; - chan = ppp_find_channel(unit); + pn = ppp_pernet(net); + spin_lock_bh(&pn->all_channels_lock); + chan = ppp_find_channel(pn, unit); if (chan) 
{ atomic_inc(&chan->file.refcnt); file->private_data = &chan->file; err = 0; } - spin_unlock_bh(&all_channels_lock); + spin_unlock_bh(&pn->all_channels_lock); break; default: @@ -834,6 +861,51 @@ static const struct file_operations ppp_device_fops = { .release = ppp_release }; +static __net_init int ppp_init_net(struct net *net) +{ + struct ppp_net *pn; + int err; + + pn = kzalloc(sizeof(*pn), GFP_KERNEL); + if (!pn) + return -ENOMEM; + + idr_init(&pn->units_idr); + mutex_init(&pn->all_ppp_mutex); + + INIT_LIST_HEAD(&pn->all_channels); + INIT_LIST_HEAD(&pn->new_channels); + + spin_lock_init(&pn->all_channels_lock); + + err = net_assign_generic(net, ppp_net_id, pn); + if (err) { + kfree(pn); + return err; + } + + return 0; +} + +static __net_exit void ppp_exit_net(struct net *net) +{ + struct ppp_net *pn; + + pn = net_generic(net, ppp_net_id); + idr_destroy(&pn->units_idr); + /* + * if someone has cached our net then + * further net_generic call will return NULL + */ + net_assign_generic(net, ppp_net_id, NULL); + kfree(pn); +} + +static __net_initdata struct pernet_operations ppp_net_ops = { + .init = ppp_init_net, + .exit = ppp_exit_net, +}; + #define PPP_MAJOR 108 /* Called at boot time if ppp is compiled into the kernel, @@ -843,25 +915,36 @@ static int __init ppp_init(void) int err; printk(KERN_INFO "PPP generic driver version " PPP_VERSION "\n"); - err = register_chrdev(PPP_MAJOR, "ppp", &ppp_device_fops); - if (!err) { - ppp_class = class_create(THIS_MODULE, "ppp"); - if (IS_ERR(ppp_class)) { - err = PTR_ERR(ppp_class); - goto out_chrdev; - } - device_create(ppp_class, NULL, MKDEV(PPP_MAJOR, 0), NULL, - "ppp"); + + err = register_pernet_gen_device(&ppp_net_id, &ppp_net_ops); + if (err) { + printk(KERN_ERR "failed to register PPP pernet device (%d)\n", err); + goto out; } -out: - if (err) + err = register_chrdev(PPP_MAJOR, "ppp", &ppp_device_fops); + if (err) { printk(KERN_ERR "failed to register PPP device (%d)\n", err); - return err; + goto out_net; + } + + ppp_class = class_create(THIS_MODULE, "ppp"); + if (IS_ERR(ppp_class)) { + err = PTR_ERR(ppp_class); + goto out_chrdev; + } + + /* not a big deal if we fail here :-) */ + device_create(ppp_class, NULL, MKDEV(PPP_MAJOR, 0), NULL, "ppp"); + + return 0; out_chrdev: unregister_chrdev(PPP_MAJOR, "ppp"); - goto out; +out_net: + unregister_pernet_gen_device(ppp_net_id, &ppp_net_ops); +out: + return err; } /* @@ -969,6 +1052,7 @@ static void ppp_setup(struct net_device *dev) dev->tx_queue_len = 3; dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; + dev->features |= NETIF_F_NETNS_LOCAL; } /* @@ -1986,19 +2070,27 @@ ppp_mp_reconstruct(struct ppp *ppp) * Channel interface. */ -/* - * Create a new, unattached ppp channel. - */ -int -ppp_register_channel(struct ppp_channel *chan) +/* Create a new, unattached ppp channel. */ +int ppp_register_channel(struct ppp_channel *chan) +{ + return ppp_register_net_channel(current->nsproxy->net_ns, chan); +} + +/* Create a new, unattached ppp channel for specified net. 
*/ +int ppp_register_net_channel(struct net *net, struct ppp_channel *chan) { struct channel *pch; + struct ppp_net *pn; pch = kzalloc(sizeof(struct channel), GFP_KERNEL); if (!pch) return -ENOMEM; + + pn = ppp_pernet(net); + pch->ppp = NULL; pch->chan = chan; + pch->chan_net = net; chan->ppp = pch; init_ppp_file(&pch->file, CHANNEL); pch->file.hdrlen = chan->hdrlen; @@ -2008,11 +2100,13 @@ ppp_register_channel(struct ppp_channel *chan) init_rwsem(&pch->chan_sem); spin_lock_init(&pch->downl); rwlock_init(&pch->upl); - spin_lock_bh(&all_channels_lock); - pch->file.index = ++last_channel_index; - list_add(&pch->list, &new_channels); + + spin_lock_bh(&pn->all_channels_lock); + pch->file.index = ++pn->last_channel_index; + list_add(&pch->list, &pn->new_channels); atomic_inc(&channel_count); - spin_unlock_bh(&all_channels_lock); + spin_unlock_bh(&pn->all_channels_lock); + return 0; } @@ -2053,9 +2147,11 @@ void ppp_unregister_channel(struct ppp_channel *chan) { struct channel *pch = chan->ppp; + struct ppp_net *pn; if (!pch) return; /* should never happen */ + chan->ppp = NULL; /* @@ -2068,9 +2164,12 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_unlock_bh(&pch->downl); up_write(&pch->chan_sem); ppp_disconnect_channel(pch); - spin_lock_bh(&all_channels_lock); + + pn = ppp_pernet(pch->chan_net); + spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); - spin_unlock_bh(&all_channels_lock); + spin_unlock_bh(&pn->all_channels_lock); + pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); if (atomic_dec_and_test(&pch->file.refcnt)) @@ -2395,9 +2494,10 @@ ppp_get_stats(struct ppp *ppp, struct ppp_stats *st) * unit == -1 means allocate a new number. */ static struct ppp * -ppp_create_interface(int unit, int *retp) +ppp_create_interface(struct net *net, int unit, int *retp) { struct ppp *ppp; + struct ppp_net *pn; struct net_device *dev = NULL; int ret = -ENOMEM; int i; @@ -2406,6 +2506,8 @@ ppp_create_interface(int unit, int *retp) if (!dev) goto out1; + pn = ppp_pernet(net); + ppp = netdev_priv(dev); ppp->dev = dev; ppp->mru = PPP_MRU; @@ -2421,17 +2523,23 @@ ppp_create_interface(int unit, int *retp) skb_queue_head_init(&ppp->mrq); #endif /* CONFIG_PPP_MULTILINK */ + /* + * drum roll: don't forget to set + * the net device is belong to + */ + dev_net_set(dev, net); + ret = -EEXIST; - mutex_lock(&all_ppp_mutex); + mutex_lock(&pn->all_ppp_mutex); if (unit < 0) { - unit = unit_get(&ppp_units_idr, ppp); + unit = unit_get(&pn->units_idr, ppp); if (unit < 0) { *retp = unit; goto out2; } } else { - if (unit_find(&ppp_units_idr, unit)) + if (unit_find(&pn->units_idr, unit)) goto out2; /* unit already exists */ /* * if caller need a specified unit number @@ -2442,7 +2550,7 @@ ppp_create_interface(int unit, int *retp) * fair but at least pppd will ask us to allocate * new unit in this case so user is happy :) */ - unit = unit_set(&ppp_units_idr, ppp, unit); + unit = unit_set(&pn->units_idr, ppp, unit); if (unit < 0) goto out2; } @@ -2453,20 +2561,22 @@ ppp_create_interface(int unit, int *retp) ret = register_netdev(dev); if (ret != 0) { - unit_put(&ppp_units_idr, unit); + unit_put(&pn->units_idr, unit); printk(KERN_ERR "PPP: couldn't register device %s (%d)\n", dev->name, ret); goto out2; } + ppp->ppp_net = net; + atomic_inc(&ppp_unit_count); - mutex_unlock(&all_ppp_mutex); + mutex_unlock(&pn->all_ppp_mutex); *retp = 0; return ppp; out2: - mutex_unlock(&all_ppp_mutex); + mutex_unlock(&pn->all_ppp_mutex); free_netdev(dev); out1: *retp = ret; @@ -2492,7 +2602,11 @@ init_ppp_file(struct 
ppp_file *pf, int kind) */ static void ppp_shutdown_interface(struct ppp *ppp) { - mutex_lock(&all_ppp_mutex); + struct ppp_net *pn; + + pn = ppp_pernet(ppp->ppp_net); + mutex_lock(&pn->all_ppp_mutex); + /* This will call dev_close() for us. */ ppp_lock(ppp); if (!ppp->closing) { @@ -2502,11 +2616,12 @@ static void ppp_shutdown_interface(struct ppp *ppp) } else ppp_unlock(ppp); - unit_put(&ppp_units_idr, ppp->file.index); + unit_put(&pn->units_idr, ppp->file.index); ppp->file.dead = 1; ppp->owner = NULL; wake_up_interruptible(&ppp->file.rwait); - mutex_unlock(&all_ppp_mutex); + + mutex_unlock(&pn->all_ppp_mutex); } /* @@ -2554,9 +2669,9 @@ static void ppp_destroy_interface(struct ppp *ppp) * The caller should have locked the all_ppp_mutex. */ static struct ppp * -ppp_find_unit(int unit) +ppp_find_unit(struct ppp_net *pn, int unit) { - return unit_find(&ppp_units_idr, unit); + return unit_find(&pn->units_idr, unit); } /* @@ -2568,20 +2683,22 @@ ppp_find_unit(int unit) * when we have a lot of channels in use. */ static struct channel * -ppp_find_channel(int unit) +ppp_find_channel(struct ppp_net *pn, int unit) { struct channel *pch; - list_for_each_entry(pch, &new_channels, list) { + list_for_each_entry(pch, &pn->new_channels, list) { if (pch->file.index == unit) { - list_move(&pch->list, &all_channels); + list_move(&pch->list, &pn->all_channels); return pch; } } - list_for_each_entry(pch, &all_channels, list) { + + list_for_each_entry(pch, &pn->all_channels, list) { if (pch->file.index == unit) return pch; } + return NULL; } @@ -2592,11 +2709,14 @@ static int ppp_connect_channel(struct channel *pch, int unit) { struct ppp *ppp; + struct ppp_net *pn; int ret = -ENXIO; int hdrlen; - mutex_lock(&all_ppp_mutex); - ppp = ppp_find_unit(unit); + pn = ppp_pernet(pch->chan_net); + + mutex_lock(&pn->all_ppp_mutex); + ppp = ppp_find_unit(pn, unit); if (!ppp) goto out; write_lock_bh(&pch->upl); @@ -2620,7 +2740,7 @@ ppp_connect_channel(struct channel *pch, int unit) outl: write_unlock_bh(&pch->upl); out: - mutex_unlock(&all_ppp_mutex); + mutex_unlock(&pn->all_ppp_mutex); return ret; } @@ -2677,7 +2797,7 @@ static void __exit ppp_cleanup(void) unregister_chrdev(PPP_MAJOR, "ppp"); device_destroy(ppp_class, MKDEV(PPP_MAJOR, 0)); class_destroy(ppp_class); - idr_destroy(&ppp_units_idr); + unregister_pernet_gen_device(ppp_net_id, &ppp_net_ops); } /* @@ -2743,6 +2863,7 @@ static void *unit_find(struct idr *p, int n) module_init(ppp_init); module_exit(ppp_cleanup); +EXPORT_SYMBOL(ppp_register_net_channel); EXPORT_SYMBOL(ppp_register_channel); EXPORT_SYMBOL(ppp_unregister_channel); EXPORT_SYMBOL(ppp_channel_index); diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index a942892d6dfe..9d64bdf14770 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -22,6 +22,7 @@ #include #include #include +#include struct ppp_channel; @@ -56,6 +57,9 @@ extern void ppp_input(struct ppp_channel *, struct sk_buff *); that we may have missed a packet. */ extern void ppp_input_error(struct ppp_channel *, int code); +/* Attach a channel to a given PPP unit in specified net. */ +extern int ppp_register_net_channel(struct net *, struct ppp_channel *); + /* Attach a channel to a given PPP unit. 
*/ extern int ppp_register_channel(struct ppp_channel *); -- cgit v1.2.3-71-gd317 From d52a61c04c6c0814ca270a088feedb126436598e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 22 Jan 2009 00:38:56 -0800 Subject: irq: clean up irq stat methods David Miller suggested, related to a kstat_irqs related build breakage: > Either linux/kernel_stat.h provides the kstat_incr_irqs_this_cpu > interface or linux/irq.h does, not both. So move them to kernel_stat.h. Signed-off-by: Ingo Molnar --- include/linux/irq.h | 6 ------ include/linux/kernel_stat.h | 9 ++++++--- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index e9a878978c85..48901e9a33b9 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -202,12 +202,6 @@ extern struct irq_desc irq_desc[NR_IRQS]; extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); #endif /* CONFIG_SPARSE_IRQ */ -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#define kstat_incr_irqs_this_cpu(irqno, DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]++) - - extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); static inline struct irq_desc * diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index a3431b164bea..0c8b89f28a95 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -52,16 +52,19 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, { kstat_this_cpu.irqs[irq]++; } -#endif - -#ifndef CONFIG_GENERIC_HARDIRQS static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).irqs[irq]; } #else +#include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); +#define kstat_irqs_this_cpu(DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]) +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]++) + #endif /* -- cgit v1.2.3-71-gd317 From 5c0a66f5f3c9c59e2c341400048e2cff768e67a9 Mon Sep 17 00:00:00 2001 From: Benjamin Thery Date: Thu, 22 Jan 2009 04:56:17 +0000 Subject: netns: ipmr: store netns in struct mfc_cache This patch stores into struct mfc_cache the network namespace each mfc_cache belongs to. The new member is mfc_net. mfc_net is assigned at cache allocation and doesn't change during the rest of the cache entry life. A new net parameter is added to ipmr_cache_alloc/ipmr_cache_alloc_unres. This will help to retrieve the current netns around the IPv4 multicast routing code. At the moment, all mfc_cache are allocated in init_net. Signed-off-by: Benjamin Thery Signed-off-by: David S. 
Miller --- include/linux/mroute.h | 15 +++++++++++++++ net/ipv4/ipmr.c | 26 +++++++++++++++++--------- 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 8a455694d682..beaf3aae8aaf 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -193,6 +193,9 @@ struct vif_device struct mfc_cache { struct mfc_cache *next; /* Next entry on cache line */ +#ifdef CONFIG_NET_NS + struct net *mfc_net; +#endif __be32 mfc_mcastgrp; /* Group the entry belongs to */ __be32 mfc_origin; /* Source of packet */ vifi_t mfc_parent; /* Source interface */ @@ -215,6 +218,18 @@ struct mfc_cache } mfc_un; }; +static inline +struct net *mfc_net(const struct mfc_cache *mfc) +{ + return read_pnet(&mfc->mfc_net); +} + +static inline +void mfc_net_set(struct mfc_cache *mfc, struct net *net) +{ + write_pnet(&mfc->mfc_net, hold_net(net)); +} + #define MFC_STATIC 1 #define MFC_NOTIFY 2 diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 75a5f79cc226..8428a0fb5c10 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -327,6 +327,12 @@ static int vif_delete(int vifi, int notify) return 0; } +static inline void ipmr_cache_free(struct mfc_cache *c) +{ + release_net(mfc_net(c)); + kmem_cache_free(mrt_cachep, c); +} + /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ @@ -353,7 +359,7 @@ static void ipmr_destroy_unres(struct mfc_cache *c) kfree_skb(skb); } - kmem_cache_free(mrt_cachep, c); + ipmr_cache_free(c); } @@ -528,22 +534,24 @@ static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp) /* * Allocate a multicast cache entry */ -static struct mfc_cache *ipmr_cache_alloc(void) +static struct mfc_cache *ipmr_cache_alloc(struct net *net) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (c == NULL) return NULL; c->mfc_un.res.minvif = MAXVIFS; + mfc_net_set(c, net); return c; } -static struct mfc_cache *ipmr_cache_alloc_unres(void) +static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (c == NULL) return NULL; skb_queue_head_init(&c->mfc_un.unres.unresolved); c->mfc_un.unres.expires = jiffies + 10*HZ; + mfc_net_set(c, net); return c; } @@ -695,7 +703,7 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) */ if (atomic_read(&cache_resolve_queue_len) >= 10 || - (c=ipmr_cache_alloc_unres())==NULL) { + (c = ipmr_cache_alloc_unres(&init_net)) == NULL) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); @@ -718,7 +726,7 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) */ spin_unlock_bh(&mfc_unres_lock); - kmem_cache_free(mrt_cachep, c); + ipmr_cache_free(c); kfree_skb(skb); return err; } @@ -763,7 +771,7 @@ static int ipmr_mfc_delete(struct mfcctl *mfc) *cp = c->next; write_unlock_bh(&mrt_lock); - kmem_cache_free(mrt_cachep, c); + ipmr_cache_free(c); return 0; } } @@ -796,7 +804,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; - c = ipmr_cache_alloc(); + c = ipmr_cache_alloc(&init_net); if (c == NULL) return -ENOMEM; @@ -831,7 +839,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) if (uc) { ipmr_cache_resolve(uc, c); - kmem_cache_free(mrt_cachep, uc); + ipmr_cache_free(uc); } return 0; } @@ -868,7 +876,7 @@ static void mroute_clean_tables(struct sock *sk) *cp = c->next; write_unlock_bh(&mrt_lock); - kmem_cache_free(mrt_cachep, c); + ipmr_cache_free(c); } } -- 
cgit v1.2.3-71-gd317 From 4feb88e5c694bfe414cbc3ce0e383f7f7038f90b Mon Sep 17 00:00:00 2001 From: Benjamin Thery Date: Thu, 22 Jan 2009 04:56:23 +0000 Subject: netns: ipmr: enable namespace support in ipv4 multicast routing code This last patch makes the appropriate changes to use and propagate the network namespace where needed in IPv4 multicast routing code. This consists mainly in replacing all the remaining init_net occurences with current netns pointer retrieved from sockets, net devices or mfc_caches depending on the routines' contexts. Some routines receive a new 'struct net' parameter to propagate the current netns: * vif_add/vif_delete * ipmr_new_tunnel * mroute_clean_tables * ipmr_cache_find * ipmr_cache_report * ipmr_cache_unresolved * ipmr_mfc_add/ipmr_mfc_delete * ipmr_get_route * rt_fill_info (in route.c) Signed-off-by: Benjamin Thery Signed-off-by: David S. Miller --- include/linux/mroute.h | 3 +- net/ipv4/ipmr.c | 243 +++++++++++++++++++++++++++---------------------- net/ipv4/route.c | 11 ++- 3 files changed, 143 insertions(+), 114 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index beaf3aae8aaf..0d45b4e8d367 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -256,7 +256,8 @@ void mfc_net_set(struct mfc_cache *mfc, struct net *net) #ifdef __KERNEL__ struct rtmsg; -extern int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait); +extern int ipmr_get_route(struct net *net, struct sk_buff *skb, + struct rtmsg *rtm, int nowait); #endif #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a4fd97f1920c..21a6dc710f20 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -95,7 +95,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); -static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); +static int ipmr_cache_report(struct net *net, + struct sk_buff *pkt, vifi_t vifi, int assert); static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); #ifdef CONFIG_IP_PIMSM_V2 @@ -108,9 +109,11 @@ static struct timer_list ipmr_expire_timer; static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) { + struct net *net = dev_net(dev); + dev_close(dev); - dev = __dev_get_by_name(&init_net, "tunl0"); + dev = __dev_get_by_name(net, "tunl0"); if (dev) { const struct net_device_ops *ops = dev->netdev_ops; struct ifreq ifr; @@ -136,11 +139,11 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) } static -struct net_device *ipmr_new_tunnel(struct vifctl *v) +struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *dev; - dev = __dev_get_by_name(&init_net, "tunl0"); + dev = __dev_get_by_name(net, "tunl0"); if (dev) { const struct net_device_ops *ops = dev->netdev_ops; @@ -169,7 +172,8 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) dev = NULL; - if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) { + if (err == 0 && + (dev = __dev_get_by_name(net, p.name)) != NULL) { dev->flags |= IFF_MULTICAST; in_dev = __in_dev_get_rtnl(dev); @@ -199,10 +203,13 @@ failure: static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { + struct net *net = dev_net(dev); + read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; - ipmr_cache_report(skb, init_net.ipv4.mroute_reg_vif_num, IGMPMSG_WHOLEPKT); + ipmr_cache_report(net, 
skb, net->ipv4.mroute_reg_vif_num, + IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); return 0; @@ -269,16 +276,16 @@ failure: * @notify: Set to 1, if the caller is a notifier_call */ -static int vif_delete(int vifi, int notify) +static int vif_delete(struct net *net, int vifi, int notify) { struct vif_device *v; struct net_device *dev; struct in_device *in_dev; - if (vifi < 0 || vifi >= init_net.ipv4.maxvif) + if (vifi < 0 || vifi >= net->ipv4.maxvif) return -EADDRNOTAVAIL; - v = &init_net.ipv4.vif_table[vifi]; + v = &net->ipv4.vif_table[vifi]; write_lock_bh(&mrt_lock); dev = v->dev; @@ -290,17 +297,17 @@ static int vif_delete(int vifi, int notify) } #ifdef CONFIG_IP_PIMSM - if (vifi == init_net.ipv4.mroute_reg_vif_num) - init_net.ipv4.mroute_reg_vif_num = -1; + if (vifi == net->ipv4.mroute_reg_vif_num) + net->ipv4.mroute_reg_vif_num = -1; #endif - if (vifi+1 == init_net.ipv4.maxvif) { + if (vifi+1 == net->ipv4.maxvif) { int tmp; for (tmp=vifi-1; tmp>=0; tmp--) { - if (VIF_EXISTS(&init_net, tmp)) + if (VIF_EXISTS(net, tmp)) break; } - init_net.ipv4.maxvif = tmp+1; + net->ipv4.maxvif = tmp+1; } write_unlock_bh(&mrt_lock); @@ -333,8 +340,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c) { struct sk_buff *skb; struct nlmsgerr *e; + struct net *net = mfc_net(c); - atomic_dec(&init_net.ipv4.cache_resolve_queue_len); + atomic_dec(&net->ipv4.cache_resolve_queue_len); while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { @@ -346,7 +354,7 @@ static void ipmr_destroy_unres(struct mfc_cache *c) e->error = -ETIMEDOUT; memset(&e->msg, 0, sizeof(e->msg)); - rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); + rtnl_unicast(skb, net, NETLINK_CB(skb).pid); } else kfree_skb(skb); } @@ -401,13 +409,14 @@ out: static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) { int vifi; + struct net *net = mfc_net(cache); cache->mfc_un.res.minvif = MAXVIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXVIFS); - for (vifi = 0; vifi < init_net.ipv4.maxvif; vifi++) { - if (VIF_EXISTS(&init_net, vifi) && + for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) { + if (VIF_EXISTS(net, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) @@ -418,16 +427,16 @@ static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) } } -static int vif_add(struct vifctl *vifc, int mrtsock) +static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; - struct vif_device *v = &init_net.ipv4.vif_table[vifi]; + struct vif_device *v = &net->ipv4.vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; int err; /* Is vif busy ? 
*/ - if (VIF_EXISTS(&init_net, vifi)) + if (VIF_EXISTS(net, vifi)) return -EADDRINUSE; switch (vifc->vifc_flags) { @@ -437,7 +446,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ - if (init_net.ipv4.mroute_reg_vif_num >= 0) + if (net->ipv4.mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ipmr_reg_vif(); if (!dev) @@ -451,7 +460,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) break; #endif case VIFF_TUNNEL: - dev = ipmr_new_tunnel(vifc); + dev = ipmr_new_tunnel(net, vifc); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); @@ -462,7 +471,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) } break; case 0: - dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); + dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); @@ -503,20 +512,22 @@ static int vif_add(struct vifctl *vifc, int mrtsock) v->dev = dev; #ifdef CONFIG_IP_PIMSM if (v->flags&VIFF_REGISTER) - init_net.ipv4.mroute_reg_vif_num = vifi; + net->ipv4.mroute_reg_vif_num = vifi; #endif - if (vifi+1 > init_net.ipv4.maxvif) - init_net.ipv4.maxvif = vifi+1; + if (vifi+1 > net->ipv4.maxvif) + net->ipv4.maxvif = vifi+1; write_unlock_bh(&mrt_lock); return 0; } -static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp) +static struct mfc_cache *ipmr_cache_find(struct net *net, + __be32 origin, + __be32 mcastgrp) { int line = MFC_HASH(mcastgrp, origin); struct mfc_cache *c; - for (c = init_net.ipv4.mfc_cache_array[line]; c; c = c->next) { + for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) { if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp) break; } @@ -576,7 +587,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) memset(&e->msg, 0, sizeof(e->msg)); } - rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); + rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid); } else ip_mr_forward(skb, c, 0); } @@ -589,7 +600,8 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) * Called under mrt_lock. 
*/ -static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) +static int ipmr_cache_report(struct net *net, + struct sk_buff *pkt, vifi_t vifi, int assert) { struct sk_buff *skb; const int ihl = ip_hdrlen(pkt); @@ -621,7 +633,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = IGMPMSG_WHOLEPKT; msg->im_mbz = 0; - msg->im_vif = init_net.ipv4.mroute_reg_vif_num; + msg->im_vif = net->ipv4.mroute_reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -653,7 +665,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) skb->transport_header = skb->network_header; } - if (init_net.ipv4.mroute_sk == NULL) { + if (net->ipv4.mroute_sk == NULL) { kfree_skb(skb); return -EINVAL; } @@ -661,7 +673,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) /* * Deliver to mrouted */ - ret = sock_queue_rcv_skb(init_net.ipv4.mroute_sk, skb); + ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb); if (ret < 0) { if (net_ratelimit()) printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); @@ -676,7 +688,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) */ static int -ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) +ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb) { int err; struct mfc_cache *c; @@ -684,7 +696,7 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) spin_lock_bh(&mfc_unres_lock); for (c=mfc_unres_queue; c; c=c->next) { - if (net_eq(mfc_net(c), &init_net) && + if (net_eq(mfc_net(c), net) && c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) break; @@ -695,8 +707,8 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) * Create a new entry if allowable */ - if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) >= 10 || - (c = ipmr_cache_alloc_unres(&init_net)) == NULL) { + if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 || + (c = ipmr_cache_alloc_unres(net)) == NULL) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); @@ -713,7 +725,8 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) /* * Reflect first query at mrouted. 
*/ - if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) { + err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE); + if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ @@ -724,7 +737,7 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) return err; } - atomic_inc(&init_net.ipv4.cache_resolve_queue_len); + atomic_inc(&net->ipv4.cache_resolve_queue_len); c->next = mfc_unres_queue; mfc_unres_queue = c; @@ -750,14 +763,14 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) * MFC cache manipulation by user space mroute daemon */ -static int ipmr_mfc_delete(struct mfcctl *mfc) +static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc) { int line; struct mfc_cache *c, **cp; line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - for (cp = &init_net.ipv4.mfc_cache_array[line]; + for (cp = &net->ipv4.mfc_cache_array[line]; (c = *cp) != NULL; cp = &c->next) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { @@ -772,14 +785,14 @@ static int ipmr_mfc_delete(struct mfcctl *mfc) return -ENOENT; } -static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) +static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock) { int line; struct mfc_cache *uc, *c, **cp; line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - for (cp = &init_net.ipv4.mfc_cache_array[line]; + for (cp = &net->ipv4.mfc_cache_array[line]; (c = *cp) != NULL; cp = &c->next) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) @@ -799,7 +812,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; - c = ipmr_cache_alloc(&init_net); + c = ipmr_cache_alloc(net); if (c == NULL) return -ENOMEM; @@ -811,8 +824,8 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) c->mfc_flags |= MFC_STATIC; write_lock_bh(&mrt_lock); - c->next = init_net.ipv4.mfc_cache_array[line]; - init_net.ipv4.mfc_cache_array[line] = c; + c->next = net->ipv4.mfc_cache_array[line]; + net->ipv4.mfc_cache_array[line] = c; write_unlock_bh(&mrt_lock); /* @@ -822,11 +835,11 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) spin_lock_bh(&mfc_unres_lock); for (cp = &mfc_unres_queue; (uc=*cp) != NULL; cp = &uc->next) { - if (net_eq(mfc_net(uc), &init_net) && + if (net_eq(mfc_net(uc), net) && uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { *cp = uc->next; - atomic_dec(&init_net.ipv4.cache_resolve_queue_len); + atomic_dec(&net->ipv4.cache_resolve_queue_len); break; } } @@ -845,16 +858,16 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct sock *sk) +static void mroute_clean_tables(struct net *net) { int i; /* * Shut down all active vif entries */ - for (i = 0; i < init_net.ipv4.maxvif; i++) { - if (!(init_net.ipv4.vif_table[i].flags&VIFF_STATIC)) - vif_delete(i, 0); + for (i = 0; i < net->ipv4.maxvif; i++) { + if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC)) + vif_delete(net, i, 0); } /* @@ -863,7 +876,7 @@ static void mroute_clean_tables(struct sock *sk) for (i=0; iipv4.mfc_cache_array[i]; while ((c = *cp) != NULL) { if (c->mfc_flags&MFC_STATIC) { cp = &c->next; @@ -877,13 +890,13 @@ static void mroute_clean_tables(struct sock *sk) } } - if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) != 0) { + if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) { struct 
mfc_cache *c, **cp; spin_lock_bh(&mfc_unres_lock); cp = &mfc_unres_queue; while ((c = *cp) != NULL) { - if (!net_eq(mfc_net(c), &init_net)) { + if (!net_eq(mfc_net(c), net)) { cp = &c->next; continue; } @@ -897,15 +910,17 @@ static void mroute_clean_tables(struct sock *sk) static void mrtsock_destruct(struct sock *sk) { + struct net *net = sock_net(sk); + rtnl_lock(); - if (sk == init_net.ipv4.mroute_sk) { - IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--; + if (sk == net->ipv4.mroute_sk) { + IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; write_lock_bh(&mrt_lock); - init_net.ipv4.mroute_sk = NULL; + net->ipv4.mroute_sk = NULL; write_unlock_bh(&mrt_lock); - mroute_clean_tables(sk); + mroute_clean_tables(net); } rtnl_unlock(); } @@ -922,9 +937,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int int ret; struct vifctl vif; struct mfcctl mfc; + struct net *net = sock_net(sk); if (optname != MRT_INIT) { - if (sk != init_net.ipv4.mroute_sk && !capable(CAP_NET_ADMIN)) + if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN)) return -EACCES; } @@ -937,7 +953,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int return -ENOPROTOOPT; rtnl_lock(); - if (init_net.ipv4.mroute_sk) { + if (net->ipv4.mroute_sk) { rtnl_unlock(); return -EADDRINUSE; } @@ -945,15 +961,15 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { write_lock_bh(&mrt_lock); - init_net.ipv4.mroute_sk = sk; + net->ipv4.mroute_sk = sk; write_unlock_bh(&mrt_lock); - IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++; + IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; } rtnl_unlock(); return ret; case MRT_DONE: - if (sk != init_net.ipv4.mroute_sk) + if (sk != net->ipv4.mroute_sk) return -EACCES; return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: @@ -966,9 +982,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int return -ENFILE; rtnl_lock(); if (optname == MRT_ADD_VIF) { - ret = vif_add(&vif, sk == init_net.ipv4.mroute_sk); + ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk); } else { - ret = vif_delete(vif.vifc_vifi, 0); + ret = vif_delete(net, vif.vifc_vifi, 0); } rtnl_unlock(); return ret; @@ -985,9 +1001,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int return -EFAULT; rtnl_lock(); if (optname == MRT_DEL_MFC) - ret = ipmr_mfc_delete(&mfc); + ret = ipmr_mfc_delete(net, &mfc); else - ret = ipmr_mfc_add(&mfc, sk == init_net.ipv4.mroute_sk); + ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk); rtnl_unlock(); return ret; /* @@ -998,7 +1014,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int int v; if (get_user(v,(int __user *)optval)) return -EFAULT; - init_net.ipv4.mroute_do_assert = (v) ? 1 : 0; + net->ipv4.mroute_do_assert = (v) ? 
1 : 0; return 0; } #ifdef CONFIG_IP_PIMSM @@ -1012,11 +1028,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int rtnl_lock(); ret = 0; - if (v != init_net.ipv4.mroute_do_pim) { - init_net.ipv4.mroute_do_pim = v; - init_net.ipv4.mroute_do_assert = v; + if (v != net->ipv4.mroute_do_pim) { + net->ipv4.mroute_do_pim = v; + net->ipv4.mroute_do_assert = v; #ifdef CONFIG_IP_PIMSM_V2 - if (init_net.ipv4.mroute_do_pim) + if (net->ipv4.mroute_do_pim) ret = inet_add_protocol(&pim_protocol, IPPROTO_PIM); else @@ -1047,6 +1063,7 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int { int olr; int val; + struct net *net = sock_net(sk); if (optname != MRT_VERSION && #ifdef CONFIG_IP_PIMSM @@ -1068,10 +1085,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int val = 0x0305; #ifdef CONFIG_IP_PIMSM else if (optname == MRT_PIM) - val = init_net.ipv4.mroute_do_pim; + val = net->ipv4.mroute_do_pim; #endif else - val = init_net.ipv4.mroute_do_assert; + val = net->ipv4.mroute_do_assert; if (copy_to_user(optval, &val, olr)) return -EFAULT; return 0; @@ -1087,16 +1104,17 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) struct sioc_vif_req vr; struct vif_device *vif; struct mfc_cache *c; + struct net *net = sock_net(sk); switch (cmd) { case SIOCGETVIFCNT: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; - if (vr.vifi >= init_net.ipv4.maxvif) + if (vr.vifi >= net->ipv4.maxvif) return -EINVAL; read_lock(&mrt_lock); - vif = &init_net.ipv4.vif_table[vr.vifi]; - if (VIF_EXISTS(&init_net, vr.vifi)) { + vif = &net->ipv4.vif_table[vr.vifi]; + if (VIF_EXISTS(net, vr.vifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; @@ -1114,7 +1132,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) return -EFAULT; read_lock(&mrt_lock); - c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); + c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; @@ -1136,18 +1154,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; + struct net *net = dev_net(dev); struct vif_device *v; int ct; - if (!net_eq(dev_net(dev), &init_net)) + if (!net_eq(dev_net(dev), net)) return NOTIFY_DONE; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - v = &init_net.ipv4.vif_table[0]; - for (ct = 0; ct < init_net.ipv4.maxvif; ct++, v++) { + v = &net->ipv4.vif_table[0]; + for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) { if (v->dev == dev) - vif_delete(ct, 1); + vif_delete(net, ct, 1); } return NOTIFY_DONE; } @@ -1207,8 +1226,9 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) { + struct net *net = mfc_net(c); const struct iphdr *iph = ip_hdr(skb); - struct vif_device *vif = &init_net.ipv4.vif_table[vifi]; + struct vif_device *vif = &net->ipv4.vif_table[vifi]; struct net_device *dev; struct rtable *rt; int encap = 0; @@ -1222,7 +1242,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) vif->bytes_out += skb->len; vif->dev->stats.tx_bytes += skb->len; vif->dev->stats.tx_packets++; - ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); + ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT); kfree_skb(skb); return; } @@ -1235,7 +1255,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct 
mfc_cache *c, int vifi) .saddr = vif->local, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&init_net, &rt, &fl)) + if (ip_route_output_key(net, &rt, &fl)) goto out_free; encap = sizeof(struct iphdr); } else { @@ -1244,7 +1264,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) { .daddr = iph->daddr, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&init_net, &rt, &fl)) + if (ip_route_output_key(net, &rt, &fl)) goto out_free; } @@ -1308,9 +1328,10 @@ out_free: static int ipmr_find_vif(struct net_device *dev) { + struct net *net = dev_net(dev); int ct; - for (ct = init_net.ipv4.maxvif-1; ct >= 0; ct--) { - if (init_net.ipv4.vif_table[ct].dev == dev) + for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) { + if (net->ipv4.vif_table[ct].dev == dev) break; } return ct; @@ -1322,6 +1343,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local { int psend = -1; int vif, ct; + struct net *net = mfc_net(cache); vif = cache->mfc_parent; cache->mfc_un.res.pkt++; @@ -1330,7 +1352,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (init_net.ipv4.vif_table[vif].dev != skb->dev) { + if (net->ipv4.vif_table[vif].dev != skb->dev) { int true_vifi; if (skb->rtable->fl.iif == 0) { @@ -1351,24 +1373,24 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local cache->mfc_un.res.wrong_if++; true_vifi = ipmr_find_vif(skb->dev); - if (true_vifi >= 0 && init_net.ipv4.mroute_do_assert && + if (true_vifi >= 0 && net->ipv4.mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ - (init_net.ipv4.mroute_do_pim || + (net->ipv4.mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { cache->mfc_un.res.last_assert = jiffies; - ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF); + ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF); } goto dont_forward; } - init_net.ipv4.vif_table[vif].pkt_in++; - init_net.ipv4.vif_table[vif].bytes_in += skb->len; + net->ipv4.vif_table[vif].pkt_in++; + net->ipv4.vif_table[vif].bytes_in += skb->len; /* * Forward the frame @@ -1408,6 +1430,7 @@ dont_forward: int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; + struct net *net = dev_net(skb->dev); int local = skb->rtable->rt_flags&RTCF_LOCAL; /* Packet is looped back after forward, it should not be @@ -1428,9 +1451,9 @@ int ip_mr_input(struct sk_buff *skb) that we can forward NO IGMP messages. 
*/ read_lock(&mrt_lock); - if (init_net.ipv4.mroute_sk) { + if (net->ipv4.mroute_sk) { nf_reset(skb); - raw_rcv(init_net.ipv4.mroute_sk, skb); + raw_rcv(net->ipv4.mroute_sk, skb); read_unlock(&mrt_lock); return 0; } @@ -1439,7 +1462,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* * No usable cache entry @@ -1459,7 +1482,7 @@ int ip_mr_input(struct sk_buff *skb) vif = ipmr_find_vif(skb->dev); if (vif >= 0) { - int err = ipmr_cache_unresolved(vif, skb); + int err = ipmr_cache_unresolved(net, vif, skb); read_unlock(&mrt_lock); return err; @@ -1490,6 +1513,7 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) { struct net_device *reg_dev = NULL; struct iphdr *encap; + struct net *net = dev_net(skb->dev); encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); /* @@ -1504,8 +1528,8 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) return 1; read_lock(&mrt_lock); - if (init_net.ipv4.mroute_reg_vif_num >= 0) - reg_dev = init_net.ipv4.vif_table[init_net.ipv4.mroute_reg_vif_num].dev; + if (net->ipv4.mroute_reg_vif_num >= 0) + reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev; if (reg_dev) dev_hold(reg_dev); read_unlock(&mrt_lock); @@ -1540,13 +1564,14 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) int pim_rcv_v1(struct sk_buff * skb) { struct igmphdr *pim; + struct net *net = dev_net(skb->dev); if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) goto drop; pim = igmp_hdr(skb); - if (!init_net.ipv4.mroute_do_pim || + if (!net->ipv4.mroute_do_pim || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; @@ -1586,7 +1611,8 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) { int ct; struct rtnexthop *nhp; - struct net_device *dev = init_net.ipv4.vif_table[c->mfc_parent].dev; + struct net *net = mfc_net(c); + struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev; u8 *b = skb_tail_pointer(skb); struct rtattr *mp_head; @@ -1602,7 +1628,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = init_net.ipv4.vif_table[ct].dev->ifindex; + nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } @@ -1616,14 +1642,15 @@ rtattr_failure: return -EMSGSIZE; } -int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) +int ipmr_get_route(struct net *net, + struct sk_buff *skb, struct rtmsg *rtm, int nowait) { int err; struct mfc_cache *cache; struct rtable *rt = skb->rtable; read_lock(&mrt_lock); - cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); + cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst); if (cache == NULL) { struct sk_buff *skb2; @@ -1654,7 +1681,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; iph->version = 0; - err = ipmr_cache_unresolved(vif, skb2); + err = ipmr_cache_unresolved(net, vif, skb2); read_unlock(&mrt_lock); return err; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 97f71153584f..6a9e204c8024 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2779,7 +2779,8 @@ int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) return ip_route_output_flow(net, rp, flp, NULL, 
0); } -static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, +static int rt_fill_info(struct net *net, + struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { struct rtable *rt = skb->rtable; @@ -2844,8 +2845,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, __be32 dst = rt->rt_dst; if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) { - int err = ipmr_get_route(skb, r, nowait); + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) @@ -2950,7 +2951,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err <= 0) goto errout_free; @@ -2988,7 +2989,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (rt_is_expired(rt)) continue; skb->dst = dst_clone(&rt->u.dst); - if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { dst_release(xchg(&skb->dst, NULL)); -- cgit v1.2.3-71-gd317 From fd8757aed16470e088ecdad96ffd30f86c34424d Mon Sep 17 00:00:00 2001 From: Matthew Ranostay Date: Wed, 21 Jan 2009 17:45:12 -0500 Subject: Add PCI DFI vendor ID Add a define for DFI PCI vendor id. Signed-off-by: Matthew Ranostay Signed-off-by: Takashi Iwai --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d543365518ab..6b339766b7ad 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2106,6 +2106,8 @@ #define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c #define PCI_DEVICE_ID_MELLANOX_SINAI 0x6274 +#define PCI_VENDOR_ID_DFI 0x15bd + #define PCI_VENDOR_ID_QUICKNET 0x15e2 #define PCI_DEVICE_ID_QUICKNET_XJ 0x0500 -- cgit v1.2.3-71-gd317 From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 19:01:40 -0500 Subject: trace, lockdep: manual preempt count adding for local_bh_disable Impact: fix to preempt trace triggering lockdep check_flag failure In local_bh_disable, the use of add_preempt_count causes the preempt tracer to start recording the time preemption is off. But because it already modified the preempt_count to show softirqs disabled, and before it called the lockdep code to handle this, it causes a state that lockdep can not handle. The preempt tracer will reset the ring buffer on start of a trace, and the ring buffer reset code does a spin_lock_irqsave. This calls into lockdep and lockdep will fail when it detects the invalid state of having softirqs disabled but the internal current->softirqs_enabled is still set. The fix is to manually add the SOFTIRQ_OFFSET to preempt count and call the preempt tracer code outside the lockdep critical area. Thanks to Peter Zijlstra for suggesting this solution. 
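
For readability, the reordered CONFIG_TRACE_IRQFLAGS helper is shown here reassembled from the kernel/softirq.c hunk further down, with the diff markers stripped and a couple of short comments added; this is only a restatement of the patch below, not additional code.

static void __local_bh_disable(unsigned long ip)
{
	unsigned long flags;

	WARN_ON_ONCE(in_irq());

	raw_local_irq_save(flags);
	/* Raise the count by hand so add_preempt_count()'s tracer hook
	 * (and through it lockdep) is not entered yet. */
	preempt_count() += SOFTIRQ_OFFSET;
	/* Were softirqs turned off above: tell lockdep first... */
	if (softirq_count() == SOFTIRQ_OFFSET)
		trace_softirqs_off(ip);
	raw_local_irq_restore(flags);

	/* ...and only then notify the preempt tracer, outside the
	 * lockdep-critical area. */
	if (preempt_count() == SOFTIRQ_OFFSET)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
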
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 8 ++++---- kernel/softirq.c | 13 ++++++++++++- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..33085b88f87b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern unsigned long get_parent_ip(unsigned long addr); + struct seq_file; struct cfs_rq; struct task_group; diff --git a/kernel/sched.c b/kernel/sched.c index 52bbf1c842a8..c154825ae753 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4399,10 +4399,7 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) +unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; @@ -4412,6 +4409,9 @@ static inline unsigned long get_parent_ip(unsigned long addr) return addr; } +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..6edfc2c11d99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -79,13 +80,23 @@ static void __local_bh_disable(unsigned long ip) WARN_ON_ONCE(in_irq()); raw_local_irq_save(flags); - add_preempt_count(SOFTIRQ_OFFSET); + /* + * The preempt tracer hooks into add_preempt_count and will break + * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET + * is set and before current->softirq_enabled is cleared. + * We must manually increment preempt_count here and manually + * call the trace_preempt_off later. + */ + preempt_count() += SOFTIRQ_OFFSET; /* * Were softirqs turned off above: */ if (softirq_count() == SOFTIRQ_OFFSET) trace_softirqs_off(ip); raw_local_irq_restore(flags); + + if (preempt_count() == SOFTIRQ_OFFSET) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ static inline void __local_bh_disable(unsigned long ip) -- cgit v1.2.3-71-gd317 From 0db155de988031f925096a7df1bf9633790a2c18 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 23 Jan 2009 22:28:48 -0800 Subject: com20020: Fix allyesconfig build failure. Reported by Stephen Rothwell. Due to missing 'extern' in the com20020_netdev_ops declaration, each file that includes linux/com20020.h gets another copy defined in it's resulting object file. Signed-off-by: David S. Miller --- include/linux/com20020.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/com20020.h b/include/linux/com20020.h index 350afa773f8f..5dcfb944b6ce 100644 --- a/include/linux/com20020.h +++ b/include/linux/com20020.h @@ -29,7 +29,7 @@ int com20020_check(struct net_device *dev); int com20020_found(struct net_device *dev, int shared); -const struct net_device_ops com20020_netdev_ops; +extern const struct net_device_ops com20020_netdev_ops; /* The number of low I/O ports used by the card. 
*/ #define ARCNET_TOTAL_SIZE 8 -- cgit v1.2.3-71-gd317 From 9011262a37cb438f0fa9394b5e83840db8f9680a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 23 Jan 2009 12:06:23 -0200 Subject: ftrace: add ftrace_vprintk Impact: new helper function Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 8 ++++++++ kernel/trace/trace.c | 9 +++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9f7880d87c39..7840e718c6c7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -302,6 +302,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); extern int __ftrace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +extern int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); #else static inline void @@ -317,6 +320,11 @@ ftrace_printk(const char *fmt, ...) { return 0; } +static inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} static inline void ftrace_dump(void) { } #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2129ab9d2a48..2f8ac1f008f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2951,6 +2951,15 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) } EXPORT_SYMBOL_GPL(__ftrace_printk); +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { -- cgit v1.2.3-71-gd317 From 157f9c00e88529ed84bd7d581a40d411e5414cf0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 26 Jan 2009 15:00:56 -0200 Subject: tracing/blktrace: fix up checkpatch reported problems in ftrace plugin patch Also make sure sparse (make C=2 block/blktrace.o) is happy too. 
Reported-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 40 +++++++++++++++++++++++++--------------- fs/partitions/check.c | 5 +---- include/linux/blktrace_api.h | 5 +++++ 3 files changed, 31 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index 630f167f8240..1b2267c798b6 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -37,7 +37,7 @@ static int __read_mostly blk_tracer_enabled; static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ - { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC ) }, + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, { } }; @@ -169,7 +169,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; - if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled)) + if (unlikely(bt->trace_state != Blktrace_running || + !blk_tracer_enabled)) return; what |= ddir_act[rw & WRITE]; @@ -192,7 +193,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, sizeof(*t) + pdu_len, &flags); if (!event) return; - + ent = ring_buffer_event_data(event); t = (struct blk_io_trace *)ent; pc = preempt_count(); @@ -234,7 +235,7 @@ record_it: if (blk_tr) { ring_buffer_unlock_commit(blk_tr->buffer, event, flags); if (pid != 0 && - (blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) == 0 && + !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && (trace_flags & TRACE_ITER_STACKTRACE) != 0) __trace_stack(blk_tr, NULL, flags, 5, pc); trace_wake_up(); @@ -955,19 +956,27 @@ static void blk_unregister_tracepoints(void) static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { - int i = 0; + int i = 0; - if (t->action & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) rwbs[i++] = 'W'; - else if (t->bytes) rwbs[i++] = 'R'; - else rwbs[i++] = 'N'; + if (t->action & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (t->action & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; - if (t->action & BLK_TC_AHEAD) rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) rwbs[i++] = 'M'; + if (t->action & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (t->action & BLK_TC_BARRIER) + rwbs[i++] = 'B'; + if (t->action & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (t->action & BLK_TC_META) + rwbs[i++] = 'M'; - rwbs[i] = '\0'; + rwbs[i] = '\0'; } static inline @@ -1049,7 +1058,8 @@ static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) return trace_seq_printf(s, "[%s]\n", cmd); } -static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) +static int blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent) { if (t_sec(ent)) return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 01714efdc65a..8a17f7edcc74 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "check.h" @@ -268,10 +269,6 @@ ssize_t part_fail_store(struct device *dev, } #endif -#ifdef CONFIG_BLK_DEV_IO_TRACE -extern struct attribute_group blk_trace_attr_group; -#endif - static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); diff --git 
a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 1dba3493d520..59b4b2e8ab67 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -142,6 +142,9 @@ struct blk_user_trace_setup { #ifdef __KERNEL__ #if defined(CONFIG_BLK_DEV_IO_TRACE) + +#include + struct blk_trace { int trace_state; struct rchan *rchan; @@ -192,6 +195,8 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); +extern struct attribute_group blk_trace_attr_group; + #else /* !CONFIG_BLK_DEV_IO_TRACE */ #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) #define blk_trace_shutdown(q) do { } while (0) -- cgit v1.2.3-71-gd317 From d5a9e24afb4ab38110ebb777588ea0bd0eacbd0a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Jan 2009 16:22:11 -0800 Subject: net: Allow RX queue selection to seed TX queue hashing. The idea is that drivers which implement multiqueue RX pre-seed the SKB by recording the RX queue selected by the hardware. If such a seed is found on TX, we'll use that to select the outgoing TX queue. This helps get more consistent load balancing on router and firewall loads. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 +++++++++++++++ net/core/dev.c | 8 ++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cf2cb50f77d1..a2c2378a9c58 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1904,6 +1904,21 @@ static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_bu to->queue_mapping = from->queue_mapping; } +static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) +{ + skb->queue_mapping = rx_queue + 1; +} + +static inline u16 skb_get_rx_queue(struct sk_buff *skb) +{ + return skb->queue_mapping - 1; +} + +static inline bool skb_rx_queue_recorded(struct sk_buff *skb) +{ + return (skb->queue_mapping != 0); +} + #ifdef CONFIG_XFRM static inline struct sec_path *skb_sec_path(struct sk_buff *skb) { diff --git a/net/core/dev.c b/net/core/dev.c index 5379b0c1190a..b21ad0b47aae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1722,6 +1722,13 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) simple_tx_hashrnd_initialized = 1; } + if (skb_rx_queue_recorded(skb)) { + u32 val = skb_get_rx_queue(skb); + + hash = jhash_1word(val, simple_tx_hashrnd); + goto out; + } + switch (skb->protocol) { case htons(ETH_P_IP): if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))) @@ -1759,6 +1766,7 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd); +out: return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } -- cgit v1.2.3-71-gd317 From 598804cd041c395ce87302af9088b2f227196185 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 9 Jan 2009 00:55:39 +0300 Subject: powerpc/fsl_pci: Add MPC83xx PCI-E controller RC mode support This patch adds support for PCI-Express controllers as found on the newer MPC83xx chips. The work is loosely based on the Tony Li's patch[1], but unlike the original patch, this patch implements sliding window for the Type 1 transactions using outbound window translations, so we don't have to ioremap the whole PCI-E configuration space. 
[1] http://ozlabs.org/pipermail/linuxppc-dev/2008-January/049028.html Signed-off-by: Tony Li Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/sysdev/fsl_pci.c | 244 +++++++++++++++++++++++++++++++++++++----- include/linux/pci_ids.h | 8 ++ 2 files changed, 228 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 9817f63723dd..78021d8afc53 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -1,12 +1,16 @@ /* * MPC83xx/85xx/86xx PCI/PCIE support routing. * - * Copyright 2007,2008 Freescale Semiconductor, Inc + * Copyright 2007-2009 Freescale Semiconductor, Inc. + * Copyright 2008-2009 MontaVista Software, Inc. * * Initial author: Xianghua Xiao * Recode: ZHANG WEI * Rewrite the routing for Frescale PCI and PCI Express * Roy Zang + * MPC83xx PCI-Express support: + * Tony Li + * Anton Vorontsov * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -27,6 +31,29 @@ #include #include +static int fsl_pcie_bus_fixup; + +static void __init quirk_fsl_pcie_header(struct pci_dev *dev) +{ + /* if we aren't a PCIe don't bother */ + if (!pci_find_capability(dev, PCI_CAP_ID_EXP)) + return; + + dev->class = PCI_CLASS_BRIDGE_PCI << 8; + fsl_pcie_bus_fixup = 1; + return; +} + +static int __init fsl_pcie_check_link(struct pci_controller *hose) +{ + u32 val; + + early_read_config_dword(hose, 0, 0, PCIE_LTSSM, &val); + if (val < PCIE_LTSSM_L0) + return 1; + return 0; +} + #if defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_86xx) static int __init setup_one_atmu(struct ccsr_pci __iomem *pci, unsigned int index, const struct resource *res, @@ -159,28 +186,6 @@ static void __init setup_pci_pcsrbar(struct pci_controller *hose) #endif } -static int fsl_pcie_bus_fixup; - -static void __init quirk_fsl_pcie_header(struct pci_dev *dev) -{ - /* if we aren't a PCIe don't bother */ - if (!pci_find_capability(dev, PCI_CAP_ID_EXP)) - return ; - - dev->class = PCI_CLASS_BRIDGE_PCI << 8; - fsl_pcie_bus_fixup = 1; - return ; -} - -static int __init fsl_pcie_check_link(struct pci_controller *hose) -{ - u32 val; - early_read_config_dword(hose, 0, 0, PCIE_LTSSM, &val); - if (val < PCIE_LTSSM_L0) - return 1; - return 0; -} - void fsl_pcibios_fixup_bus(struct pci_bus *bus) { struct pci_controller *hose = (struct pci_controller *) bus->sysdata; @@ -294,8 +299,184 @@ DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8610, quirk_fsl_pcie_header); #endif /* CONFIG_PPC_85xx || CONFIG_PPC_86xx */ #if defined(CONFIG_PPC_83xx) || defined(CONFIG_PPC_MPC512x) +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8314E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8314, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8315E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8315, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8377E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8377, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8378E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8378, quirk_fsl_pcie_header); + +struct mpc83xx_pcie_priv { + void __iomem *cfg_type0; + void __iomem *cfg_type1; + u32 dev_base; +}; + +/* + * With the convention of u-boot, the PCIE outbound window 0 serves + * as configuration transactions 
outbound. + */ +#define PEX_OUTWIN0_BAR 0xCA4 +#define PEX_OUTWIN0_TAL 0xCA8 +#define PEX_OUTWIN0_TAH 0xCAC + +static int mpc83xx_pcie_exclude_device(struct pci_bus *bus, unsigned int devfn) +{ + struct pci_controller *hose = bus->sysdata; + + if (hose->indirect_type & PPC_INDIRECT_TYPE_NO_PCIE_LINK) + return PCIBIOS_DEVICE_NOT_FOUND; + /* + * Workaround for the HW bug: for Type 0 configure transactions the + * PCI-E controller does not check the device number bits and just + * assumes that the device number bits are 0. + */ + if (bus->number == hose->first_busno || + bus->primary == hose->first_busno) { + if (devfn & 0xf8) + return PCIBIOS_DEVICE_NOT_FOUND; + } + + if (ppc_md.pci_exclude_device) { + if (ppc_md.pci_exclude_device(hose, bus->number, devfn)) + return PCIBIOS_DEVICE_NOT_FOUND; + } + + return PCIBIOS_SUCCESSFUL; +} + +static void __iomem *mpc83xx_pcie_remap_cfg(struct pci_bus *bus, + unsigned int devfn, int offset) +{ + struct pci_controller *hose = bus->sysdata; + struct mpc83xx_pcie_priv *pcie = hose->dn->data; + u8 bus_no = bus->number - hose->first_busno; + u32 dev_base = bus_no << 24 | devfn << 16; + int ret; + + ret = mpc83xx_pcie_exclude_device(bus, devfn); + if (ret) + return NULL; + + offset &= 0xfff; + + /* Type 0 */ + if (bus->number == hose->first_busno) + return pcie->cfg_type0 + offset; + + if (pcie->dev_base == dev_base) + goto mapped; + + out_le32(pcie->cfg_type0 + PEX_OUTWIN0_TAL, dev_base); + + pcie->dev_base = dev_base; +mapped: + return pcie->cfg_type1 + offset; +} + +static int mpc83xx_pcie_read_config(struct pci_bus *bus, unsigned int devfn, + int offset, int len, u32 *val) +{ + void __iomem *cfg_addr; + + cfg_addr = mpc83xx_pcie_remap_cfg(bus, devfn, offset); + if (!cfg_addr) + return PCIBIOS_DEVICE_NOT_FOUND; + + switch (len) { + case 1: + *val = in_8(cfg_addr); + break; + case 2: + *val = in_le16(cfg_addr); + break; + default: + *val = in_le32(cfg_addr); + break; + } + + return PCIBIOS_SUCCESSFUL; +} + +static int mpc83xx_pcie_write_config(struct pci_bus *bus, unsigned int devfn, + int offset, int len, u32 val) +{ + void __iomem *cfg_addr; + + cfg_addr = mpc83xx_pcie_remap_cfg(bus, devfn, offset); + if (!cfg_addr) + return PCIBIOS_DEVICE_NOT_FOUND; + + switch (len) { + case 1: + out_8(cfg_addr, val); + break; + case 2: + out_le16(cfg_addr, val); + break; + default: + out_le32(cfg_addr, val); + break; + } + + return PCIBIOS_SUCCESSFUL; +} + +static struct pci_ops mpc83xx_pcie_ops = { + .read = mpc83xx_pcie_read_config, + .write = mpc83xx_pcie_write_config, +}; + +static int __init mpc83xx_pcie_setup(struct pci_controller *hose, + struct resource *reg) +{ + struct mpc83xx_pcie_priv *pcie; + u32 cfg_bar; + int ret = -ENOMEM; + + pcie = zalloc_maybe_bootmem(sizeof(*pcie), GFP_KERNEL); + if (!pcie) + return ret; + + pcie->cfg_type0 = ioremap(reg->start, resource_size(reg)); + if (!pcie->cfg_type0) + goto err0; + + cfg_bar = in_le32(pcie->cfg_type0 + PEX_OUTWIN0_BAR); + if (!cfg_bar) { + /* PCI-E isn't configured. 
*/ + ret = -ENODEV; + goto err1; + } + + pcie->cfg_type1 = ioremap(cfg_bar, 0x1000); + if (!pcie->cfg_type1) + goto err1; + + WARN_ON(hose->dn->data); + hose->dn->data = pcie; + hose->ops = &mpc83xx_pcie_ops; + + out_le32(pcie->cfg_type0 + PEX_OUTWIN0_TAH, 0); + out_le32(pcie->cfg_type0 + PEX_OUTWIN0_TAL, 0); + + if (fsl_pcie_check_link(hose)) + hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK; + + return 0; +err1: + iounmap(pcie->cfg_type0); +err0: + kfree(pcie); + return ret; + +} + int __init mpc83xx_add_bridge(struct device_node *dev) { + int ret; int len; struct pci_controller *hose; struct resource rsrc_reg; @@ -303,6 +484,11 @@ int __init mpc83xx_add_bridge(struct device_node *dev) const int *bus_range; int primary; + if (!of_device_is_available(dev)) { + pr_warning("%s: disabled by the firmware.\n", + dev->full_name); + return -ENODEV; + } pr_debug("Adding PCI host bridge %s\n", dev->full_name); /* Fetch host bridge registers address */ @@ -350,7 +536,14 @@ int __init mpc83xx_add_bridge(struct device_node *dev) hose->first_busno = bus_range ? bus_range[0] : 0; hose->last_busno = bus_range ? bus_range[1] : 0xff; - setup_indirect_pci(hose, rsrc_cfg.start, rsrc_cfg.start + 4, 0); + if (of_device_is_compatible(dev, "fsl,mpc8314-pcie")) { + ret = mpc83xx_pcie_setup(hose, &rsrc_reg); + if (ret) + goto err0; + } else { + setup_indirect_pci(hose, rsrc_cfg.start, + rsrc_cfg.start + 4, 0); + } printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. " "Firmware bus number: %d->%d\n", @@ -365,5 +558,8 @@ int __init mpc83xx_add_bridge(struct device_node *dev) pci_process_bridge_OF_ranges(hose, dev, primary); return 0; +err0: + pcibios_free_controller(hose); + return ret; } #endif /* CONFIG_PPC_83xx */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index febc10ed3858..ff9b7be2b791 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2217,6 +2217,14 @@ #define PCI_DEVICE_ID_TDI_EHCI 0x0101 #define PCI_VENDOR_ID_FREESCALE 0x1957 +#define PCI_DEVICE_ID_MPC8315E 0x00b4 +#define PCI_DEVICE_ID_MPC8315 0x00b5 +#define PCI_DEVICE_ID_MPC8314E 0x00b6 +#define PCI_DEVICE_ID_MPC8314 0x00b7 +#define PCI_DEVICE_ID_MPC8378E 0x00c4 +#define PCI_DEVICE_ID_MPC8378 0x00c5 +#define PCI_DEVICE_ID_MPC8377E 0x00c6 +#define PCI_DEVICE_ID_MPC8377 0x00c7 #define PCI_DEVICE_ID_MPC8548E 0x0012 #define PCI_DEVICE_ID_MPC8548 0x0013 #define PCI_DEVICE_ID_MPC8543E 0x0014 -- cgit v1.2.3-71-gd317 From 3fb1b6ad0679ad671bd496712b2a088550ee86b2 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 22 Jan 2009 09:55:59 +0000 Subject: sh: CMT clockevent platform driver SuperH CMT clockevent driver. Both 16-bit and 32-bit CMT versions are supported, but only 32-bit is tested. This driver contains support for both clockevents and clocksources, but no unregistration is supported at this point. Works fine as clock source and/or event in periodic or oneshot mode. Tested on sh7722 and sh7723, but should work with any cpu/architecture. This version is lacking clocksource and early platform driver support for now - this to minimize the amount of dependencies. 
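For context, the driver added below binds to a "sh_cmt" platform device that carries a struct sh_cmt_config as platform data plus one memory resource and one IRQ. The following board-side registration is only a sketch: the register window, IRQ number, channel offset, clock name and rating are illustrative assumptions, not values taken from this patch.

#include <linux/kernel.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>
#include <linux/sh_cmt.h>

static struct sh_cmt_config cmt0_platform_data = {
	.name			= "CMT0",
	.channel_offset		= 0x10,		/* hypothetical: offset back to the shared CMSTR */
	.timer_bit		= 0,		/* start/stop bit for this channel in CMSTR */
	.clk			= "cmt0",	/* hypothetical clock name passed to clk_get() */
	.clockevent_rating	= 125,
	.clocksource_rating	= 0,		/* clocksource registration not wired up yet */
};

static struct resource cmt0_resources[] = {
	{
		/* 16-byte channel window => 32-bit CMT; a 6-byte window selects 16-bit mode */
		.start	= 0xffd80010,		/* hypothetical register base */
		.end	= 0xffd8001f,
		.flags	= IORESOURCE_MEM,
	},
	{
		.start	= 104,			/* hypothetical compare-match IRQ */
		.end	= 104,
		.flags	= IORESOURCE_IRQ,
	},
};

static struct platform_device cmt0_device = {
	.name		= "sh_cmt",
	.id		= 0,
	.dev		= {
		.platform_data	= &cmt0_platform_data,
	},
	.resource	= cmt0_resources,
	.num_resources	= ARRAY_SIZE(cmt0_resources),
};

A board setup file would then hand this to platform_device_register(&cmt0_device) (or platform_add_devices()) so that sh_cmt_probe() can pick it up.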
Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 8 + drivers/clocksource/Makefile | 1 + drivers/clocksource/sh_cmt.c | 615 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sh_cmt.h | 13 + 4 files changed, 637 insertions(+) create mode 100644 drivers/clocksource/sh_cmt.c create mode 100644 include/linux/sh_cmt.h (limited to 'include/linux') diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index ebabe518e729..5407e1295e51 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -397,6 +397,14 @@ source "arch/sh/boards/Kconfig" menu "Timer and clock configuration" +config SH_TIMER_CMT + def_bool n + prompt "CMT support" + select GENERIC_TIME + select GENERIC_CLOCKEVENTS + help + This enables build of the CMT system timer driver. + config SH_TMU def_bool y prompt "TMU timer support" diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 1525882190fd..1efb2879a94f 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_ATMEL_TCB_CLKSRC) += tcb_clksrc.o obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o obj-$(CONFIG_SCx200HR_TIMER) += scx200_hrt.o +obj-$(CONFIG_SH_TIMER_CMT) += sh_cmt.o diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c new file mode 100644 index 000000000000..7783b42f6914 --- /dev/null +++ b/drivers/clocksource/sh_cmt.c @@ -0,0 +1,615 @@ +/* + * SuperH Timer Support - CMT + * + * Copyright (C) 2008 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sh_cmt_priv { + void __iomem *mapbase; + struct clk *clk; + unsigned long width; /* 16 or 32 bit version of hardware block */ + unsigned long overflow_bit; + unsigned long clear_bits; + struct irqaction irqaction; + struct platform_device *pdev; + + unsigned long flags; + unsigned long match_value; + unsigned long next_match_value; + unsigned long max_match_value; + unsigned long rate; + spinlock_t lock; + struct clock_event_device ced; + unsigned long total_cycles; +}; + +static DEFINE_SPINLOCK(sh_cmt_lock); + +#define CMSTR -1 /* shared register */ +#define CMCSR 0 /* channel register */ +#define CMCNT 1 /* channel register */ +#define CMCOR 2 /* channel register */ + +static inline unsigned long sh_cmt_read(struct sh_cmt_priv *p, int reg_nr) +{ + struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == CMSTR) { + offs = 0; + base -= cfg->channel_offset; + } else + offs = reg_nr; + + if (p->width == 16) + offs <<= 1; + else { + offs <<= 2; + if ((reg_nr == CMCNT) || (reg_nr == CMCOR)) + return ioread32(base + offs); + } + + return ioread16(base + offs); +} + +static inline void sh_cmt_write(struct sh_cmt_priv *p, int reg_nr, + unsigned long value) +{ + struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == CMSTR) { + offs = 0; + base -= cfg->channel_offset; + } else + offs = reg_nr; + + if (p->width == 16) + offs <<= 1; + else { + offs <<= 2; + if ((reg_nr == CMCNT) || (reg_nr == CMCOR)) { + iowrite32(value, base + offs); + return; + } + } + + iowrite16(value, base + offs); +} + +static unsigned long sh_cmt_get_counter(struct sh_cmt_priv *p, + int *has_wrapped) +{ + unsigned long v1, v2, v3; + + /* Make sure the timer value is stable. 
Stolen from acpi_pm.c */ + do { + v1 = sh_cmt_read(p, CMCNT); + v2 = sh_cmt_read(p, CMCNT); + v3 = sh_cmt_read(p, CMCNT); + } while (unlikely((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) + || (v3 > v1 && v3 < v2))); + + *has_wrapped = sh_cmt_read(p, CMCSR) & p->overflow_bit; + return v2; +} + + +static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start) +{ + struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + unsigned long flags, value; + + /* start stop register shared by multiple timer channels */ + spin_lock_irqsave(&sh_cmt_lock, flags); + value = sh_cmt_read(p, CMSTR); + + if (start) + value |= 1 << cfg->timer_bit; + else + value &= ~(1 << cfg->timer_bit); + + sh_cmt_write(p, CMSTR, value); + spin_unlock_irqrestore(&sh_cmt_lock, flags); +} + +static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate) +{ + struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + int ret; + + /* enable clock */ + ret = clk_enable(p->clk); + if (ret) { + pr_err("sh_cmt: cannot enable clock \"%s\"\n", cfg->clk); + return ret; + } + *rate = clk_get_rate(p->clk) / 8; + + /* make sure channel is disabled */ + sh_cmt_start_stop_ch(p, 0); + + /* configure channel, periodic mode and maximum timeout */ + if (p->width == 16) + sh_cmt_write(p, CMCSR, 0); + else + sh_cmt_write(p, CMCSR, 0x01a4); + + sh_cmt_write(p, CMCOR, 0xffffffff); + sh_cmt_write(p, CMCNT, 0); + + /* enable channel */ + sh_cmt_start_stop_ch(p, 1); + return 0; +} + +static void sh_cmt_disable(struct sh_cmt_priv *p) +{ + /* disable channel */ + sh_cmt_start_stop_ch(p, 0); + + /* stop clock */ + clk_disable(p->clk); +} + +/* private flags */ +#define FLAG_CLOCKEVENT (1 << 0) +#define FLAG_CLOCKSOURCE (1 << 1) +#define FLAG_REPROGRAM (1 << 2) +#define FLAG_SKIPEVENT (1 << 3) +#define FLAG_IRQCONTEXT (1 << 4) + +static void sh_cmt_clock_event_program_verify(struct sh_cmt_priv *p, + int absolute) +{ + unsigned long new_match; + unsigned long value = p->next_match_value; + unsigned long delay = 0; + unsigned long now = 0; + int has_wrapped; + + now = sh_cmt_get_counter(p, &has_wrapped); + p->flags |= FLAG_REPROGRAM; /* force reprogram */ + + if (has_wrapped) { + /* we're competing with the interrupt handler. + * -> let the interrupt handler reprogram the timer. + * -> interrupt number two handles the event. + */ + p->flags |= FLAG_SKIPEVENT; + return; + } + + if (absolute) + now = 0; + + do { + /* reprogram the timer hardware, + * but don't save the new match value yet. + */ + new_match = now + value + delay; + if (new_match > p->max_match_value) + new_match = p->max_match_value; + + sh_cmt_write(p, CMCOR, new_match); + + now = sh_cmt_get_counter(p, &has_wrapped); + if (has_wrapped && (new_match > p->match_value)) { + /* we are changing to a greater match value, + * so this wrap must be caused by the counter + * matching the old value. + * -> first interrupt reprograms the timer. + * -> interrupt number two handles the event. + */ + p->flags |= FLAG_SKIPEVENT; + break; + } + + if (has_wrapped) { + /* we are changing to a smaller match value, + * so the wrap must be caused by the counter + * matching the new value. + * -> save programmed match value. + * -> let isr handle the event. + */ + p->match_value = new_match; + break; + } + + /* be safe: verify hardware settings */ + if (now < new_match) { + /* timer value is below match value, all good. + * this makes sure we won't miss any match events. + * -> save programmed match value. + * -> let isr handle the event. 
+ */ + p->match_value = new_match; + break; + } + + /* the counter has reached a value greater + * than our new match value. and since the + * has_wrapped flag isn't set we must have + * programmed a too close event. + * -> increase delay and retry. + */ + if (delay) + delay <<= 1; + else + delay = 1; + + if (!delay) + pr_warning("sh_cmt: too long delay\n"); + + } while (delay); +} + +static void sh_cmt_set_next(struct sh_cmt_priv *p, unsigned long delta) +{ + unsigned long flags; + + if (delta > p->max_match_value) + pr_warning("sh_cmt: delta out of range\n"); + + spin_lock_irqsave(&p->lock, flags); + p->next_match_value = delta; + sh_cmt_clock_event_program_verify(p, 0); + spin_unlock_irqrestore(&p->lock, flags); +} + +static irqreturn_t sh_cmt_interrupt(int irq, void *dev_id) +{ + struct sh_cmt_priv *p = dev_id; + + /* clear flags */ + sh_cmt_write(p, CMCSR, sh_cmt_read(p, CMCSR) & p->clear_bits); + + /* update clock source counter to begin with if enabled + * the wrap flag should be cleared by the timer specific + * isr before we end up here. + */ + if (p->flags & FLAG_CLOCKSOURCE) + p->total_cycles += p->match_value; + + if (!(p->flags & FLAG_REPROGRAM)) + p->next_match_value = p->max_match_value; + + p->flags |= FLAG_IRQCONTEXT; + + if (p->flags & FLAG_CLOCKEVENT) { + if (!(p->flags & FLAG_SKIPEVENT)) { + if (p->ced.mode == CLOCK_EVT_MODE_ONESHOT) { + p->next_match_value = p->max_match_value; + p->flags |= FLAG_REPROGRAM; + } + + p->ced.event_handler(&p->ced); + } + } + + p->flags &= ~FLAG_SKIPEVENT; + + if (p->flags & FLAG_REPROGRAM) { + p->flags &= ~FLAG_REPROGRAM; + sh_cmt_clock_event_program_verify(p, 1); + + if (p->flags & FLAG_CLOCKEVENT) + if ((p->ced.mode == CLOCK_EVT_MODE_SHUTDOWN) + || (p->match_value == p->next_match_value)) + p->flags &= ~FLAG_REPROGRAM; + } + + p->flags &= ~FLAG_IRQCONTEXT; + + return IRQ_HANDLED; +} + +static int sh_cmt_start(struct sh_cmt_priv *p, unsigned long flag) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&p->lock, flags); + + if (!(p->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) + ret = sh_cmt_enable(p, &p->rate); + + if (ret) + goto out; + p->flags |= flag; + + /* setup timeout if no clockevent */ + if ((flag == FLAG_CLOCKSOURCE) && (!(p->flags & FLAG_CLOCKEVENT))) + sh_cmt_set_next(p, p->max_match_value); + out: + spin_unlock_irqrestore(&p->lock, flags); + + return ret; +} + +static void sh_cmt_stop(struct sh_cmt_priv *p, unsigned long flag) +{ + unsigned long flags; + unsigned long f; + + spin_lock_irqsave(&p->lock, flags); + + f = p->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE); + p->flags &= ~flag; + + if (f && !(p->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) + sh_cmt_disable(p); + + /* adjust the timeout to maximum if only clocksource left */ + if ((flag == FLAG_CLOCKEVENT) && (p->flags & FLAG_CLOCKSOURCE)) + sh_cmt_set_next(p, p->max_match_value); + + spin_unlock_irqrestore(&p->lock, flags); +} + +static struct sh_cmt_priv *ced_to_sh_cmt(struct clock_event_device *ced) +{ + return container_of(ced, struct sh_cmt_priv, ced); +} + +static void sh_cmt_clock_event_start(struct sh_cmt_priv *p, int periodic) +{ + struct clock_event_device *ced = &p->ced; + + sh_cmt_start(p, FLAG_CLOCKEVENT); + + /* TODO: calculate good shift from rate and counter bit width */ + + ced->shift = 32; + ced->mult = div_sc(p->rate, NSEC_PER_SEC, ced->shift); + ced->max_delta_ns = clockevent_delta2ns(p->max_match_value, ced); + ced->min_delta_ns = clockevent_delta2ns(0x1f, ced); + + if (periodic) + sh_cmt_set_next(p, (p->rate + HZ/2) / HZ); + 
else + sh_cmt_set_next(p, p->max_match_value); +} + +static void sh_cmt_clock_event_mode(enum clock_event_mode mode, + struct clock_event_device *ced) +{ + struct sh_cmt_priv *p = ced_to_sh_cmt(ced); + + /* deal with old setting first */ + switch (ced->mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + sh_cmt_stop(p, FLAG_CLOCKEVENT); + break; + default: + break; + } + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + pr_info("sh_cmt: %s used for periodic clock events\n", + ced->name); + sh_cmt_clock_event_start(p, 1); + break; + case CLOCK_EVT_MODE_ONESHOT: + pr_info("sh_cmt: %s used for oneshot clock events\n", + ced->name); + sh_cmt_clock_event_start(p, 0); + break; + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + sh_cmt_stop(p, FLAG_CLOCKEVENT); + break; + default: + break; + } +} + +static int sh_cmt_clock_event_next(unsigned long delta, + struct clock_event_device *ced) +{ + struct sh_cmt_priv *p = ced_to_sh_cmt(ced); + + BUG_ON(ced->mode != CLOCK_EVT_MODE_ONESHOT); + if (likely(p->flags & FLAG_IRQCONTEXT)) + p->next_match_value = delta; + else + sh_cmt_set_next(p, delta); + + return 0; +} + +static void sh_cmt_register_clockevent(struct sh_cmt_priv *p, + char *name, unsigned long rating) +{ + struct clock_event_device *ced = &p->ced; + + memset(ced, 0, sizeof(*ced)); + + ced->name = name; + ced->features = CLOCK_EVT_FEAT_PERIODIC; + ced->features |= CLOCK_EVT_FEAT_ONESHOT; + ced->rating = rating; + ced->cpumask = cpumask_of(0); + ced->set_next_event = sh_cmt_clock_event_next; + ced->set_mode = sh_cmt_clock_event_mode; + + pr_info("sh_cmt: %s used for clock events\n", ced->name); + ced->mult = 1; /* work around misplaced WARN_ON() in clockevents.c */ + clockevents_register_device(ced); +} + +int sh_cmt_register(struct sh_cmt_priv *p, char *name, + unsigned long clockevent_rating, + unsigned long clocksource_rating) +{ + if (p->width == (sizeof(p->max_match_value) * 8)) + p->max_match_value = ~0; + else + p->max_match_value = (1 << p->width) - 1; + + p->match_value = p->max_match_value; + spin_lock_init(&p->lock); + + if (clockevent_rating) + sh_cmt_register_clockevent(p, name, clockevent_rating); + + return 0; +} + +static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev) +{ + struct sh_cmt_config *cfg = pdev->dev.platform_data; + struct resource *res; + int irq, ret; + ret = -ENXIO; + + memset(p, 0, sizeof(*p)); + p->pdev = pdev; + + if (!cfg) { + dev_err(&p->pdev->dev, "missing platform data\n"); + goto err0; + } + + platform_set_drvdata(pdev, p); + + res = platform_get_resource(p->pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&p->pdev->dev, "failed to get I/O memory\n"); + goto err0; + } + + irq = platform_get_irq(p->pdev, 0); + if (irq < 0) { + dev_err(&p->pdev->dev, "failed to get irq\n"); + goto err0; + } + + /* map memory, let mapbase point to our channel */ + p->mapbase = ioremap_nocache(res->start, resource_size(res)); + if (p->mapbase == NULL) { + pr_err("sh_cmt: failed to remap I/O memory\n"); + goto err0; + } + + /* request irq using setup_irq() (too early for request_irq()) */ + p->irqaction.name = cfg->name; + p->irqaction.handler = sh_cmt_interrupt; + p->irqaction.dev_id = p; + p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL; + p->irqaction.mask = CPU_MASK_NONE; + ret = setup_irq(irq, &p->irqaction); + if (ret) { + pr_err("sh_cmt: failed to request irq %d\n", irq); + goto err1; + } + + /* get hold of clock */ + p->clk = clk_get(&p->pdev->dev, cfg->clk); + if (IS_ERR(p->clk)) { + 
pr_err("sh_cmt: cannot get clock \"%s\"\n", cfg->clk); + ret = PTR_ERR(p->clk); + goto err2; + } + + if (resource_size(res) == 6) { + p->width = 16; + p->overflow_bit = 0x80; + p->clear_bits = ~0xc0; + } else { + p->width = 32; + p->overflow_bit = 0x8000; + p->clear_bits = ~0xc000; + } + + return sh_cmt_register(p, cfg->name, + cfg->clockevent_rating, + cfg->clocksource_rating); + err2: + free_irq(irq, p); + err1: + iounmap(p->mapbase); + err0: + return ret; +} + +static int __devinit sh_cmt_probe(struct platform_device *pdev) +{ + struct sh_cmt_priv *p = platform_get_drvdata(pdev); + int ret; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) { + dev_err(&pdev->dev, "failed to allocate driver data\n"); + return -ENOMEM; + } + + ret = sh_cmt_setup(p, pdev); + if (ret) { + kfree(p); + + platform_set_drvdata(pdev, NULL); + } + return ret; +} + +static int __devexit sh_cmt_remove(struct platform_device *pdev) +{ + return -EBUSY; /* cannot unregister clockevent and clocksource */ +} + +static struct platform_driver sh_cmt_device_driver = { + .probe = sh_cmt_probe, + .remove = __devexit_p(sh_cmt_remove), + .driver = { + .name = "sh_cmt", + } +}; + +static int __init sh_cmt_init(void) +{ + return platform_driver_register(&sh_cmt_device_driver); +} + +static void __exit sh_cmt_exit(void) +{ + platform_driver_unregister(&sh_cmt_device_driver); +} + +module_init(sh_cmt_init); +module_exit(sh_cmt_exit); + +MODULE_AUTHOR("Magnus Damm"); +MODULE_DESCRIPTION("SuperH CMT Timer Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/sh_cmt.h b/include/linux/sh_cmt.h new file mode 100644 index 000000000000..68cacde5954f --- /dev/null +++ b/include/linux/sh_cmt.h @@ -0,0 +1,13 @@ +#ifndef __SH_CMT_H__ +#define __SH_CMT_H__ + +struct sh_cmt_config { + char *name; + unsigned long channel_offset; + int timer_bit; + char *clk; + unsigned long clockevent_rating; + unsigned long clocksource_rating; +}; + +#endif /* __SH_CMT_H__ */ -- cgit v1.2.3-71-gd317 From d7ab5c46ae2743079a40bb4060e510418c0842b4 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 28 Jan 2009 21:53:18 +0900 Subject: intel-iommu: make dma mapping functions static The dma ops unification enables X86 and IA64 to share intel_dma_ops so we can make dma mapping functions static. This also remove unused intel_map_single(). 
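As a reminder of what this means for callers: with intel_map_single() and friends no longer exported, drivers reach the IOMMU only through the generic DMA API, which dispatches to the now-static intel_dma_ops behind the scenes. A minimal sketch (the device, buffer and direction are hypothetical):

#include <linux/pci.h>
#include <linux/dma-mapping.h>

static int example_map_buffer(struct pci_dev *pdev, void *buf, size_t len)
{
	dma_addr_t handle;

	/* On a VT-d system this ends up in intel_map_page()/__intel_map_single() */
	handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(&pdev->dev, handle))
		return -ENOMEM;

	/* ... program 'handle' into the hardware, wait for completion ... */

	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
	return 0;
}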
Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- drivers/pci/intel-iommu.c | 29 +++++++++++------------------ include/linux/intel-iommu.h | 9 --------- 2 files changed, 11 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 59de56304aa6..628f8b72e530 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2283,13 +2283,6 @@ static dma_addr_t intel_map_page(struct device *dev, struct page *page, dir, to_pci_dev(dev)->dma_mask); } -dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr, - size_t size, int dir) -{ - return __intel_map_single(hwdev, paddr, size, dir, - to_pci_dev(hwdev)->dma_mask); -} - static void flush_unmaps(void) { int i, j; @@ -2397,14 +2390,14 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, } } -void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, - int dir) +static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, + int dir) { intel_unmap_page(dev, dev_addr, size, dir, NULL); } -void *intel_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) +static void *intel_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) { void *vaddr; int order; @@ -2427,8 +2420,8 @@ void *intel_alloc_coherent(struct device *hwdev, size_t size, return NULL; } -void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dma_handle) +static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, + dma_addr_t dma_handle) { int order; @@ -2441,9 +2434,9 @@ void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg))) -void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir, - struct dma_attrs *attrs) +static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) { int i; struct pci_dev *pdev = to_pci_dev(hwdev); @@ -2500,8 +2493,8 @@ static int intel_nontranslate_map_sg(struct device *hddev, return nelems; } -int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, - enum dma_data_direction dir, struct dma_attrs *attrs) +static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, + enum dma_data_direction dir, struct dma_attrs *attrs) { void *addr; int i; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a254db1decd0..43412aeddb53 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -330,13 +330,4 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); -extern void *intel_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t); -extern void intel_free_coherent(struct device *, size_t, void *, dma_addr_t); -extern dma_addr_t intel_map_single(struct device *, phys_addr_t, size_t, int); -extern void intel_unmap_single(struct device *, dma_addr_t, size_t, int); -extern int intel_map_sg(struct device *, struct scatterlist *, int, - enum dma_data_direction, struct dma_attrs *); -extern void intel_unmap_sg(struct device *, struct scatterlist *, int, - enum dma_data_direction, struct dma_attrs *); - #endif -- cgit v1.2.3-71-gd317 From 9ee677c2276bfcbcf68042ec2718a504af0c5fd7 Mon Sep 17 00:00:00 
2001 From: David Kilroy Date: Tue, 23 Dec 2008 14:03:38 +0000 Subject: wireless: Add channel/frequency conversions to ieee80211.h Added mappings for FHSS, DSSS and OFDM channels - with macros to point HR DSSS and ERP to the DSSS mappings. Currently just static inline functions. Use the new functions in the older fullmac drivers. This eliminates a number of const static buffers and removes a couple of range checks that are now redundant. Signed-off-by: David Kilroy Acked-by: Arnaldo Carvalho de Melo Acked-by: Richard Farina Acked-by: Jeroen Vreeken Signed-off-by: John W. Linville --- drivers/net/wireless/airo.c | 25 ++----- drivers/net/wireless/atmel.c | 20 +++--- drivers/net/wireless/orinoco/orinoco.c | 33 ++++------ drivers/net/wireless/rndis_wlan.c | 13 ++-- drivers/net/wireless/wl3501_cs.c | 9 +-- drivers/net/wireless/zd1201.c | 7 +- include/linux/ieee80211.h | 116 +++++++++++++++++++++++++++++++++ 7 files changed, 155 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index fc4322ca669f..2ff588bb0a7c 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -1070,10 +1070,6 @@ static WifiCtlHdr wifictlhdr8023 = { } }; -// Frequency list (map channels to frequencies) -static const long frequency_list[] = { 2412, 2417, 2422, 2427, 2432, 2437, 2442, - 2447, 2452, 2457, 2462, 2467, 2472, 2484 }; - // A few details needed for WEP (Wireless Equivalent Privacy) #define MAX_KEY_SIZE 13 // 128 (?) bits #define MIN_KEY_SIZE 5 // 40 bits RC4 - WEP @@ -5725,16 +5721,12 @@ static int airo_set_freq(struct net_device *dev, int rc = -EINPROGRESS; /* Call commit handler */ /* If setting by frequency, convert to a channel */ - if((fwrq->e == 1) && - (fwrq->m >= (int) 2.412e8) && - (fwrq->m <= (int) 2.487e8)) { + if(fwrq->e == 1) { int f = fwrq->m / 100000; - int c = 0; - while((c < 14) && (f != frequency_list[c])) - c++; + /* Hack to fall through... */ fwrq->e = 0; - fwrq->m = c + 1; + fwrq->m = ieee80211_freq_to_dsss_chan(f); } /* Setting by channel number */ if((fwrq->m > 1000) || (fwrq->e > 0)) @@ -5778,7 +5770,7 @@ static int airo_get_freq(struct net_device *dev, ch = le16_to_cpu(status_rid.channel); if((ch > 0) && (ch < 15)) { - fwrq->m = frequency_list[ch - 1] * 100000; + fwrq->m = ieee80211_dsss_chan_to_freq(ch) * 100000; fwrq->e = 1; } else { fwrq->m = ch; @@ -6795,8 +6787,8 @@ static int airo_get_range(struct net_device *dev, k = 0; for(i = 0; i < 14; i++) { range->freq[k].i = i + 1; /* List index */ - range->freq[k].m = frequency_list[i] * 100000; - range->freq[k++].e = 1; /* Values in table in MHz -> * 10^5 * 10 */ + range->freq[k].m = ieee80211_dsss_chan_to_freq(i + 1) * 100000; + range->freq[k++].e = 1; /* Values in MHz -> * 10^5 * 10 */ } range->num_frequency = k; @@ -7189,10 +7181,7 @@ static inline char *airo_translate_scan(struct net_device *dev, /* Add frequency */ iwe.cmd = SIOCGIWFREQ; iwe.u.freq.m = le16_to_cpu(bss->dsChannel); - /* iwe.u.freq.m containt the channel (starting 1), our - * frequency_list array start at index 0... 
- */ - iwe.u.freq.m = frequency_list[iwe.u.freq.m - 1] * 100000; + iwe.u.freq.m = ieee80211_dsss_chan_to_freq(iwe.u.freq.m) * 100000; iwe.u.freq.e = 1; current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); diff --git a/drivers/net/wireless/atmel.c b/drivers/net/wireless/atmel.c index 4223672c4432..91930a2c3c6b 100644 --- a/drivers/net/wireless/atmel.c +++ b/drivers/net/wireless/atmel.c @@ -2204,9 +2204,6 @@ static int atmel_get_frag(struct net_device *dev, return 0; } -static const long frequency_list[] = { 2412, 2417, 2422, 2427, 2432, 2437, 2442, - 2447, 2452, 2457, 2462, 2467, 2472, 2484 }; - static int atmel_set_freq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *fwrq, @@ -2216,16 +2213,12 @@ static int atmel_set_freq(struct net_device *dev, int rc = -EINPROGRESS; /* Call commit handler */ /* If setting by frequency, convert to a channel */ - if ((fwrq->e == 1) && - (fwrq->m >= (int) 241200000) && - (fwrq->m <= (int) 248700000)) { + if (fwrq->e == 1) { int f = fwrq->m / 100000; - int c = 0; - while ((c < 14) && (f != frequency_list[c])) - c++; + /* Hack to fall through... */ fwrq->e = 0; - fwrq->m = c + 1; + fwrq->m = ieee80211_freq_to_dsss_chan(f); } /* Setting by channel number */ if ((fwrq->m > 1000) || (fwrq->e > 0)) @@ -2384,8 +2377,11 @@ static int atmel_get_range(struct net_device *dev, if (range->num_channels != 0) { for (k = 0, i = channel_table[j].min; i <= channel_table[j].max; i++) { range->freq[k].i = i; /* List index */ - range->freq[k].m = frequency_list[i - 1] * 100000; - range->freq[k++].e = 1; /* Values in table in MHz -> * 10^5 * 10 */ + + /* Values in MHz -> * 10^5 * 10 */ + range->freq[k].m = (ieee80211_dsss_chan_to_freq(i) * + 100000); + range->freq[k++].e = 1; } range->num_frequency = k; } diff --git a/drivers/net/wireless/orinoco/orinoco.c b/drivers/net/wireless/orinoco/orinoco.c index 45a04faa7818..beb4d1f8c184 100644 --- a/drivers/net/wireless/orinoco/orinoco.c +++ b/drivers/net/wireless/orinoco/orinoco.c @@ -178,12 +178,7 @@ static const struct ethtool_ops orinoco_ethtool_ops; /* Data tables */ /********************************************************************/ -/* The frequency of each channel in MHz */ -static const long channel_frequency[] = { - 2412, 2417, 2422, 2427, 2432, 2437, 2442, - 2447, 2452, 2457, 2462, 2467, 2472, 2484 -}; -#define NUM_CHANNELS ARRAY_SIZE(channel_frequency) +#define NUM_CHANNELS 14 /* This tables gives the actual meanings of the bitrate IDs returned * by the firmware. 
*/ @@ -3742,13 +3737,13 @@ static int orinoco_hw_get_essid(struct orinoco_private *priv, int *active, return err; } -static long orinoco_hw_get_freq(struct orinoco_private *priv) +static int orinoco_hw_get_freq(struct orinoco_private *priv) { hermes_t *hw = &priv->hw; int err = 0; u16 channel; - long freq = 0; + int freq = 0; unsigned long flags; if (orinoco_lock(priv, &flags) != 0) @@ -3771,7 +3766,7 @@ static long orinoco_hw_get_freq(struct orinoco_private *priv) goto out; } - freq = channel_frequency[channel-1] * 100000; + freq = ieee80211_dsss_chan_to_freq(channel); out: orinoco_unlock(priv, &flags); @@ -3998,7 +3993,8 @@ static int orinoco_ioctl_getiwrange(struct net_device *dev, for (i = 0; i < NUM_CHANNELS; i++) { if (priv->channel_mask & (1 << i)) { range->freq[k].i = i + 1; - range->freq[k].m = channel_frequency[i] * 100000; + range->freq[k].m = (ieee80211_dsss_chan_to_freq(i + 1) * + 100000); range->freq[k].e = 1; k++; } @@ -4346,16 +4342,15 @@ static int orinoco_ioctl_setfreq(struct net_device *dev, /* Setting by channel number */ chan = frq->m; } else { - /* Setting by frequency - search the table */ - int mult = 1; + /* Setting by frequency */ + int denom = 1; int i; + /* Calculate denominator to rescale to MHz */ for (i = 0; i < (6 - frq->e); i++) - mult *= 10; + denom *= 10; - for (i = 0; i < NUM_CHANNELS; i++) - if (frq->m == (channel_frequency[i] * mult)) - chan = i+1; + chan = ieee80211_freq_to_dsss_chan(frq->m / denom); } if ( (chan < 1) || (chan > NUM_CHANNELS) || @@ -4392,7 +4387,7 @@ static int orinoco_ioctl_getfreq(struct net_device *dev, return tmp; } - frq->m = tmp; + frq->m = tmp * 100000; frq->e = 1; return 0; @@ -5609,7 +5604,7 @@ static inline char *orinoco_translate_scan(struct net_device *dev, current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); - iwe.u.freq.m = channel_frequency[channel-1] * 100000; + iwe.u.freq.m = ieee80211_dsss_chan_to_freq(channel) * 100000; iwe.u.freq.e = 1; current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); @@ -5760,7 +5755,7 @@ static inline char *orinoco_translate_ext_scan(struct net_device *dev, current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); - iwe.u.freq.m = channel_frequency[channel-1] * 100000; + iwe.u.freq.m = ieee80211_dsss_chan_to_freq(channel) * 100000; iwe.u.freq.e = 1; current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index ed93ac41297f..105f214e21f4 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -369,9 +369,6 @@ struct rndis_wext_private { }; -static const int freq_chan[] = { 2412, 2417, 2422, 2427, 2432, 2437, 2442, - 2447, 2452, 2457, 2462, 2467, 2472, 2484 }; - static const int rates_80211g[8] = { 6, 9, 12, 18, 24, 36, 48, 54 }; static const int bcm4320_power_output[4] = { 25, 50, 75, 100 }; @@ -640,8 +637,8 @@ static void dsconfig_to_freq(unsigned int dsconfig, struct iw_freq *freq) static int freq_to_dsconfig(struct iw_freq *freq, unsigned int *dsconfig) { if (freq->m < 1000 && freq->e == 0) { - if (freq->m >= 1 && freq->m <= ARRAY_SIZE(freq_chan)) - *dsconfig = freq_chan[freq->m - 1] * 1000; + if (freq->m >= 1 && freq->m <= 14) + *dsconfig = ieee80211_dsss_chan_to_freq(freq->m) * 1000; else return -1; } else { @@ -1178,11 +1175,11 @@ static int rndis_iw_get_range(struct net_device *dev, range->throughput = 11 * 1000 * 1000 / 2; } - range->num_channels = 
ARRAY_SIZE(freq_chan); + range->num_channels = 14; - for (i = 0; i < ARRAY_SIZE(freq_chan) && i < IW_MAX_FREQUENCIES; i++) { + for (i = 0; (i < 14) && (i < IW_MAX_FREQUENCIES); i++) { range->freq[i].i = i + 1; - range->freq[i].m = freq_chan[i] * 100000; + range->freq[i].m = ieee80211_dsss_chan_to_freq(i + 1) * 100000; range->freq[i].e = 1; } range->num_frequency = i; diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c index c99a1b6b948f..c8d5c34e8ddf 100644 --- a/drivers/net/wireless/wl3501_cs.c +++ b/drivers/net/wireless/wl3501_cs.c @@ -44,6 +44,7 @@ #include #include #include +#include #include @@ -111,12 +112,6 @@ static void wl3501_release(struct pcmcia_device *link); */ static dev_info_t wl3501_dev_info = "wl3501_cs"; -static int wl3501_chan2freq[] = { - [0] = 2412, [1] = 2417, [2] = 2422, [3] = 2427, [4] = 2432, - [5] = 2437, [6] = 2442, [7] = 2447, [8] = 2452, [9] = 2457, - [10] = 2462, [11] = 2467, [12] = 2472, [13] = 2477, -}; - static const struct { int reg_domain; int min, max, deflt; @@ -1510,7 +1505,7 @@ static int wl3501_get_freq(struct net_device *dev, struct iw_request_info *info, { struct wl3501_card *this = netdev_priv(dev); - wrqu->freq.m = wl3501_chan2freq[this->chan - 1] * 100000; + wrqu->freq.m = ieee80211_dsss_chan_to_freq(this->chan) * 100000; wrqu->freq.e = 1; return 0; } diff --git a/drivers/net/wireless/zd1201.c b/drivers/net/wireless/zd1201.c index b45c27d42fd8..6226ac2357f8 100644 --- a/drivers/net/wireless/zd1201.c +++ b/drivers/net/wireless/zd1201.c @@ -919,10 +919,9 @@ static int zd1201_set_freq(struct net_device *dev, if (freq->e == 0) channel = freq->m; else { - if (freq->m >= 2482) - channel = 14; - if (freq->m >= 2407) - channel = (freq->m-2407)/5; + channel = ieee80211_freq_to_dsss_chan(freq->m); + if (channel < 0) + channel = 0; } err = zd1201_setconfig16(zd, ZD1201_RID_CNFOWNCHANNEL, channel); diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c4e6ca1a6306..cade2556af0e 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1185,4 +1185,120 @@ static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr) return hdr->addr1; } +/** + * ieee80211_fhss_chan_to_freq - get channel frequency + * @channel: the FHSS channel + * + * Convert IEEE802.11 FHSS channel to frequency (MHz) + * Ref IEEE 802.11-2007 section 14.6 + */ +static inline int ieee80211_fhss_chan_to_freq(int channel) +{ + if ((channel > 1) && (channel < 96)) + return channel + 2400; + else + return -1; +} + +/** + * ieee80211_freq_to_fhss_chan - get channel + * @freq: the channels frequency + * + * Convert frequency (MHz) to IEEE802.11 FHSS channel + * Ref IEEE 802.11-2007 section 14.6 + */ +static inline int ieee80211_freq_to_fhss_chan(int freq) +{ + if ((freq > 2401) && (freq < 2496)) + return freq - 2400; + else + return -1; +} + +/** + * ieee80211_dsss_chan_to_freq - get channel center frequency + * @channel: the DSSS channel + * + * Convert IEEE802.11 DSSS channel to the center frequency (MHz). + * Ref IEEE 802.11-2007 section 15.6 + */ +static inline int ieee80211_dsss_chan_to_freq(int channel) +{ + if ((channel > 0) && (channel < 14)) + return 2407 + (channel * 5); + else if (channel == 14) + return 2484; + else + return -1; +} + +/** + * ieee80211_freq_to_dsss_chan - get channel + * @freq: the frequency + * + * Convert frequency (MHz) to IEEE802.11 DSSS channel + * Ref IEEE 802.11-2007 section 15.6 + * + * This routine selects the channel with the closest center frequency. 
+ */ +static inline int ieee80211_freq_to_dsss_chan(int freq) +{ + if ((freq >= 2410) && (freq < 2475)) + return (freq - 2405) / 5; + else if ((freq >= 2482) && (freq < 2487)) + return 14; + else + return -1; +} + +/* Convert IEEE802.11 HR DSSS channel to frequency (MHz) and back + * Ref IEEE 802.11-2007 section 18.4.6.2 + * + * The channels and frequencies are the same as those defined for DSSS + */ +#define ieee80211_hr_chan_to_freq(chan) ieee80211_dsss_chan_to_freq(chan) +#define ieee80211_freq_to_hr_chan(freq) ieee80211_freq_to_dsss_chan(freq) + +/* Convert IEEE802.11 ERP channel to frequency (MHz) and back + * Ref IEEE 802.11-2007 section 19.4.2 + */ +#define ieee80211_erp_chan_to_freq(chan) ieee80211_hr_chan_to_freq(chan) +#define ieee80211_freq_to_erp_chan(freq) ieee80211_freq_to_hr_chan(freq) + +/** + * ieee80211_ofdm_chan_to_freq - get channel center frequency + * @s_freq: starting frequency == (dotChannelStartingFactor/2) MHz + * @channel: the OFDM channel + * + * Convert IEEE802.11 OFDM channel to center frequency (MHz) + * Ref IEEE 802.11-2007 section 17.3.8.3.2 + */ +static inline int ieee80211_ofdm_chan_to_freq(int s_freq, int channel) +{ + if ((channel > 0) && (channel <= 200) && + (s_freq >= 4000)) + return s_freq + (channel * 5); + else + return -1; +} + +/** + * ieee80211_freq_to_ofdm_channel - get channel + * @s_freq: starting frequency == (dotChannelStartingFactor/2) MHz + * @freq: the frequency + * + * Convert frequency (MHz) to IEEE802.11 OFDM channel + * Ref IEEE 802.11-2007 section 17.3.8.3.2 + * + * This routine selects the channel with the closest center frequency. + */ +static inline int ieee80211_freq_to_ofdm_chan(int s_freq, int freq) +{ + if ((freq > (s_freq + 2)) && (freq <= (s_freq + 1202)) && + (s_freq >= 4000)) + return (freq + 2 - s_freq) / 5; + else + return -1; +} + #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3-71-gd317 From 6b1c7c67603efdf0b39f6056989b0f8194cdc1f3 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Thu, 25 Dec 2008 00:39:28 +0100 Subject: b43/ssb: Add SPROM8 extraction and LP-PHY detection This adds detection code for the LP-PHY and SPROM extraction code for version 8, which is needed by the LP-PHY and newer N-PHY. Signed-off-by: Michael Buesch Signed-off-by: John W. 
Linville --- drivers/net/wireless/b43/main.c | 12 +++++++ drivers/ssb/b43_pci_bridge.c | 1 + drivers/ssb/pci.c | 74 ++++++++++++++++++++++++++++++++++------- include/linux/ssb/ssb_regs.h | 36 ++++++++++++++++++++ 4 files changed, 111 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c index 26e733a5a56e..c627bac87a40 100644 --- a/drivers/net/wireless/b43/main.c +++ b/drivers/net/wireless/b43/main.c @@ -97,6 +97,7 @@ static const struct ssb_device_id b43_ssb_tbl[] = { SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 10), SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 11), SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 13), + SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 15), SSB_DEVICE(SSB_VENDOR_BROADCOM, SSB_DEV_80211, 16), SSB_DEVTABLE_END }; @@ -3755,6 +3756,12 @@ static int b43_phy_versioning(struct b43_wldev *dev) if (phy_rev > 4) unsupported = 1; break; +#endif +#ifdef CONFIG_B43_PHY_LP + case B43_PHYTYPE_LP: + if (phy_rev > 1) + unsupported = 1; + break; #endif default: unsupported = 1; @@ -3808,6 +3815,10 @@ static int b43_phy_versioning(struct b43_wldev *dev) if (radio_ver != 0x2055 && radio_ver != 0x2056) unsupported = 1; break; + case B43_PHYTYPE_LP: + if (radio_ver != 0x2062) + unsupported = 1; + break; default: B43_WARN_ON(1); } @@ -4402,6 +4413,7 @@ static int b43_wireless_core_attach(struct b43_wldev *dev) break; case B43_PHYTYPE_G: case B43_PHYTYPE_N: + case B43_PHYTYPE_LP: have_2ghz_phy = 1; break; default: diff --git a/drivers/ssb/b43_pci_bridge.c b/drivers/ssb/b43_pci_bridge.c index 6433a7ed39f8..27a677584a4c 100644 --- a/drivers/ssb/b43_pci_bridge.c +++ b/drivers/ssb/b43_pci_bridge.c @@ -21,6 +21,7 @@ static const struct pci_device_id b43_pci_bridge_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4307) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4311) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4312) }, + { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4315) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4318) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4319) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4320) }, diff --git a/drivers/ssb/pci.c b/drivers/ssb/pci.c index d5cde051806b..c958ac16423c 100644 --- a/drivers/ssb/pci.c +++ b/drivers/ssb/pci.c @@ -467,6 +467,51 @@ static void sprom_extract_r45(struct ssb_sprom *out, const u16 *in) /* TODO - get remaining rev 4 stuff needed */ } +static void sprom_extract_r8(struct ssb_sprom *out, const u16 *in) +{ + int i; + u16 v; + + /* extract the MAC address */ + for (i = 0; i < 3; i++) { + v = in[SPOFF(SSB_SPROM1_IL0MAC) + i]; + *(((__be16 *)out->il0mac) + i) = cpu_to_be16(v); + } + SPEX(country_code, SSB_SPROM8_CCODE, 0xFFFF, 0); + SPEX(boardflags_lo, SSB_SPROM8_BFLLO, 0xFFFF, 0); + SPEX(boardflags_hi, SSB_SPROM8_BFLHI, 0xFFFF, 0); + SPEX(ant_available_a, SSB_SPROM8_ANTAVAIL, SSB_SPROM8_ANTAVAIL_A, + SSB_SPROM8_ANTAVAIL_A_SHIFT); + SPEX(ant_available_bg, SSB_SPROM8_ANTAVAIL, SSB_SPROM8_ANTAVAIL_BG, + SSB_SPROM8_ANTAVAIL_BG_SHIFT); + SPEX(maxpwr_bg, SSB_SPROM8_MAXP_BG, SSB_SPROM8_MAXP_BG_MASK, 0); + SPEX(itssi_bg, SSB_SPROM8_MAXP_BG, SSB_SPROM8_ITSSI_BG, + SSB_SPROM8_ITSSI_BG_SHIFT); + SPEX(maxpwr_a, SSB_SPROM8_MAXP_A, SSB_SPROM8_MAXP_A_MASK, 0); + SPEX(itssi_a, SSB_SPROM8_MAXP_A, SSB_SPROM8_ITSSI_A, + SSB_SPROM8_ITSSI_A_SHIFT); + SPEX(gpio0, SSB_SPROM8_GPIOA, SSB_SPROM8_GPIOA_P0, 0); + SPEX(gpio1, SSB_SPROM8_GPIOA, SSB_SPROM8_GPIOA_P1, + SSB_SPROM8_GPIOA_P1_SHIFT); + SPEX(gpio2, SSB_SPROM8_GPIOB, SSB_SPROM8_GPIOB_P2, 0); + SPEX(gpio3, SSB_SPROM8_GPIOB, 
SSB_SPROM8_GPIOB_P3, + SSB_SPROM8_GPIOB_P3_SHIFT); + + /* Extract the antenna gain values. */ + SPEX(antenna_gain.ghz24.a0, SSB_SPROM8_AGAIN01, + SSB_SPROM8_AGAIN0, SSB_SPROM8_AGAIN0_SHIFT); + SPEX(antenna_gain.ghz24.a1, SSB_SPROM8_AGAIN01, + SSB_SPROM8_AGAIN1, SSB_SPROM8_AGAIN1_SHIFT); + SPEX(antenna_gain.ghz24.a2, SSB_SPROM8_AGAIN23, + SSB_SPROM8_AGAIN2, SSB_SPROM8_AGAIN2_SHIFT); + SPEX(antenna_gain.ghz24.a3, SSB_SPROM8_AGAIN23, + SSB_SPROM8_AGAIN3, SSB_SPROM8_AGAIN3_SHIFT); + memcpy(&out->antenna_gain.ghz5, &out->antenna_gain.ghz24, + sizeof(out->antenna_gain.ghz5)); + + /* TODO - get remaining rev 8 stuff needed */ +} + static int sprom_extract(struct ssb_bus *bus, struct ssb_sprom *out, const u16 *in, u16 size) { @@ -487,15 +532,25 @@ static int sprom_extract(struct ssb_bus *bus, struct ssb_sprom *out, out->revision = 4; sprom_extract_r45(out, in); } else { - if (out->revision == 0) - goto unsupported; - if (out->revision >= 1 && out->revision <= 3) { + switch (out->revision) { + case 1: + case 2: + case 3: sprom_extract_r123(out, in); - } - if (out->revision == 4 || out->revision == 5) + break; + case 4: + case 5: sprom_extract_r45(out, in); - if (out->revision > 5) - goto unsupported; + break; + case 8: + sprom_extract_r8(out, in); + break; + default: + ssb_printk(KERN_WARNING PFX "Unsupported SPROM" + " revision %d detected. Will extract" + " v1\n", out->revision); + sprom_extract_r123(out, in); + } } if (out->boardflags_lo == 0xFFFF) @@ -504,11 +559,6 @@ static int sprom_extract(struct ssb_bus *bus, struct ssb_sprom *out, out->boardflags_hi = 0; /* per specs */ return 0; -unsupported: - ssb_printk(KERN_WARNING PFX "Unsupported SPROM revision %d " - "detected. Will extract v1\n", out->revision); - sprom_extract_r123(out, in); - return 0; } static int ssb_pci_sprom_get(struct ssb_bus *bus, diff --git a/include/linux/ssb/ssb_regs.h b/include/linux/ssb/ssb_regs.h index 99a0f991e850..a01b982b5783 100644 --- a/include/linux/ssb/ssb_regs.h +++ b/include/linux/ssb/ssb_regs.h @@ -326,6 +326,42 @@ #define SSB_SPROM5_GPIOB_P3 0xFF00 /* Pin 3 */ #define SSB_SPROM5_GPIOB_P3_SHIFT 8 +/* SPROM Revision 8 */ +#define SSB_SPROM8_BFLLO 0x1084 /* Boardflags (low 16 bits) */ +#define SSB_SPROM8_BFLHI 0x1086 /* Boardflags Hi */ +#define SSB_SPROM8_IL0MAC 0x108C /* 6 byte MAC address */ +#define SSB_SPROM8_CCODE 0x1092 /* 2 byte country code */ +#define SSB_SPROM8_ANTAVAIL 0x109C /* Antenna available bitfields*/ +#define SSB_SPROM8_ANTAVAIL_A 0xFF00 /* A-PHY bitfield */ +#define SSB_SPROM8_ANTAVAIL_A_SHIFT 8 +#define SSB_SPROM8_ANTAVAIL_BG 0x00FF /* B-PHY and G-PHY bitfield */ +#define SSB_SPROM8_ANTAVAIL_BG_SHIFT 0 +#define SSB_SPROM8_AGAIN01 0x109E /* Antenna Gain (in dBm Q5.2) */ +#define SSB_SPROM8_AGAIN0 0x00FF /* Antenna 0 */ +#define SSB_SPROM8_AGAIN0_SHIFT 0 +#define SSB_SPROM8_AGAIN1 0xFF00 /* Antenna 1 */ +#define SSB_SPROM8_AGAIN1_SHIFT 8 +#define SSB_SPROM8_AGAIN23 0x10A0 +#define SSB_SPROM8_AGAIN2 0x00FF /* Antenna 2 */ +#define SSB_SPROM8_AGAIN2_SHIFT 0 +#define SSB_SPROM8_AGAIN3 0xFF00 /* Antenna 3 */ +#define SSB_SPROM8_AGAIN3_SHIFT 8 +#define SSB_SPROM8_GPIOA 0x1096 /*Gen. Purpose IO # 0 and 1 */ +#define SSB_SPROM8_GPIOA_P0 0x00FF /* Pin 0 */ +#define SSB_SPROM8_GPIOA_P1 0xFF00 /* Pin 1 */ +#define SSB_SPROM8_GPIOA_P1_SHIFT 8 +#define SSB_SPROM8_GPIOB 0x1098 /* Gen. 
Purpose IO # 2 and 3 */ +#define SSB_SPROM8_GPIOB_P2 0x00FF /* Pin 2 */ +#define SSB_SPROM8_GPIOB_P3 0xFF00 /* Pin 3 */ +#define SSB_SPROM8_GPIOB_P3_SHIFT 8 +#define SSB_SPROM8_MAXP_BG 0x10C0 /* Max Power BG in path 1 */ +#define SSB_SPROM8_MAXP_BG_MASK 0x00FF /* Mask for Max Power BG */ +#define SSB_SPROM8_ITSSI_BG 0xFF00 /* Mask for path 1 itssi_bg */ +#define SSB_SPROM8_ITSSI_BG_SHIFT 8 +#define SSB_SPROM8_MAXP_A 0x10C8 /* Max Power A in path 1 */ +#define SSB_SPROM8_MAXP_A_MASK 0x00FF /* Mask for Max Power A */ +#define SSB_SPROM8_ITSSI_A 0xFF00 /* Mask for path 1 itssi_a */ +#define SSB_SPROM8_ITSSI_A_SHIFT 8 /* Values for SSB_SPROM1_BINF_CCODE */ enum { -- cgit v1.2.3-71-gd317 From 63649b6cf0a964582af2b4d4734e28ca90ec8f5c Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Thu, 1 Jan 2009 15:01:44 -0500 Subject: ath5k: support LEDs on Acer Aspire One netbook Add vendor ID for Foxconn and use it to set the ath5k LED gpio and polarity for Acer branded laptops. base.c: Changes-licensed-under: 3-Clause-BSD Reported-by: Maxim Levitsky Tested-by: Maxim Levitsky Tested-by: Andreas Mohr Signed-off-by: Bob Copeland Signed-off-by: John W. Linville --- drivers/net/wireless/ath5k/base.c | 7 +++++++ include/linux/pci_ids.h | 2 ++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath5k/base.c b/drivers/net/wireless/ath5k/base.c index 88618645a7e4..fdf7733e76e1 100644 --- a/drivers/net/wireless/ath5k/base.c +++ b/drivers/net/wireless/ath5k/base.c @@ -2596,6 +2596,13 @@ ath5k_init_leds(struct ath5k_softc *sc) sc->led_pin = 1; sc->led_on = 1; /* active high */ } + /* Pin 3 on Foxconn chips used in Acer Aspire One (0x105b:e008) */ + if (pdev->subsystem_vendor == PCI_VENDOR_ID_FOXCONN) { + __set_bit(ATH_STAT_LEDSOFT, sc->status); + sc->led_pin = 3; + sc->led_on = 0; /* active low */ + } + if (!test_bit(ATH_STAT_LEDSOFT, sc->status)) goto out; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 302423afa136..5b7a48c1d616 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -834,6 +834,8 @@ #define PCI_DEVICE_ID_PROMISE_20276 0x5275 #define PCI_DEVICE_ID_PROMISE_20277 0x7275 +#define PCI_VENDOR_ID_FOXCONN 0x105b + #define PCI_VENDOR_ID_UMC 0x1060 #define PCI_DEVICE_ID_UMC_UM8673F 0x0101 #define PCI_DEVICE_ID_UMC_UM8886BF 0x673a -- cgit v1.2.3-71-gd317 From 5394af4d86ae51b369ff243c3f75b6f9a74e164b Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 8 Jan 2009 13:31:59 +0200 Subject: mac80211: 802.11w - STA flag for MFP Add flags for setting STA entries and struct ieee80211_if_sta to indicate whether management frame protection (MFP) is used. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/nl80211.h | 2 ++ include/net/cfg80211.h | 2 ++ net/mac80211/cfg.c | 4 ++++ net/mac80211/debugfs_sta.c | 5 +++-- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 7 +++++-- net/mac80211/sta_info.h | 2 ++ 7 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index e86ed59f9ad5..218f0e73a7ae 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -412,12 +412,14 @@ enum nl80211_iftype { * @NL80211_STA_FLAG_SHORT_PREAMBLE: station is capable of receiving frames * with short barker preamble * @NL80211_STA_FLAG_WME: station is WME/QoS capable + * @NL80211_STA_FLAG_MFP: station uses management frame protection */ enum nl80211_sta_flags { __NL80211_STA_FLAG_INVALID, NL80211_STA_FLAG_AUTHORIZED, NL80211_STA_FLAG_SHORT_PREAMBLE, NL80211_STA_FLAG_WME, + NL80211_STA_FLAG_MFP, /* keep last */ __NL80211_STA_FLAG_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 23c0ab74ded6..6619ed106134 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -112,12 +112,14 @@ struct beacon_parameters { * @STATION_FLAG_SHORT_PREAMBLE: station is capable of receiving frames * with short preambles * @STATION_FLAG_WME: station is WME/QoS capable + * @STATION_FLAG_MFP: station uses management frame protection */ enum station_flags { STATION_FLAG_CHANGED = 1<<0, STATION_FLAG_AUTHORIZED = 1<flags &= ~WLAN_STA_WME; if (params->station_flags & STATION_FLAG_WME) sta->flags |= WLAN_STA_WME; + + sta->flags &= ~WLAN_STA_MFP; + if (params->station_flags & STATION_FLAG_MFP) + sta->flags |= WLAN_STA_MFP; spin_unlock_bh(&sta->lock); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index a2fbe0131312..90230c718b5b 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -67,14 +67,15 @@ static ssize_t sta_flags_read(struct file *file, char __user *userbuf, char buf[100]; struct sta_info *sta = file->private_data; u32 staflags = get_sta_flags(sta); - int res = scnprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s", + int res = scnprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s", staflags & WLAN_STA_AUTH ? "AUTH\n" : "", staflags & WLAN_STA_ASSOC ? "ASSOC\n" : "", staflags & WLAN_STA_PS ? "PS\n" : "", staflags & WLAN_STA_AUTHORIZED ? "AUTHORIZED\n" : "", staflags & WLAN_STA_SHORT_PREAMBLE ? "SHORT PREAMBLE\n" : "", staflags & WLAN_STA_WME ? "WME\n" : "", - staflags & WLAN_STA_WDS ? "WDS\n" : ""); + staflags & WLAN_STA_WDS ? "WDS\n" : "", + staflags & WLAN_STA_MFP ? "MFP\n" : ""); return simple_read_from_buffer(userbuf, count, ppos, buf, res); } STA_OPS(flags); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3db6bc3cdaf2..b5f86cb17630 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -260,6 +260,7 @@ struct mesh_preq_queue { #define IEEE80211_STA_PRIVACY_INVOKED BIT(13) #define IEEE80211_STA_TKIP_WEP_USED BIT(14) #define IEEE80211_STA_CSA_RECEIVED BIT(15) +#define IEEE80211_STA_MFP_ENABLED BIT(16) /* flags for MLME request */ #define IEEE80211_STA_REQ_SCAN 0 #define IEEE80211_STA_REQ_DIRECT_PROBE 1 diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a1e683e305f0..bc8a7f1a6a15 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1,6 +1,6 @@ /* * BSS client mode implementation - * Copyright 2003, Jouni Malinen + * Copyright 2003-2008, Jouni Malinen * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc @@ -472,7 +472,7 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, /* u.deauth.reason_code == u.disassoc.reason_code */ mgmt->u.deauth.reason_code = cpu_to_le16(reason); - ieee80211_tx_skb(sdata, skb, 0); + ieee80211_tx_skb(sdata, skb, ifsta->flags & IEEE80211_STA_MFP_ENABLED); } /* MLME */ @@ -1408,6 +1408,9 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, rate_control_rate_init(sta); + if (ifsta->flags & IEEE80211_STA_MFP_ENABLED) + set_sta_flags(sta, WLAN_STA_MFP); + if (elems.wmm_param) set_sta_flags(sta, WLAN_STA_WME); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index b683d3f5ef8a..d13a44b935e2 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -34,6 +34,7 @@ * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the * IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next * frame to this station is transmitted. + * @WLAN_STA_MFP: Management frame protection is used with this STA. */ enum ieee80211_sta_info_flags { WLAN_STA_AUTH = 1<<0, @@ -46,6 +47,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_WDS = 1<<7, WLAN_STA_PSPOLL = 1<<8, WLAN_STA_CLEAR_PS_FILT = 1<<9, + WLAN_STA_MFP = 1<<10, }; #define STA_TID_NUM 16 -- cgit v1.2.3-71-gd317 From fb7333367632c67d8b6b06fb8d906cdabb11b02a Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 8 Jan 2009 13:32:00 +0200 Subject: mac80211: 802.11w - CCMP for management frames Extend CCMP to support encryption and decryption of unicast management frames. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 30 ++++++++++++++++++++++++++++++ net/mac80211/tx.c | 23 ++++++++++++++++++++++- net/mac80211/wpa.c | 18 ++++++++++++------ 3 files changed, 64 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index cade2556af0e..d5165895f316 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1030,6 +1030,7 @@ enum ieee80211_category { WLAN_CATEGORY_QOS = 1, WLAN_CATEGORY_DLS = 2, WLAN_CATEGORY_BACK = 3, + WLAN_CATEGORY_PUBLIC = 4, WLAN_CATEGORY_WMM = 17, }; @@ -1185,6 +1186,35 @@ static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr) return hdr->addr1; } +/** + * ieee80211_is_robust_mgmt_frame - check if frame is a robust management frame + * @hdr: the frame (buffer must include at least the first octet of payload) + */ +static inline bool ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr) +{ + if (ieee80211_is_disassoc(hdr->frame_control) || + ieee80211_is_deauth(hdr->frame_control)) + return true; + + if (ieee80211_is_action(hdr->frame_control)) { + u8 *category; + + /* + * Action frames, excluding Public Action frames, are Robust + * Management Frames. However, if we are looking at a Protected + * frame, skip the check since the data may be encrypted and + * the frame has already been found to be a Robust Management + * Frame (by the other end). 
+ */ + if (ieee80211_has_protected(hdr->frame_control)) + return true; + category = ((u8 *) hdr) + 24; + return *category != WLAN_CATEGORY_PUBLIC; + } + + return false; +} + /** * ieee80211_fhss_chan_to_freq - get channel frequency * @channel: the FHSS channel diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index cd6bc87eec73..50c6c4fabea5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -330,6 +330,22 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) return TX_CONTINUE; } +static int ieee80211_use_mfp(__le16 fc, struct sta_info *sta, + struct sk_buff *skb) +{ + if (!ieee80211_is_mgmt(fc)) + return 0; + + if (sta == NULL || !test_sta_flags(sta, WLAN_STA_MFP)) + return 0; + + if (!ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *) + skb->data)) + return 0; + + return 1; +} + static ieee80211_tx_result ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) { @@ -428,10 +444,15 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (ieee80211_is_auth(hdr->frame_control)) break; case ALG_TKIP: - case ALG_CCMP: if (!ieee80211_is_data_present(hdr->frame_control)) tx->key = NULL; break; + case ALG_CCMP: + if (!ieee80211_is_data_present(hdr->frame_control) && + !ieee80211_use_mfp(hdr->frame_control, tx->sta, + tx->skb)) + tx->key = NULL; + break; } } diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 7aa63caf8d50..aff46adde3f0 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -266,7 +266,7 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *scratch, int encrypted) { __le16 mask_fc; - int a4_included; + int a4_included, mgmt; u8 qos_tid; u8 *b_0, *aad; u16 data_len, len_a; @@ -277,12 +277,15 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *scratch, aad = scratch + 4 * AES_BLOCK_LEN; /* - * Mask FC: zero subtype b4 b5 b6 + * Mask FC: zero subtype b4 b5 b6 (if not mgmt) * Retry, PwrMgt, MoreData; set Protected */ + mgmt = ieee80211_is_mgmt(hdr->frame_control); mask_fc = hdr->frame_control; - mask_fc &= ~cpu_to_le16(0x0070 | IEEE80211_FCTL_RETRY | + mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY | IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA); + if (!mgmt) + mask_fc &= ~cpu_to_le16(0x0070); mask_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -300,8 +303,10 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *scratch, /* First block, b_0 */ b_0[0] = 0x59; /* flags: Adata: 1, M: 011, L: 001 */ - /* Nonce: QoS Priority | A2 | PN */ - b_0[1] = qos_tid; + /* Nonce: Nonce Flags | A2 | PN + * Nonce Flags: Priority (b0..b3) | Management (b4) | Reserved (b5..b7) + */ + b_0[1] = qos_tid | (mgmt << 4); memcpy(&b_0[2], hdr->addr2, ETH_ALEN); memcpy(&b_0[8], pn, CCMP_PN_LEN); /* l(m) */ @@ -446,7 +451,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(hdr->frame_control); - if (!ieee80211_is_data(hdr->frame_control)) + if (!ieee80211_is_data(hdr->frame_control) && + !ieee80211_is_robust_mgmt_frame(hdr)) return RX_CONTINUE; data_len = skb->len - hdrlen - CCMP_HDR_LEN - CCMP_MIC_LEN; -- cgit v1.2.3-71-gd317 From 765cb46a3fc856245ea68a7c961ac87c77e4ae2d Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 8 Jan 2009 13:32:01 +0200 Subject: mac80211: 802.11w - Add BIP (AES-128-CMAC) Implement Broadcast/Multicast Integrity Protocol for management frame protection. 
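(A rough usage sketch, ahead of the actual definitions below: the three aes_cmac helpers introduced by this patch are meant to be combined roughly as follows. The buffer sizes mirror the AAD_LEN/CMAC_TLEN/scratch constants in aes_cmac.c; the key, AAD and frame variables are purely illustrative, and the real integration is the BIP code added to wpa.c.)

#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/crypto.h>
#include "aes_cmac.h"

static int example_bip_mic(const u8 igtk[16], const u8 aad[20],
			   u8 *frame_body, size_t body_len_incl_mic)
{
	struct crypto_cipher *tfm;
	u8 scratch[2 * 16];	/* two AES blocks used internally by the helper */
	u8 mic[8];

	tfm = ieee80211_aes_cmac_key_setup(igtk);
	if (!tfm)
		return -ENOMEM;

	/*
	 * The MIC is computed over AAD || frame body, with the 8-byte MIC
	 * field at the end of the body treated as zeros by the helper.
	 */
	ieee80211_aes_cmac(tfm, scratch, aad, frame_body,
			   body_len_incl_mic, mic);
	memcpy(frame_body + body_len_incl_mic - sizeof(mic), mic, sizeof(mic));

	ieee80211_aes_cmac_key_free(tfm);
	return 0;
}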
This patch adds the needed definitions for the new information element (MMIE) and implementation for the new "encryption" type (though, BIP is actually not encrypting data, it provides only integrity protection). These routines will be used by a follow-on patch that enables BIP for multicast/broadcast robust management frames. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 10 ++++ net/mac80211/Makefile | 1 + net/mac80211/aes_cmac.c | 135 +++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/aes_cmac.h | 19 +++++++ net/mac80211/ieee80211_i.h | 2 +- net/mac80211/key.h | 10 ++++ net/mac80211/wpa.c | 125 +++++++++++++++++++++++++++++++++++++++++ net/mac80211/wpa.h | 5 ++ 8 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 net/mac80211/aes_cmac.c create mode 100644 net/mac80211/aes_cmac.h (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d5165895f316..cceb9e86c744 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -655,6 +655,15 @@ struct ieee80211_mgmt { #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) +/* Management MIC information element (IEEE 802.11w) */ +struct ieee80211_mmie { + u8 element_id; + u8 length; + __le16 key_id; + u8 sequence_number[6]; + u8 mic[8]; +} __attribute__ ((packed)); + /* Control frames */ struct ieee80211_rts { __le16 frame_control; @@ -1018,6 +1027,7 @@ enum ieee80211_eid { WLAN_EID_HT_INFORMATION = 61, /* 802.11i */ WLAN_EID_RSN = 48, + WLAN_EID_MMIE = 76 /* 802.11w */, WLAN_EID_WPA = 221, WLAN_EID_GENERIC = 221, WLAN_EID_VENDOR_SPECIFIC = 221, diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 7d4971aa443f..5c6fadfb6a00 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -15,6 +15,7 @@ mac80211-y := \ michael.o \ tkip.o \ aes_ccm.o \ + aes_cmac.o \ cfg.o \ rx.o \ spectmgmt.o \ diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c new file mode 100644 index 000000000000..3d097b3d7b62 --- /dev/null +++ b/net/mac80211/aes_cmac.c @@ -0,0 +1,135 @@ +/* + * AES-128-CMAC with TLen 16 for IEEE 802.11w BIP + * Copyright 2008, Jouni Malinen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include "key.h" +#include "aes_cmac.h" + +#define AES_BLOCK_SIZE 16 +#define AES_CMAC_KEY_LEN 16 +#define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */ +#define AAD_LEN 20 + + +static void gf_mulx(u8 *pad) +{ + int i, carry; + + carry = pad[0] & 0x80; + for (i = 0; i < AES_BLOCK_SIZE - 1; i++) + pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7); + pad[AES_BLOCK_SIZE - 1] <<= 1; + if (carry) + pad[AES_BLOCK_SIZE - 1] ^= 0x87; +} + + +static void aes_128_cmac_vector(struct crypto_cipher *tfm, u8 *scratch, + size_t num_elem, + const u8 *addr[], const size_t *len, u8 *mac) +{ + u8 *cbc, *pad; + const u8 *pos, *end; + size_t i, e, left, total_len; + + cbc = scratch; + pad = scratch + AES_BLOCK_SIZE; + + memset(cbc, 0, AES_BLOCK_SIZE); + + total_len = 0; + for (e = 0; e < num_elem; e++) + total_len += len[e]; + left = total_len; + + e = 0; + pos = addr[0]; + end = pos + len[0]; + + while (left >= AES_BLOCK_SIZE) { + for (i = 0; i < AES_BLOCK_SIZE; i++) { + cbc[i] ^= *pos++; + if (pos >= end) { + e++; + pos = addr[e]; + end = pos + len[e]; + } + } + if (left > AES_BLOCK_SIZE) + crypto_cipher_encrypt_one(tfm, cbc, cbc); + left -= AES_BLOCK_SIZE; + } + + memset(pad, 0, AES_BLOCK_SIZE); + crypto_cipher_encrypt_one(tfm, pad, pad); + gf_mulx(pad); + + if (left || total_len == 0) { + for (i = 0; i < left; i++) { + cbc[i] ^= *pos++; + if (pos >= end) { + e++; + pos = addr[e]; + end = pos + len[e]; + } + } + cbc[left] ^= 0x80; + gf_mulx(pad); + } + + for (i = 0; i < AES_BLOCK_SIZE; i++) + pad[i] ^= cbc[i]; + crypto_cipher_encrypt_one(tfm, pad, pad); + memcpy(mac, pad, CMAC_TLEN); +} + + +void ieee80211_aes_cmac(struct crypto_cipher *tfm, u8 *scratch, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic) +{ + const u8 *addr[3]; + size_t len[3]; + u8 zero[CMAC_TLEN]; + + memset(zero, 0, CMAC_TLEN); + addr[0] = aad; + len[0] = AAD_LEN; + addr[1] = data; + len[1] = data_len - CMAC_TLEN; + addr[2] = zero; + len[2] = CMAC_TLEN; + + aes_128_cmac_vector(tfm, scratch, 3, addr, len, mic); +} + + +struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]) +{ + struct crypto_cipher *tfm; + + tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return NULL; + + crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN); + + return tfm; +} + + +void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm) +{ + if (tfm) + crypto_free_cipher(tfm); +} diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h new file mode 100644 index 000000000000..0eb9a4831508 --- /dev/null +++ b/net/mac80211/aes_cmac.h @@ -0,0 +1,19 @@ +/* + * Copyright 2008, Jouni Malinen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef AES_CMAC_H +#define AES_CMAC_H + +#include + +struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]); +void ieee80211_aes_cmac(struct crypto_cipher *tfm, u8 *scratch, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic); +void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm); + +#endif /* AES_CMAC_H */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b5f86cb17630..20af92abd61d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -43,7 +43,7 @@ struct ieee80211_local; /* Required encryption head and tailroom */ #define IEEE80211_ENCRYPT_HEADROOM 8 -#define IEEE80211_ENCRYPT_TAILROOM 12 +#define IEEE80211_ENCRYPT_TAILROOM 18 /* IEEE 802.11 (Ch. 9.5 Defragmentation) requires support for concurrent * reception of at least three fragmented frames. This limit can be increased diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 425816e0996c..73ac28ca2ede 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -96,6 +96,16 @@ struct ieee80211_key { u8 tx_crypto_buf[6 * AES_BLOCK_LEN]; u8 rx_crypto_buf[6 * AES_BLOCK_LEN]; } ccmp; + struct { + u8 tx_pn[6]; + u8 rx_pn[6]; + struct crypto_cipher *tfm; + u32 replays; /* dot11RSNAStatsCMACReplays */ + u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ + /* scratch buffers for virt_to_page() (crypto API) */ + u8 tx_crypto_buf[2 * AES_BLOCK_LEN]; + u8 rx_crypto_buf[2 * AES_BLOCK_LEN]; + } aes_cmac; } u; /* number of times this key has been used */ diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index aff46adde3f0..53e11e6ff66e 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -1,5 +1,6 @@ /* * Copyright 2002-2004, Instant802 Networks, Inc. + * Copyright 2008, Jouni Malinen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,6 +20,7 @@ #include "michael.h" #include "tkip.h" #include "aes_ccm.h" +#include "aes_cmac.h" #include "wpa.h" ieee80211_tx_result @@ -491,3 +493,126 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx) return RX_CONTINUE; } + + +static void bip_aad(struct sk_buff *skb, u8 *aad) +{ + /* BIP AAD: FC(masked) || A1 || A2 || A3 */ + + /* FC type/subtype */ + aad[0] = skb->data[0]; + /* Mask FC Retry, PwrMgt, MoreData flags to zero */ + aad[1] = skb->data[1] & ~(BIT(4) | BIT(5) | BIT(6)); + /* A1 || A2 || A3 */ + memcpy(aad + 2, skb->data + 4, 3 * ETH_ALEN); +} + + +static inline void bip_ipn_swap(u8 *d, const u8 *s) +{ + *d++ = s[5]; + *d++ = s[4]; + *d++ = s[3]; + *d++ = s[2]; + *d++ = s[1]; + *d = s[0]; +} + + +ieee80211_tx_result +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_key *key = tx->key; + struct ieee80211_mmie *mmie; + u8 *pn, aad[20]; + int i; + + if (tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { + /* hwaccel */ + info->control.hw_key = &tx->key->conf; + return 0; + } + + if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) + return TX_DROP; + + mmie = (struct ieee80211_mmie *) skb_put(skb, sizeof(*mmie)); + mmie->element_id = WLAN_EID_MMIE; + mmie->length = sizeof(*mmie) - 2; + mmie->key_id = cpu_to_le16(key->conf.keyidx); + + /* PN = PN + 1 */ + pn = key->u.aes_cmac.tx_pn; + + for (i = sizeof(key->u.aes_cmac.tx_pn) - 1; i >= 0; i--) { + pn[i]++; + if (pn[i]) + break; + } + bip_ipn_swap(mmie->sequence_number, pn); + + bip_aad(skb, aad); + + /* + * MIC = AES-128-CMAC(IGTK, AAD || 
Management Frame Body || MMIE, 64) + */ + ieee80211_aes_cmac(key->u.aes_cmac.tfm, key->u.aes_cmac.tx_crypto_buf, + aad, skb->data + 24, skb->len - 24, mmie->mic); + + return TX_CONTINUE; +} + + +ieee80211_rx_result +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_key *key = rx->key; + struct ieee80211_mmie *mmie; + u8 aad[20], mic[8], ipn[6]; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + + if (!ieee80211_is_mgmt(hdr->frame_control)) + return RX_CONTINUE; + + if ((rx->status->flag & RX_FLAG_DECRYPTED) && + (rx->status->flag & RX_FLAG_IV_STRIPPED)) + return RX_CONTINUE; + + if (skb->len < 24 + sizeof(*mmie)) + return RX_DROP_UNUSABLE; + + mmie = (struct ieee80211_mmie *) + (skb->data + skb->len - sizeof(*mmie)); + if (mmie->element_id != WLAN_EID_MMIE || + mmie->length != sizeof(*mmie) - 2) + return RX_DROP_UNUSABLE; /* Invalid MMIE */ + + bip_ipn_swap(ipn, mmie->sequence_number); + + if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) { + key->u.aes_cmac.replays++; + return RX_DROP_UNUSABLE; + } + + if (!(rx->status->flag & RX_FLAG_DECRYPTED)) { + /* hardware didn't decrypt/verify MIC */ + bip_aad(skb, aad); + ieee80211_aes_cmac(key->u.aes_cmac.tfm, + key->u.aes_cmac.rx_crypto_buf, aad, + skb->data + 24, skb->len - 24, mic); + if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) { + key->u.aes_cmac.icverrors++; + return RX_DROP_UNUSABLE; + } + } + + memcpy(key->u.aes_cmac.rx_pn, ipn, 6); + + /* Remove MMIE */ + skb_trim(skb, skb->len - sizeof(*mmie)); + + return RX_CONTINUE; +} diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h index d42d221d8a1d..baba0608313e 100644 --- a/net/mac80211/wpa.h +++ b/net/mac80211/wpa.h @@ -28,4 +28,9 @@ ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx); ieee80211_rx_result ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx); +ieee80211_tx_result +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx); +ieee80211_rx_result +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx); + #endif /* WPA_H */ -- cgit v1.2.3-71-gd317 From 3cfcf6ac6d69dc290e96416731eea5c88ac7d426 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 8 Jan 2009 13:32:02 +0200 Subject: mac80211: 802.11w - Use BIP (AES-128-CMAC) Add mechanism for managing BIP keys (IGTK) and integrate BIP into the TX/RX paths. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. 
Linville --- drivers/net/wireless/ath5k/pcu.c | 3 ++ include/linux/ieee80211.h | 1 + include/linux/nl80211.h | 6 ++- include/net/cfg80211.h | 5 +++ include/net/mac80211.h | 2 + net/mac80211/cfg.c | 31 ++++++++++++++ net/mac80211/debugfs_key.c | 79 ++++++++++++++++++++++++++++++++++- net/mac80211/debugfs_key.h | 10 +++++ net/mac80211/ieee80211_i.h | 5 ++- net/mac80211/key.c | 62 ++++++++++++++++++++++++++- net/mac80211/key.h | 6 +++ net/mac80211/rx.c | 90 ++++++++++++++++++++++++++++++++++++---- net/mac80211/tx.c | 9 ++++ net/wireless/nl80211.c | 29 +++++++++---- 14 files changed, 317 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath5k/pcu.c b/drivers/net/wireless/ath5k/pcu.c index 5b416ed65299..e758b70ab7eb 100644 --- a/drivers/net/wireless/ath5k/pcu.c +++ b/drivers/net/wireless/ath5k/pcu.c @@ -1026,6 +1026,9 @@ int ath5k_keycache_type(const struct ieee80211_key_conf *key) return AR5K_KEYTABLE_TYPE_40; else if (key->keylen == LEN_WEP104) return AR5K_KEYTABLE_TYPE_104; + return -EINVAL; + default: + return -EINVAL; } return -EINVAL; } diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index cceb9e86c744..df98a8a549a2 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1139,6 +1139,7 @@ enum ieee80211_back_parties { /* reserved: 0x000FAC03 */ #define WLAN_CIPHER_SUITE_CCMP 0x000FAC04 #define WLAN_CIPHER_SUITE_WEP104 0x000FAC05 +#define WLAN_CIPHER_SUITE_AES_CMAC 0x000FAC06 #define WLAN_MAX_KEY_LEN 32 diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 218f0e73a7ae..ee742bc9761e 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -72,8 +72,8 @@ * * @NL80211_CMD_GET_KEY: Get sequence counter information for a key specified * by %NL80211_ATTR_KEY_IDX and/or %NL80211_ATTR_MAC. - * @NL80211_CMD_SET_KEY: Set key attributes %NL80211_ATTR_KEY_DEFAULT or - * %NL80211_ATTR_KEY_THRESHOLD. + * @NL80211_CMD_SET_KEY: Set key attributes %NL80211_ATTR_KEY_DEFAULT, + * %NL80211_ATTR_KEY_DEFAULT_MGMT, or %NL80211_ATTR_KEY_THRESHOLD. * @NL80211_CMD_NEW_KEY: add a key with given %NL80211_ATTR_KEY_DATA, * %NL80211_ATTR_KEY_IDX, %NL80211_ATTR_MAC and %NL80211_ATTR_KEY_CIPHER * attributes. @@ -346,6 +346,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_FREQ, NL80211_ATTR_WIPHY_CHANNEL_TYPE, + NL80211_ATTR_KEY_DEFAULT_MGMT, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6619ed106134..df78abc496f1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -473,6 +473,8 @@ struct ieee80211_channel; * * @set_default_key: set the default key on an interface * + * @set_default_mgmt_key: set the default management frame key on an interface + * * @add_beacon: Add a beacon with given parameters, @head, @interval * and @dtim_period will be valid, @tail is optional. 
* @set_beacon: Change the beacon parameters for an access point mode @@ -520,6 +522,9 @@ struct cfg80211_ops { int (*set_default_key)(struct wiphy *wiphy, struct net_device *netdev, u8 key_index); + int (*set_default_mgmt_key)(struct wiphy *wiphy, + struct net_device *netdev, + u8 key_index); int (*add_beacon)(struct wiphy *wiphy, struct net_device *dev, struct beacon_parameters *info); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 8a305bfdb87b..61f1f37a9e27 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -651,11 +651,13 @@ struct ieee80211_if_conf { * @ALG_WEP: WEP40 or WEP104 * @ALG_TKIP: TKIP * @ALG_CCMP: CCMP (AES) + * @ALG_AES_CMAC: AES-128-CMAC */ enum ieee80211_key_alg { ALG_WEP, ALG_TKIP, ALG_CCMP, + ALG_AES_CMAC, }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 309d9189aa49..72c106915433 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -133,6 +133,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case WLAN_CIPHER_SUITE_CCMP: alg = ALG_CCMP; break; + case WLAN_CIPHER_SUITE_AES_CMAC: + alg = ALG_AES_CMAC; + break; default: return -EINVAL; } @@ -275,6 +278,17 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, else params.cipher = WLAN_CIPHER_SUITE_WEP104; break; + case ALG_AES_CMAC: + params.cipher = WLAN_CIPHER_SUITE_AES_CMAC; + seq[0] = key->u.aes_cmac.tx_pn[5]; + seq[1] = key->u.aes_cmac.tx_pn[4]; + seq[2] = key->u.aes_cmac.tx_pn[3]; + seq[3] = key->u.aes_cmac.tx_pn[2]; + seq[4] = key->u.aes_cmac.tx_pn[1]; + seq[5] = key->u.aes_cmac.tx_pn[0]; + params.seq = seq; + params.seq_len = 6; + break; } params.key = key->conf.key; @@ -304,6 +318,22 @@ static int ieee80211_config_default_key(struct wiphy *wiphy, return 0; } +static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, + struct net_device *dev, + u8 key_idx) +{ + struct ieee80211_sub_if_data *sdata; + + rcu_read_lock(); + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + ieee80211_set_default_mgmt_key(sdata, key_idx); + + rcu_read_unlock(); + + return 0; +} + static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = sta->sdata; @@ -1153,6 +1183,7 @@ struct cfg80211_ops mac80211_config_ops = { .del_key = ieee80211_del_key, .get_key = ieee80211_get_key, .set_default_key = ieee80211_config_default_key, + .set_default_mgmt_key = ieee80211_config_default_mgmt_key, .add_beacon = ieee80211_add_beacon, .set_beacon = ieee80211_set_beacon, .del_beacon = ieee80211_del_beacon, diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 6424ac565ae0..99c752588b30 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -76,6 +76,9 @@ static ssize_t key_algorithm_read(struct file *file, case ALG_CCMP: alg = "CCMP\n"; break; + case ALG_AES_CMAC: + alg = "AES-128-CMAC\n"; + break; default: return 0; } @@ -105,6 +108,12 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], tpn[5]); break; + case ALG_AES_CMAC: + tpn = key->u.aes_cmac.tx_pn; + len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", + tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], + tpn[5]); + break; default: return 0; } @@ -142,6 +151,14 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, } len = p - buf; break; + case ALG_AES_CMAC: + rpn = key->u.aes_cmac.rx_pn; + p += scnprintf(p, sizeof(buf)+buf-p, + 
"%02x%02x%02x%02x%02x%02x\n", + rpn[0], rpn[1], rpn[2], + rpn[3], rpn[4], rpn[5]); + len = p - buf; + break; default: return 0; } @@ -156,13 +173,40 @@ static ssize_t key_replays_read(struct file *file, char __user *userbuf, char buf[20]; int len; - if (key->conf.alg != ALG_CCMP) + switch (key->conf.alg) { + case ALG_CCMP: + len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); + break; + case ALG_AES_CMAC: + len = scnprintf(buf, sizeof(buf), "%u\n", + key->u.aes_cmac.replays); + break; + default: return 0; - len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); + } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(replays); +static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct ieee80211_key *key = file->private_data; + char buf[20]; + int len; + + switch (key->conf.alg) { + case ALG_AES_CMAC: + len = scnprintf(buf, sizeof(buf), "%u\n", + key->u.aes_cmac.icverrors); + break; + default: + return 0; + } + return simple_read_from_buffer(userbuf, count, ppos, buf, len); +} +KEY_OPS(icverrors); + static ssize_t key_key_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -222,6 +266,7 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key) DEBUGFS_ADD(tx_spec); DEBUGFS_ADD(rx_spec); DEBUGFS_ADD(replays); + DEBUGFS_ADD(icverrors); DEBUGFS_ADD(key); DEBUGFS_ADD(ifindex); }; @@ -243,6 +288,7 @@ void ieee80211_debugfs_key_remove(struct ieee80211_key *key) DEBUGFS_DEL(tx_spec); DEBUGFS_DEL(rx_spec); DEBUGFS_DEL(replays); + DEBUGFS_DEL(icverrors); DEBUGFS_DEL(key); DEBUGFS_DEL(ifindex); @@ -280,6 +326,35 @@ void ieee80211_debugfs_key_remove_default(struct ieee80211_sub_if_data *sdata) sdata->common_debugfs.default_key = NULL; } +void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata) +{ + char buf[50]; + struct ieee80211_key *key; + + if (!sdata->debugfsdir) + return; + + /* this is running under the key lock */ + + key = sdata->default_mgmt_key; + if (key) { + sprintf(buf, "../keys/%d", key->debugfs.cnt); + sdata->common_debugfs.default_mgmt_key = + debugfs_create_symlink("default_mgmt_key", + sdata->debugfsdir, buf); + } else + ieee80211_debugfs_key_remove_mgmt_default(sdata); +} + +void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata) +{ + if (!sdata) + return; + + debugfs_remove(sdata->common_debugfs.default_mgmt_key); + sdata->common_debugfs.default_mgmt_key = NULL; +} + void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, struct sta_info *sta) { diff --git a/net/mac80211/debugfs_key.h b/net/mac80211/debugfs_key.h index b1a3754ee240..54717b4e1371 100644 --- a/net/mac80211/debugfs_key.h +++ b/net/mac80211/debugfs_key.h @@ -6,6 +6,10 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key); void ieee80211_debugfs_key_remove(struct ieee80211_key *key); void ieee80211_debugfs_key_add_default(struct ieee80211_sub_if_data *sdata); void ieee80211_debugfs_key_remove_default(struct ieee80211_sub_if_data *sdata); +void ieee80211_debugfs_key_add_mgmt_default( + struct ieee80211_sub_if_data *sdata); +void ieee80211_debugfs_key_remove_mgmt_default( + struct ieee80211_sub_if_data *sdata); void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, struct sta_info *sta); #else @@ -19,6 +23,12 @@ static inline void ieee80211_debugfs_key_add_default( static inline void ieee80211_debugfs_key_remove_default( struct ieee80211_sub_if_data *sdata) {} +static inline void ieee80211_debugfs_key_add_mgmt_default( + 
struct ieee80211_sub_if_data *sdata) +{} +static inline void ieee80211_debugfs_key_remove_mgmt_default( + struct ieee80211_sub_if_data *sdata) +{} static inline void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, struct sta_info *sta) {} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 20af92abd61d..8c3245717c55 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -409,8 +409,10 @@ struct ieee80211_sub_if_data { unsigned int fragment_next; #define NUM_DEFAULT_KEYS 4 - struct ieee80211_key *keys[NUM_DEFAULT_KEYS]; +#define NUM_DEFAULT_MGMT_KEYS 2 + struct ieee80211_key *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; struct ieee80211_key *default_key; + struct ieee80211_key *default_mgmt_key; u16 sequence_number; @@ -482,6 +484,7 @@ struct ieee80211_sub_if_data { } debugfs; struct { struct dentry *default_key; + struct dentry *default_mgmt_key; } common_debugfs; #ifdef CONFIG_MAC80211_MESH diff --git a/net/mac80211/key.c b/net/mac80211/key.c index b0a025c9b615..19b480de4bbc 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -18,6 +18,7 @@ #include "ieee80211_i.h" #include "debugfs_key.h" #include "aes_ccm.h" +#include "aes_cmac.h" /** @@ -215,13 +216,38 @@ void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx) spin_unlock_irqrestore(&sdata->local->key_lock, flags); } +static void +__ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx) +{ + struct ieee80211_key *key = NULL; + + if (idx >= NUM_DEFAULT_KEYS && + idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) + key = sdata->keys[idx]; + + rcu_assign_pointer(sdata->default_mgmt_key, key); + + if (key) + add_todo(key, KEY_FLAG_TODO_DEFMGMTKEY); +} + +void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, + int idx) +{ + unsigned long flags; + + spin_lock_irqsave(&sdata->local->key_lock, flags); + __ieee80211_set_default_mgmt_key(sdata, idx); + spin_unlock_irqrestore(&sdata->local->key_lock, flags); +} + static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_key *old, struct ieee80211_key *new) { - int idx, defkey; + int idx, defkey, defmgmtkey; if (new) list_add(&new->list, &sdata->key_list); @@ -237,13 +263,19 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, idx = new->conf.keyidx; defkey = old && sdata->default_key == old; + defmgmtkey = old && sdata->default_mgmt_key == old; if (defkey && !new) __ieee80211_set_default_key(sdata, -1); + if (defmgmtkey && !new) + __ieee80211_set_default_mgmt_key(sdata, -1); rcu_assign_pointer(sdata->keys[idx], new); if (defkey && new) __ieee80211_set_default_key(sdata, new->conf.keyidx); + if (defmgmtkey && new) + __ieee80211_set_default_mgmt_key(sdata, + new->conf.keyidx); } if (old) { @@ -262,7 +294,7 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, { struct ieee80211_key *key; - BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS); + BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS); key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL); if (!key) @@ -291,6 +323,10 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, key->conf.iv_len = CCMP_HDR_LEN; key->conf.icv_len = CCMP_MIC_LEN; break; + case ALG_AES_CMAC: + key->conf.iv_len = 0; + key->conf.icv_len = sizeof(struct ieee80211_mmie); + break; } memcpy(key->conf.key, key_data, key_len); INIT_LIST_HEAD(&key->list); @@ -308,6 +344,19 @@ struct ieee80211_key *ieee80211_key_alloc(enum 
ieee80211_key_alg alg, } } + if (alg == ALG_AES_CMAC) { + /* + * Initialize AES key state here as an optimization so that + * it does not need to be initialized for every packet. + */ + key->u.aes_cmac.tfm = + ieee80211_aes_cmac_key_setup(key_data); + if (!key->u.aes_cmac.tfm) { + kfree(key); + return NULL; + } + } + return key; } @@ -461,6 +510,8 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key) if (key->conf.alg == ALG_CCMP) ieee80211_aes_key_free(key->u.ccmp.tfm); + if (key->conf.alg == ALG_AES_CMAC) + ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); ieee80211_debugfs_key_remove(key); kfree(key); @@ -483,6 +534,7 @@ static void __ieee80211_key_todo(void) list_del_init(&key->todo); todoflags = key->flags & (KEY_FLAG_TODO_ADD_DEBUGFS | KEY_FLAG_TODO_DEFKEY | + KEY_FLAG_TODO_DEFMGMTKEY | KEY_FLAG_TODO_HWACCEL_ADD | KEY_FLAG_TODO_HWACCEL_REMOVE | KEY_FLAG_TODO_DELETE); @@ -500,6 +552,11 @@ static void __ieee80211_key_todo(void) ieee80211_debugfs_key_add_default(key->sdata); work_done = true; } + if (todoflags & KEY_FLAG_TODO_DEFMGMTKEY) { + ieee80211_debugfs_key_remove_mgmt_default(key->sdata); + ieee80211_debugfs_key_add_mgmt_default(key->sdata); + work_done = true; + } if (todoflags & KEY_FLAG_TODO_HWACCEL_ADD) { ieee80211_key_enable_hw_accel(key); work_done = true; @@ -535,6 +592,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata) ieee80211_key_lock(); ieee80211_debugfs_key_remove_default(sdata); + ieee80211_debugfs_key_remove_mgmt_default(sdata); spin_lock_irqsave(&sdata->local->key_lock, flags); list_for_each_entry_safe(key, tmp, &sdata->key_list, list) diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 73ac28ca2ede..215d3ef42a4f 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -46,6 +46,8 @@ struct sta_info; * acceleration. * @KEY_FLAG_TODO_DEFKEY: Key is default key and debugfs needs to be updated. * @KEY_FLAG_TODO_ADD_DEBUGFS: Key needs to be added to debugfs. + * @KEY_FLAG_TODO_DEFMGMTKEY: Key is default management key and debugfs needs + * to be updated. 
*/ enum ieee80211_internal_key_flags { KEY_FLAG_UPLOADED_TO_HARDWARE = BIT(0), @@ -54,6 +56,7 @@ enum ieee80211_internal_key_flags { KEY_FLAG_TODO_HWACCEL_REMOVE = BIT(3), KEY_FLAG_TODO_DEFKEY = BIT(4), KEY_FLAG_TODO_ADD_DEBUGFS = BIT(5), + KEY_FLAG_TODO_DEFMGMTKEY = BIT(6), }; struct tkip_ctx { @@ -124,6 +127,7 @@ struct ieee80211_key { struct dentry *tx_spec; struct dentry *rx_spec; struct dentry *replays; + struct dentry *icverrors; struct dentry *key; struct dentry *ifindex; int cnt; @@ -150,6 +154,8 @@ void ieee80211_key_link(struct ieee80211_key *key, struct sta_info *sta); void ieee80211_key_free(struct ieee80211_key *key); void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx); +void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, + int idx); void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata); void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata); void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index b68e082e99ce..abc3aa583ca6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -446,6 +446,52 @@ ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx) return RX_CONTINUE; } + +static int ieee80211_is_unicast_robust_mgmt_frame(struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + + if (skb->len < 24 || is_multicast_ether_addr(hdr->addr1)) + return 0; + + return ieee80211_is_robust_mgmt_frame(hdr); +} + + +static int ieee80211_is_multicast_robust_mgmt_frame(struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + + if (skb->len < 24 || !is_multicast_ether_addr(hdr->addr1)) + return 0; + + return ieee80211_is_robust_mgmt_frame(hdr); +} + + +/* Get the BIP key index from MMIE; return -1 if this is not a BIP frame */ +static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) +{ + struct ieee80211_mgmt *hdr = (struct ieee80211_mgmt *) skb->data; + struct ieee80211_mmie *mmie; + + if (skb->len < 24 + sizeof(*mmie) || + !is_multicast_ether_addr(hdr->da)) + return -1; + + if (!ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *) hdr)) + return -1; /* not a robust management frame */ + + mmie = (struct ieee80211_mmie *) + (skb->data + skb->len - sizeof(*mmie)); + if (mmie->element_id != WLAN_EID_MMIE || + mmie->length != sizeof(*mmie) - 2) + return -1; + + return le16_to_cpu(mmie->key_id); +} + + static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) { @@ -561,21 +607,23 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) int hdrlen; ieee80211_rx_result result = RX_DROP_UNUSABLE; struct ieee80211_key *stakey = NULL; + int mmie_keyidx = -1; /* * Key selection 101 * - * There are three types of keys: + * There are four types of keys: * - GTK (group keys) + * - IGTK (group keys for management frames) * - PTK (pairwise keys) * - STK (station-to-station pairwise keys) * * When selecting a key, we have to distinguish between multicast * (including broadcast) and unicast frames, the latter can only - * use PTKs and STKs while the former always use GTKs. Unless, of - * course, actual WEP keys ("pre-RSNA") are used, then unicast - * frames can also use key indizes like GTKs. Hence, if we don't - * have a PTK/STK we check the key index for a WEP key. + * use PTKs and STKs while the former always use GTKs and IGTKs. + * Unless, of course, actual WEP keys ("pre-RSNA") are used, then + * unicast frames can also use key indices like GTKs. 
Hence, if we + * don't have a PTK/STK we check the key index for a WEP key. * * Note that in a regular BSS, multicast frames are sent by the * AP only, associated stations unicast the frame to the AP first @@ -588,8 +636,14 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) * possible. */ - if (!ieee80211_has_protected(hdr->frame_control)) - return RX_CONTINUE; + if (!ieee80211_has_protected(hdr->frame_control)) { + if (!ieee80211_is_mgmt(hdr->frame_control) || + rx->sta == NULL || !test_sta_flags(rx->sta, WLAN_STA_MFP)) + return RX_CONTINUE; + mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb); + if (mmie_keyidx < 0) + return RX_CONTINUE; + } /* * No point in finding a key and decrypting if the frame is neither @@ -603,6 +657,16 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (!is_multicast_ether_addr(hdr->addr1) && stakey) { rx->key = stakey; + } else if (mmie_keyidx >= 0) { + /* Broadcast/multicast robust management frame / BIP */ + if ((rx->status->flag & RX_FLAG_DECRYPTED) && + (rx->status->flag & RX_FLAG_IV_STRIPPED)) + return RX_CONTINUE; + + if (mmie_keyidx < NUM_DEFAULT_KEYS || + mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) + return RX_DROP_MONITOR; /* unexpected BIP keyidx */ + rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]); } else { /* * The device doesn't give us the IV so we won't be @@ -665,6 +729,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) case ALG_CCMP: result = ieee80211_crypto_ccmp_decrypt(rx); break; + case ALG_AES_CMAC: + result = ieee80211_crypto_aes_cmac_decrypt(rx); + break; } /* either the frame has been decrypted or will be dropped */ @@ -1112,6 +1179,15 @@ ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) /* Drop unencrypted frames if key is set. */ if (unlikely(!ieee80211_has_protected(fc) && !ieee80211_is_nullfunc(fc) && + (!ieee80211_is_mgmt(fc) || + (ieee80211_is_unicast_robust_mgmt_frame(rx->skb) && + rx->sta && test_sta_flags(rx->sta, WLAN_STA_MFP))) && + (rx->key || rx->sdata->drop_unencrypted))) + return -EACCES; + /* BIP does not use Protected field, so need to check MMIE */ + if (unlikely(rx->sta && test_sta_flags(rx->sta, WLAN_STA_MFP) && + ieee80211_is_multicast_robust_mgmt_frame(rx->skb) && + ieee80211_get_mmie_keyidx(rx->skb) < 0 && (rx->key || rx->sdata->drop_unencrypted))) return -EACCES; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 50c6c4fabea5..ad53ea9e9c77 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -425,6 +425,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) tx->key = NULL; else if (tx->sta && (key = rcu_dereference(tx->sta->key))) tx->key = key; + else if (ieee80211_is_mgmt(hdr->frame_control) && + (key = rcu_dereference(tx->sdata->default_mgmt_key))) + tx->key = key; else if ((key = rcu_dereference(tx->sdata->default_key))) tx->key = key; else if (tx->sdata->drop_unencrypted && @@ -453,6 +456,10 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) tx->skb)) tx->key = NULL; break; + case ALG_AES_CMAC: + if (!ieee80211_is_mgmt(hdr->frame_control)) + tx->key = NULL; + break; } } @@ -808,6 +815,8 @@ ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) return ieee80211_crypto_tkip_encrypt(tx); case ALG_CCMP: return ieee80211_crypto_ccmp_encrypt(tx); + case ALG_AES_CMAC: + return ieee80211_crypto_aes_cmac_encrypt(tx); } /* not reached */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1e728fff474e..123d3b160fad 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -738,7 +738,7 @@ static int 
nl80211_get_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_KEY_IDX]) key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 3) + if (key_idx > 5) return -EINVAL; if (info->attrs[NL80211_ATTR_MAC]) @@ -804,30 +804,41 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) int err; struct net_device *dev; u8 key_idx; + int (*func)(struct wiphy *wiphy, struct net_device *netdev, + u8 key_index); if (!info->attrs[NL80211_ATTR_KEY_IDX]) return -EINVAL; key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 3) + if (info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT]) { + if (key_idx < 4 || key_idx > 5) + return -EINVAL; + } else if (key_idx > 3) return -EINVAL; /* currently only support setting default key */ - if (!info->attrs[NL80211_ATTR_KEY_DEFAULT]) + if (!info->attrs[NL80211_ATTR_KEY_DEFAULT] && + !info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT]) return -EINVAL; err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); if (err) return err; - if (!drv->ops->set_default_key) { + if (info->attrs[NL80211_ATTR_KEY_DEFAULT]) + func = drv->ops->set_default_key; + else + func = drv->ops->set_default_mgmt_key; + + if (!func) { err = -EOPNOTSUPP; goto out; } rtnl_lock(); - err = drv->ops->set_default_key(&drv->wiphy, dev, key_idx); + err = func(&drv->wiphy, dev, key_idx); rtnl_unlock(); out: @@ -863,7 +874,7 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); - if (key_idx > 3) + if (key_idx > 5) return -EINVAL; /* @@ -894,6 +905,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) if (params.key_len != 13) return -EINVAL; break; + case WLAN_CIPHER_SUITE_AES_CMAC: + if (params.key_len != 16) + return -EINVAL; + break; default: return -EINVAL; } @@ -928,7 +943,7 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_KEY_IDX]) key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 3) + if (key_idx > 5) return -EINVAL; if (info->attrs[NL80211_ATTR_MAC]) -- cgit v1.2.3-71-gd317 From 54604d3a827b37525ef017adba313c7112e0f484 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 8 Jan 2009 13:32:03 +0200 Subject: mac80211: 802.11w - WEXT parameter for setting mgmt cipher Add a new IW_AUTH parameter for setting cipher suite for multicast/broadcast management frames. This is for full-mac drivers that take care of RSN IE generation for (re)association request frames. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/wireless.h | 5 ++++- net/mac80211/wext.c | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wireless.h b/include/linux/wireless.h index d7958f9b52cb..d426dce47e7c 100644 --- a/include/linux/wireless.h +++ b/include/linux/wireless.h @@ -577,18 +577,21 @@ #define IW_AUTH_RX_UNENCRYPTED_EAPOL 8 #define IW_AUTH_ROAMING_CONTROL 9 #define IW_AUTH_PRIVACY_INVOKED 10 +#define IW_AUTH_CIPHER_GROUP_MGMT 11 /* IW_AUTH_WPA_VERSION values (bit field) */ #define IW_AUTH_WPA_VERSION_DISABLED 0x00000001 #define IW_AUTH_WPA_VERSION_WPA 0x00000002 #define IW_AUTH_WPA_VERSION_WPA2 0x00000004 -/* IW_AUTH_PAIRWISE_CIPHER and IW_AUTH_GROUP_CIPHER values (bit field) */ +/* IW_AUTH_PAIRWISE_CIPHER, IW_AUTH_GROUP_CIPHER, and IW_AUTH_CIPHER_GROUP_MGMT + * values (bit field) */ #define IW_AUTH_CIPHER_NONE 0x00000001 #define IW_AUTH_CIPHER_WEP40 0x00000002 #define IW_AUTH_CIPHER_TKIP 0x00000004 #define IW_AUTH_CIPHER_CCMP 0x00000008 #define IW_AUTH_CIPHER_WEP104 0x00000010 +#define IW_AUTH_CIPHER_AES_CMAC 0x00000020 /* IW_AUTH_KEY_MGMT values (bit field) */ #define IW_AUTH_KEY_MGMT_802_1X 1 diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index 1e5b29bdb3a7..c3b2dd5706fb 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -927,6 +927,7 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev, case IW_AUTH_WPA_ENABLED: case IW_AUTH_RX_UNENCRYPTED_EAPOL: case IW_AUTH_KEY_MGMT: + case IW_AUTH_CIPHER_GROUP_MGMT: break; case IW_AUTH_CIPHER_PAIRWISE: if (sdata->vif.type == NL80211_IFTYPE_STATION) { -- cgit v1.2.3-71-gd317 From 22787dbaa3b952602542506e0426ea6d5f104042 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 8 Jan 2009 13:32:04 +0200 Subject: mac80211: 802.11w - WEXT configuration for IGTK Added new SIOCSIWENCODEEXT algorithm for configuring BIP (AES-CMAC) keys (IGTK). Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/wireless.h | 1 + net/mac80211/wext.c | 62 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 49 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wireless.h b/include/linux/wireless.h index d426dce47e7c..5d1f3fbffd77 100644 --- a/include/linux/wireless.h +++ b/include/linux/wireless.h @@ -615,6 +615,7 @@ #define IW_ENCODE_ALG_TKIP 2 #define IW_ENCODE_ALG_CCMP 3 #define IW_ENCODE_ALG_PMK 4 +#define IW_ENCODE_ALG_AES_CMAC 5 /* struct iw_encode_ext ->ext_flags */ #define IW_ENCODE_EXT_TX_SEQ_VALID 0x00000001 #define IW_ENCODE_EXT_RX_SEQ_VALID 0x00000002 diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index c3b2dd5706fb..7ba1d5ba3afa 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -37,7 +37,14 @@ static int ieee80211_set_encryption(struct ieee80211_sub_if_data *sdata, u8 *sta struct ieee80211_key *key; int err; - if (idx < 0 || idx >= NUM_DEFAULT_KEYS) { + if (alg == ALG_AES_CMAC) { + if (idx < NUM_DEFAULT_KEYS || + idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) { + printk(KERN_DEBUG "%s: set_encrypt - invalid idx=%d " + "(BIP)\n", sdata->dev->name, idx); + return -EINVAL; + } + } else if (idx < 0 || idx >= NUM_DEFAULT_KEYS) { printk(KERN_DEBUG "%s: set_encrypt - invalid idx=%d\n", sdata->dev->name, idx); return -EINVAL; @@ -103,6 +110,9 @@ static int ieee80211_set_encryption(struct ieee80211_sub_if_data *sdata, u8 *sta if (set_tx_key || (!sta && !sdata->default_key && key)) ieee80211_set_default_key(sdata, idx); + if (alg == ALG_AES_CMAC && + (set_tx_key || (!sta && !sdata->default_mgmt_key && key))) + ieee80211_set_default_mgmt_key(sdata, idx); } out_unlock: @@ -1048,6 +1058,9 @@ static int ieee80211_ioctl_siwencodeext(struct net_device *dev, case IW_ENCODE_ALG_CCMP: alg = ALG_CCMP; break; + case IW_ENCODE_ALG_AES_CMAC: + alg = ALG_AES_CMAC; + break; default: return -EOPNOTSUPP; } @@ -1056,20 +1069,41 @@ static int ieee80211_ioctl_siwencodeext(struct net_device *dev, remove = 1; idx = erq->flags & IW_ENCODE_INDEX; - if (idx < 1 || idx > 4) { - idx = -1; - if (!sdata->default_key) - idx = 0; - else for (i = 0; i < NUM_DEFAULT_KEYS; i++) { - if (sdata->default_key == sdata->keys[i]) { - idx = i; - break; + if (alg == ALG_AES_CMAC) { + if (idx < NUM_DEFAULT_KEYS + 1 || + idx > NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) { + idx = -1; + if (!sdata->default_mgmt_key) + idx = 0; + else for (i = NUM_DEFAULT_KEYS; + i < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS; + i++) { + if (sdata->default_mgmt_key == sdata->keys[i]) + { + idx = i; + break; + } } - } - if (idx < 0) - return -EINVAL; - } else - idx--; + if (idx < 0) + return -EINVAL; + } else + idx--; + } else { + if (idx < 1 || idx > 4) { + idx = -1; + if (!sdata->default_key) + idx = 0; + else for (i = 0; i < NUM_DEFAULT_KEYS; i++) { + if (sdata->default_key == sdata->keys[i]) { + idx = i; + break; + } + } + if (idx < 0) + return -EINVAL; + } else + idx--; + } return ieee80211_set_encryption(sdata, ext->addr.sa_data, idx, alg, remove, -- cgit v1.2.3-71-gd317 From fdfacf0ae2e8339098b1164d2317b792d7662c0a Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 8 Jan 2009 13:32:05 +0200 Subject: mac80211: 802.11w - Configuration of MFP disabled/optional/required Add new WEXT IW_AUTH_* parameter for setting MFP disabled/optional/required. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/wireless.h | 6 ++++++ net/mac80211/ieee80211_i.h | 6 ++++++ net/mac80211/mlme.c | 4 ++++ net/mac80211/wext.c | 7 +++++++ 4 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wireless.h b/include/linux/wireless.h index 5d1f3fbffd77..cb24204851f7 100644 --- a/include/linux/wireless.h +++ b/include/linux/wireless.h @@ -578,6 +578,7 @@ #define IW_AUTH_ROAMING_CONTROL 9 #define IW_AUTH_PRIVACY_INVOKED 10 #define IW_AUTH_CIPHER_GROUP_MGMT 11 +#define IW_AUTH_MFP 12 /* IW_AUTH_WPA_VERSION values (bit field) */ #define IW_AUTH_WPA_VERSION_DISABLED 0x00000001 @@ -607,6 +608,11 @@ #define IW_AUTH_ROAMING_DISABLE 1 /* user space program used for roaming * control */ +/* IW_AUTH_MFP (management frame protection) values */ +#define IW_AUTH_MFP_DISABLED 0 /* MFP disabled */ +#define IW_AUTH_MFP_OPTIONAL 1 /* MFP optional */ +#define IW_AUTH_MFP_REQUIRED 2 /* MFP required */ + /* SIOCSIWENCODEEXT definitions */ #define IW_ENCODE_SEQ_MAX_SIZE 8 /* struct iw_encode_ext ->alg */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8c3245717c55..212c732fbba7 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -320,6 +320,12 @@ struct ieee80211_if_sta { int auth_alg; /* currently used IEEE 802.11 authentication algorithm */ int auth_transaction; + enum { + IEEE80211_MFP_DISABLED, + IEEE80211_MFP_OPTIONAL, + IEEE80211_MFP_REQUIRED + } mfp; /* management frame protection */ + unsigned long ibss_join_req; struct sk_buff *probe_resp; /* ProbeResp template for IBSS */ u32 supp_rates_bits[IEEE80211_NUM_BANDS]; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index bc8a7f1a6a15..42c5f981c715 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2317,6 +2317,10 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata, selected->ssid_len); ieee80211_sta_set_bssid(sdata, selected->bssid); ieee80211_sta_def_wmm_params(sdata, selected); + if (sdata->u.sta.mfp == IEEE80211_MFP_REQUIRED) + sdata->u.sta.flags |= IEEE80211_STA_MFP_ENABLED; + else + sdata->u.sta.flags &= ~IEEE80211_STA_MFP_ENABLED; /* Send out direct probe if no probe resp was received or * the one we have is outdated diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index 7ba1d5ba3afa..2dd387495dfe 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -975,6 +975,13 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev, else ret = -EOPNOTSUPP; break; + case IW_AUTH_MFP: + if (sdata->vif.type == NL80211_IFTYPE_STATION || + sdata->vif.type == NL80211_IFTYPE_ADHOC) + sdata->u.sta.mfp = data->value; + else + ret = -EOPNOTSUPP; + break; default: ret = -EOPNOTSUPP; break; -- cgit v1.2.3-71-gd317 From fea147328908b7e2bfcaf9dc4377909d5507ca35 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 8 Jan 2009 13:32:06 +0200 Subject: mac80211: 802.11w - SA Query processing Process SA Query Requests for client mode in mac80211. AP side processing of SA Query Response frames is in user space (hostapd). Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/ieee80211.h | 14 ++++++++++ net/mac80211/rx.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index df98a8a549a2..9fe1948d28d3 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -527,6 +527,8 @@ struct ieee80211_tim_ie { u8 virtual_map[0]; } __attribute__ ((packed)); +#define WLAN_SA_QUERY_TR_ID_LEN 16 + struct ieee80211_mgmt { __le16 frame_control; __le16 duration; @@ -646,6 +648,10 @@ struct ieee80211_mgmt { u8 action_code; u8 variable[0]; } __attribute__((packed)) mesh_action; + struct { + u8 action; + u8 trans_id[WLAN_SA_QUERY_TR_ID_LEN]; + } __attribute__ ((packed)) sa_query; } u; } __attribute__ ((packed)) action; } u; @@ -1041,6 +1047,7 @@ enum ieee80211_category { WLAN_CATEGORY_DLS = 2, WLAN_CATEGORY_BACK = 3, WLAN_CATEGORY_PUBLIC = 4, + WLAN_CATEGORY_SA_QUERY = 8, WLAN_CATEGORY_WMM = 17, }; @@ -1129,6 +1136,13 @@ enum ieee80211_back_parties { WLAN_BACK_TIMER = 2, }; +/* SA Query action */ +enum ieee80211_sa_query_action { + WLAN_ACTION_SA_QUERY_REQUEST = 0, + WLAN_ACTION_SA_QUERY_RESPONSE = 1, +}; + + /* A-MSDU 802.11n */ #define IEEE80211_QOS_CONTROL_A_MSDU_PRESENT 0x0080 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index abc3aa583ca6..63db89aef3e4 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1667,6 +1667,57 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *resp; + + if (compare_ether_addr(mgmt->da, sdata->dev->dev_addr) != 0) { + /* Not to own unicast address */ + return; + } + + if (compare_ether_addr(mgmt->sa, sdata->u.sta.bssid) != 0 || + compare_ether_addr(mgmt->bssid, sdata->u.sta.bssid) != 0) { + /* Not from the current AP. 
*/ + return; + } + + if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATE) { + /* Association in progress; ignore SA Query */ + return; + } + + if (len < 24 + 1 + sizeof(resp->u.action.u.sa_query)) { + /* Too short SA Query request frame */ + return; + } + + skb = dev_alloc_skb(sizeof(*resp) + local->hw.extra_tx_headroom); + if (skb == NULL) + return; + + skb_reserve(skb, local->hw.extra_tx_headroom); + resp = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(resp, 0, 24); + memcpy(resp->da, mgmt->sa, ETH_ALEN); + memcpy(resp->sa, sdata->dev->dev_addr, ETH_ALEN); + memcpy(resp->bssid, sdata->u.sta.bssid, ETH_ALEN); + resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query)); + resp->u.action.category = WLAN_CATEGORY_SA_QUERY; + resp->u.action.u.sa_query.action = WLAN_ACTION_SA_QUERY_RESPONSE; + memcpy(resp->u.action.u.sa_query.trans_id, + mgmt->u.action.u.sa_query.trans_id, + WLAN_SA_QUERY_TR_ID_LEN); + + ieee80211_tx_skb(sdata, skb, 1); +} + static ieee80211_rx_result debug_noinline ieee80211_rx_h_action(struct ieee80211_rx_data *rx) { @@ -1743,6 +1794,24 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; } break; + case WLAN_CATEGORY_SA_QUERY: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.sa_query))) + return RX_DROP_MONITOR; + switch (mgmt->u.action.u.sa_query.action) { + case WLAN_ACTION_SA_QUERY_REQUEST: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return RX_DROP_MONITOR; + ieee80211_process_sa_query_req(sdata, mgmt, len); + break; + case WLAN_ACTION_SA_QUERY_RESPONSE: + /* + * SA Query response is currently only used in AP mode + * and it is processed in user space. + */ + return RX_CONTINUE; + } + break; default: return RX_CONTINUE; } -- cgit v1.2.3-71-gd317 From 63a5ab82255a4ff5d0783f16427210f1d45d7ec8 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 8 Jan 2009 13:32:09 +0200 Subject: mac80211: 802.11w - Implement Association Comeback processing When MFP is enabled, the AP does not allow a STA to associate if an existing security association exists without first going through the SA Query process. When this happens, the association request is denied with a new status code ("temporarily rejected") and an Association Comeback IE is used to notify when the association may be tried again (i.e., when the SA Query procedure has timed out). Use the comeback time to update the mac80211 client MLME timer for the next association attempt to minimize waiting time if association is temporarily rejected. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W.
Linville --- include/linux/ieee80211.h | 4 ++++ net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/mlme.c | 20 +++++++++++++++++--- net/mac80211/util.c | 4 ++++ 4 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 9fe1948d28d3..7800e20f197f 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -914,6 +914,9 @@ enum ieee80211_statuscode { /* 802.11g */ WLAN_STATUS_ASSOC_DENIED_NOSHORTTIME = 25, WLAN_STATUS_ASSOC_DENIED_NODSSSOFDM = 26, + /* 802.11w */ + WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY = 30, + WLAN_STATUS_ROBUST_MGMT_FRAME_POLICY_VIOLATION = 31, /* 802.11i */ WLAN_STATUS_INVALID_IE = 40, WLAN_STATUS_INVALID_GROUP_CIPHER = 41, @@ -1034,6 +1037,7 @@ enum ieee80211_eid { /* 802.11i */ WLAN_EID_RSN = 48, WLAN_EID_MMIE = 76 /* 802.11w */, + WLAN_EID_ASSOC_COMEBACK_TIME = 77, WLAN_EID_WPA = 221, WLAN_EID_GENERIC = 221, WLAN_EID_VENDOR_SPECIFIC = 221, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 212c732fbba7..9112c5247c35 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -820,6 +820,7 @@ struct ieee802_11_elems { u8 *country_elem; u8 *pwr_constr_elem; u8 *quiet_elem; /* first quite element */ + u8 *assoc_comeback; /* length of them, respectively */ u8 ssid_len; @@ -847,6 +848,7 @@ struct ieee802_11_elems { u8 pwr_constr_elem_len; u8 quiet_elem_len; u8 num_of_quiet_elem; /* can be more the one */ + u8 assoc_comeback_len; }; static inline struct ieee80211_local *hw_to_local( diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 42c5f981c715..82c598a83687 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1275,6 +1275,23 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, sdata->dev->name, reassoc ? 
"Rea" : "A", mgmt->sa, capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14)))); + pos = mgmt->u.assoc_resp.variable; + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); + + if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && + elems.assoc_comeback && elems.assoc_comeback_len == 4) { + u32 tu, ms; + tu = get_unaligned_le32(elems.assoc_comeback); + ms = tu * 1024 / 1000; + printk(KERN_DEBUG "%s: AP rejected association temporarily; " + "comeback duration %u TU (%u ms)\n", + sdata->dev->name, tu, ms); + if (ms > IEEE80211_ASSOC_TIMEOUT) + mod_timer(&ifsta->timer, + jiffies + msecs_to_jiffies(ms)); + return; + } + if (status_code != WLAN_STATUS_SUCCESS) { printk(KERN_DEBUG "%s: AP denied association (code=%d)\n", sdata->dev->name, status_code); @@ -1290,9 +1307,6 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, "set\n", sdata->dev->name, aid); aid &= ~(BIT(15) | BIT(14)); - pos = mgmt->u.assoc_resp.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); - if (!elems.supp_rates) { printk(KERN_DEBUG "%s: no SuppRates element in AssocResp\n", sdata->dev->name); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5cd430333f08..963e0473205c 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -653,6 +653,10 @@ void ieee802_11_parse_elems(u8 *start, size_t len, elems->pwr_constr_elem = pos; elems->pwr_constr_elem_len = elen; break; + case WLAN_EID_ASSOC_COMEBACK_TIME: + elems->assoc_comeback = pos; + elems->assoc_comeback_len = elen; + break; default: break; } -- cgit v1.2.3-71-gd317 From d2b21f191753abd12c4063776cb1a3d635397509 Mon Sep 17 00:00:00 2001 From: Colin McCabe Date: Fri, 9 Jan 2009 14:58:09 -0800 Subject: libertas: if_spi, driver for libertas GSPI devices Add initial support for libertas devices using a GSPI interface. This has been tested with the 8686. GSPI is intended to be used on embedded systems. Board-specific parameters are required (see libertas_spi.h). Thanks to everyone who took a look at the earlier versions of the patch. Signed-off-by: Colin McCabe Signed-off-by: Andrey Yurovsky Acked-by: Dan Williams Signed-off-by: John W. Linville --- drivers/net/wireless/Kconfig | 6 + drivers/net/wireless/libertas/Makefile | 2 + drivers/net/wireless/libertas/defs.h | 2 + drivers/net/wireless/libertas/if_spi.c | 1203 ++++++++++++++++++++++++++++++++ drivers/net/wireless/libertas/if_spi.h | 208 ++++++ include/linux/spi/libertas_spi.h | 25 + 6 files changed, 1446 insertions(+) create mode 100644 drivers/net/wireless/libertas/if_spi.c create mode 100644 drivers/net/wireless/libertas/if_spi.h create mode 100644 include/linux/spi/libertas_spi.h (limited to 'include/linux') diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig index e4f9f747de88..2dddbd012a99 100644 --- a/drivers/net/wireless/Kconfig +++ b/drivers/net/wireless/Kconfig @@ -151,6 +151,12 @@ config LIBERTAS_SDIO ---help--- A driver for Marvell Libertas 8385 and 8686 SDIO devices. +config LIBERTAS_SPI + tristate "Marvell Libertas 8686 SPI 802.11b/g cards" + depends on LIBERTAS && SPI && GENERIC_GPIO + ---help--- + A driver for Marvell Libertas 8686 SPI devices. + config LIBERTAS_DEBUG bool "Enable full debugging output in the Libertas module." 
depends on LIBERTAS diff --git a/drivers/net/wireless/libertas/Makefile b/drivers/net/wireless/libertas/Makefile index 02080a3682a9..0b6918584503 100644 --- a/drivers/net/wireless/libertas/Makefile +++ b/drivers/net/wireless/libertas/Makefile @@ -4,8 +4,10 @@ libertas-objs := main.o wext.o rx.o tx.o cmd.o cmdresp.o scan.o 11d.o \ usb8xxx-objs += if_usb.o libertas_cs-objs += if_cs.o libertas_sdio-objs += if_sdio.o +libertas_spi-objs += if_spi.o obj-$(CONFIG_LIBERTAS) += libertas.o obj-$(CONFIG_LIBERTAS_USB) += usb8xxx.o obj-$(CONFIG_LIBERTAS_CS) += libertas_cs.o obj-$(CONFIG_LIBERTAS_SDIO) += libertas_sdio.o +obj-$(CONFIG_LIBERTAS_SPI) += libertas_spi.o diff --git a/drivers/net/wireless/libertas/defs.h b/drivers/net/wireless/libertas/defs.h index c364e4c01d1b..6388b05df4fc 100644 --- a/drivers/net/wireless/libertas/defs.h +++ b/drivers/net/wireless/libertas/defs.h @@ -41,6 +41,7 @@ #define LBS_DEB_HEX 0x00200000 #define LBS_DEB_SDIO 0x00400000 #define LBS_DEB_SYSFS 0x00800000 +#define LBS_DEB_SPI 0x01000000 extern unsigned int lbs_debug; @@ -84,6 +85,7 @@ do { if ((lbs_debug & (grp)) == (grp)) \ #define lbs_deb_thread(fmt, args...) LBS_DEB_LL(LBS_DEB_THREAD, " thread", fmt, ##args) #define lbs_deb_sdio(fmt, args...) LBS_DEB_LL(LBS_DEB_SDIO, " sdio", fmt, ##args) #define lbs_deb_sysfs(fmt, args...) LBS_DEB_LL(LBS_DEB_SYSFS, " sysfs", fmt, ##args) +#define lbs_deb_spi(fmt, args...) LBS_DEB_LL(LBS_DEB_SPI, " spi", fmt, ##args) #define lbs_pr_info(format, args...) \ printk(KERN_INFO DRV_NAME": " format, ## args) diff --git a/drivers/net/wireless/libertas/if_spi.c b/drivers/net/wireless/libertas/if_spi.c new file mode 100644 index 000000000000..7c02ea314fd1 --- /dev/null +++ b/drivers/net/wireless/libertas/if_spi.c @@ -0,0 +1,1203 @@ +/* + * linux/drivers/net/wireless/libertas/if_spi.c + * + * Driver for Marvell SPI WLAN cards. + * + * Copyright 2008 Analog Devices Inc. + * + * Authors: + * Andrey Yurovsky + * Colin McCabe + * + * Inspired by if_sdio.c, Copyright 2007-2008 Pierre Ossman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "host.h" +#include "decl.h" +#include "defs.h" +#include "dev.h" +#include "if_spi.h" + +struct if_spi_packet { + struct list_head list; + u16 blen; + u8 buffer[0] __attribute__((aligned(4))); +}; + +struct if_spi_card { + struct spi_device *spi; + struct lbs_private *priv; + + char helper_fw_name[FIRMWARE_NAME_MAX]; + char main_fw_name[FIRMWARE_NAME_MAX]; + + /* The card ID and card revision, as reported by the hardware. */ + u16 card_id; + u8 card_rev; + + /* Pin number for our GPIO chip-select. */ + /* TODO: Once the generic SPI layer has some additional features, we + * should take this out and use the normal chip select here. + * We need support for chip select delays, and not dropping chipselect + * after each word. 
*/ + int gpio_cs; + + /* The last time that we initiated an SPU operation */ + unsigned long prev_xfer_time; + + int use_dummy_writes; + unsigned long spu_port_delay; + unsigned long spu_reg_delay; + + /* Handles all SPI communication (except for FW load) */ + struct task_struct *spi_thread; + int run_thread; + + /* Used to wake up the spi_thread */ + struct semaphore spi_ready; + struct semaphore spi_thread_terminated; + + u8 cmd_buffer[IF_SPI_CMD_BUF_SIZE]; + + /* A buffer of incoming packets from libertas core. + * Since we can't sleep in hw_host_to_card, we have to buffer + * them. */ + struct list_head cmd_packet_list; + struct list_head data_packet_list; + + /* Protects cmd_packet_list and data_packet_list */ + spinlock_t buffer_lock; +}; + +static void free_if_spi_card(struct if_spi_card *card) +{ + struct list_head *cursor, *next; + struct if_spi_packet *packet; + + BUG_ON(card->run_thread); + list_for_each_safe(cursor, next, &card->cmd_packet_list) { + packet = container_of(cursor, struct if_spi_packet, list); + list_del(&packet->list); + kfree(packet); + } + list_for_each_safe(cursor, next, &card->data_packet_list) { + packet = container_of(cursor, struct if_spi_packet, list); + list_del(&packet->list); + kfree(packet); + } + spi_set_drvdata(card->spi, NULL); + kfree(card); +} + +static struct chip_ident chip_id_to_device_name[] = { + { .chip_id = 0x04, .name = 8385 }, + { .chip_id = 0x0b, .name = 8686 }, +}; + +/* + * SPI Interface Unit Routines + * + * The SPU sits between the host and the WLAN module. + * All communication with the firmware is through SPU transactions. + * + * First we have to put an SPU register name on the bus. Then we can + * either read from or write to that register. + * + * For 16-bit transactions, byte order on the bus is big-endian. + * We don't have to worry about that here, though. + * The translation takes place in the SPI routines. + */ + +static void spu_transaction_init(struct if_spi_card *card) +{ + if (!time_after(jiffies, card->prev_xfer_time + 1)) { + /* Unfortunately, the SPU requires a delay between successive + * transactions. If our last transaction was more than a jiffy + * ago, we have obviously already delayed enough. + * If not, we have to busy-wait to be on the safe side. */ + ndelay(400); + } + gpio_set_value(card->gpio_cs, 0); /* assert CS */ +} + +static void spu_transaction_finish(struct if_spi_card *card) +{ + gpio_set_value(card->gpio_cs, 1); /* drop CS */ + card->prev_xfer_time = jiffies; +} + +/* Write out a byte buffer to an SPI register, + * using a series of 16-bit transfers. */ +static int spu_write(struct if_spi_card *card, u16 reg, const u8 *buf, int len) +{ + int err = 0; + u16 reg_out = reg | IF_SPI_WRITE_OPERATION_MASK; + + /* You must give an even number of bytes to the SPU, even if it + * doesn't care about the last one. */ + BUG_ON(len & 0x1); + + spu_transaction_init(card); + + /* write SPU register index */ + err = spi_write(card->spi, (u8 *)&reg_out, sizeof(u16)); + if (err) + goto out; + + err = spi_write(card->spi, buf, len); + +out: + spu_transaction_finish(card); + return err; +} + +static inline int spu_write_u16(struct if_spi_card *card, u16 reg, u16 val) +{ + return spu_write(card, reg, (u8 *)&val, sizeof(u16)); +} + +static inline int spu_write_u32(struct if_spi_card *card, u16 reg, u32 val) +{ + /* The lower 16 bits are written first.
*/ + u16 out[2]; + out[0] = val & 0xffff; + out[1] = (val & 0xffff0000) >> 16; + return spu_write(card, reg, (u8 *)&out, sizeof(u32)); +} + +static inline int spu_reg_is_port_reg(u16 reg) +{ + switch (reg) { + case IF_SPI_IO_RDWRPORT_REG: + case IF_SPI_CMD_RDWRPORT_REG: + case IF_SPI_DATA_RDWRPORT_REG: + return 1; + default: + return 0; + } +} + +static int spu_read(struct if_spi_card *card, u16 reg, u8 *buf, int len) +{ + unsigned int i, delay; + int err = 0; + u16 zero = 0; + u16 reg_out = reg | IF_SPI_READ_OPERATION_MASK; + + /* You must take an even number of bytes from the SPU, even if you + * don't care about the last one. */ + BUG_ON(len & 0x1); + + spu_transaction_init(card); + + /* write SPU register index */ + err = spi_write(card->spi, (u8 *)&reg_out, sizeof(u16)); + if (err) + goto out; + + delay = spu_reg_is_port_reg(reg) ? card->spu_port_delay : + card->spu_reg_delay; + if (card->use_dummy_writes) { + /* Clock in dummy cycles while the SPU fills the FIFO */ + for (i = 0; i < delay / 16; ++i) { + err = spi_write(card->spi, (u8 *)&zero, sizeof(u16)); + if (err) + return err; + } + } else { + /* Busy-wait while the SPU fills the FIFO */ + ndelay(100 + (delay * 10)); + } + + /* read in data */ + err = spi_read(card->spi, buf, len); + +out: + spu_transaction_finish(card); + return err; +} + +/* Read 16 bits from an SPI register */ +static inline int spu_read_u16(struct if_spi_card *card, u16 reg, u16 *val) +{ + return spu_read(card, reg, (u8 *)val, sizeof(u16)); +} + +/* Read 32 bits from an SPI register. + * The low 16 bits are read first. */ +static int spu_read_u32(struct if_spi_card *card, u16 reg, u32 *val) +{ + u16 buf[2]; + int err; + err = spu_read(card, reg, (u8 *)buf, sizeof(u32)); + if (!err) + *val = buf[0] | (buf[1] << 16); + return err; +} + +/* Keep reading 16 bits from an SPI register until you get the correct result. + * + * If target_mask = 0, the correct result is any non-zero number. + * If target_mask != 0, the correct result is any number where + * number & target_mask == target + * + * Returns -ETIMEDOUT if five seconds pass without the correct result. */ +static int spu_wait_for_u16(struct if_spi_card *card, u16 reg, + u16 target_mask, u16 target) +{ + int err; + unsigned long timeout = jiffies + 5*HZ; + while (1) { + u16 val; + err = spu_read_u16(card, reg, &val); + if (err) + return err; + if (target_mask) { + if ((val & target_mask) == target) + return 0; + } else { + if (val) + return 0; + } + udelay(100); + if (time_after(jiffies, timeout)) { + lbs_pr_err("%s: timeout with val=%02x, " + "target_mask=%02x, target=%02x\n", + __func__, val, target_mask, target); + return -ETIMEDOUT; + } + } +} + +/* Read 16 bits from an SPI register until you receive a specific value. + * Returns -ETIMEDOUT if 4 tries pass without success.
*/ +static int spu_wait_for_u32(struct if_spi_card *card, u32 reg, u32 target) +{ + int err, try; + for (try = 0; try < 4; ++try) { + u32 val = 0; + err = spu_read_u32(card, reg, &val); + if (err) + return err; + if (val == target) + return 0; + mdelay(100); + } + return -ETIMEDOUT; +} + +static int spu_set_interrupt_mode(struct if_spi_card *card, + int suppress_host_int, + int auto_int) +{ + int err = 0; + + /* We can suppress a host interrupt by clearing the appropriate + * bit in the "host interrupt status mask" register */ + if (suppress_host_int) { + err = spu_write_u16(card, IF_SPI_HOST_INT_STATUS_MASK_REG, 0); + if (err) + return err; + } else { + err = spu_write_u16(card, IF_SPI_HOST_INT_STATUS_MASK_REG, + IF_SPI_HISM_TX_DOWNLOAD_RDY | + IF_SPI_HISM_RX_UPLOAD_RDY | + IF_SPI_HISM_CMD_DOWNLOAD_RDY | + IF_SPI_HISM_CARDEVENT | + IF_SPI_HISM_CMD_UPLOAD_RDY); + if (err) + return err; + } + + /* If auto-interrupts are on, the completion of certain transactions + * will trigger an interrupt automatically. If auto-interrupts + * are off, we need to set the "Card Interrupt Cause" register to + * trigger a card interrupt. */ + if (auto_int) { + err = spu_write_u16(card, IF_SPI_HOST_INT_CTRL_REG, + IF_SPI_HICT_TX_DOWNLOAD_OVER_AUTO | + IF_SPI_HICT_RX_UPLOAD_OVER_AUTO | + IF_SPI_HICT_CMD_DOWNLOAD_OVER_AUTO | + IF_SPI_HICT_CMD_UPLOAD_OVER_AUTO); + if (err) + return err; + } else { + err = spu_write_u16(card, IF_SPI_HOST_INT_STATUS_MASK_REG, 0); + if (err) + return err; + } + return err; +} + +static int spu_get_chip_revision(struct if_spi_card *card, + u16 *card_id, u8 *card_rev) +{ + int err = 0; + u32 dev_ctrl; + err = spu_read_u32(card, IF_SPI_DEVICEID_CTRL_REG, &dev_ctrl); + if (err) + return err; + *card_id = IF_SPI_DEVICEID_CTRL_REG_TO_CARD_ID(dev_ctrl); + *card_rev = IF_SPI_DEVICEID_CTRL_REG_TO_CARD_REV(dev_ctrl); + return err; +} + +static int spu_set_bus_mode(struct if_spi_card *card, u16 mode) +{ + int err = 0; + u16 rval; + /* set bus mode */ + err = spu_write_u16(card, IF_SPI_SPU_BUS_MODE_REG, mode); + if (err) + return err; + /* Check that we were able to read back what we just wrote. */ + err = spu_read_u16(card, IF_SPI_SPU_BUS_MODE_REG, &rval); + if (err) + return err; + if (rval != mode) { + lbs_pr_err("Can't read bus mode register.\n"); + return -EIO; + } + return 0; +} + +static int spu_init(struct if_spi_card *card, int use_dummy_writes) +{ + int err = 0; + u32 delay; + + /* We have to start up in timed delay mode so that we can safely + * read the Delay Read Register. */ + card->use_dummy_writes = 0; + err = spu_set_bus_mode(card, + IF_SPI_BUS_MODE_SPI_CLOCK_PHASE_RISING | + IF_SPI_BUS_MODE_DELAY_METHOD_TIMED | + IF_SPI_BUS_MODE_16_BIT_ADDRESS_16_BIT_DATA); + if (err) + return err; + card->spu_port_delay = 1000; + card->spu_reg_delay = 1000; + err = spu_read_u32(card, IF_SPI_DELAY_READ_REG, &delay); + if (err) + return err; + card->spu_port_delay = delay & 0x0000ffff; + card->spu_reg_delay = (delay & 0xffff0000) >> 16; + + /* If dummy clock delay mode has been requested, switch to it now */ + if (use_dummy_writes) { + card->use_dummy_writes = 1; + err = spu_set_bus_mode(card, + IF_SPI_BUS_MODE_SPI_CLOCK_PHASE_RISING | + IF_SPI_BUS_MODE_DELAY_METHOD_DUMMY_CLOCK | + IF_SPI_BUS_MODE_16_BIT_ADDRESS_16_BIT_DATA); + if (err) + return err; + } + + lbs_deb_spi("Initialized SPU unit. 
" + "spu_port_delay=0x%04lx, spu_reg_delay=0x%04lx\n", + card->spu_port_delay, card->spu_reg_delay); + return err; +} + +/* + * Firmware Loading + */ + +static int if_spi_prog_helper_firmware(struct if_spi_card *card) +{ + int err = 0; + const struct firmware *firmware = NULL; + int bytes_remaining; + const u8 *fw; + u8 temp[HELPER_FW_LOAD_CHUNK_SZ]; + struct spi_device *spi = card->spi; + + lbs_deb_enter(LBS_DEB_SPI); + + err = spu_set_interrupt_mode(card, 1, 0); + if (err) + goto out; + /* Get helper firmware image */ + err = request_firmware(&firmware, card->helper_fw_name, &spi->dev); + if (err) { + lbs_pr_err("request_firmware failed with err = %d\n", err); + goto out; + } + bytes_remaining = firmware->size; + fw = firmware->data; + + /* Load helper firmware image */ + while (bytes_remaining > 0) { + /* Scratch pad 1 should contain the number of bytes we + * want to download to the firmware */ + err = spu_write_u16(card, IF_SPI_SCRATCH_1_REG, + HELPER_FW_LOAD_CHUNK_SZ); + if (err) + goto release_firmware; + + err = spu_wait_for_u16(card, IF_SPI_HOST_INT_STATUS_REG, + IF_SPI_HIST_CMD_DOWNLOAD_RDY, + IF_SPI_HIST_CMD_DOWNLOAD_RDY); + if (err) + goto release_firmware; + + /* Feed the data into the command read/write port reg + * in chunks of 64 bytes */ + memset(temp, 0, sizeof(temp)); + memcpy(temp, fw, + min(bytes_remaining, HELPER_FW_LOAD_CHUNK_SZ)); + mdelay(10); + err = spu_write(card, IF_SPI_CMD_RDWRPORT_REG, + temp, HELPER_FW_LOAD_CHUNK_SZ); + if (err) + goto release_firmware; + + /* Interrupt the boot code */ + err = spu_write_u16(card, IF_SPI_HOST_INT_STATUS_REG, 0); + if (err) + goto release_firmware; + err = spu_write_u16(card, IF_SPI_CARD_INT_CAUSE_REG, + IF_SPI_CIC_CMD_DOWNLOAD_OVER); + if (err) + goto release_firmware; + bytes_remaining -= HELPER_FW_LOAD_CHUNK_SZ; + fw += HELPER_FW_LOAD_CHUNK_SZ; + } + + /* Once the helper / single stage firmware download is complete, + * write 0 to scratch pad 1 and interrupt the + * bootloader. This completes the helper download. */ + err = spu_write_u16(card, IF_SPI_SCRATCH_1_REG, FIRMWARE_DNLD_OK); + if (err) + goto release_firmware; + err = spu_write_u16(card, IF_SPI_HOST_INT_STATUS_REG, 0); + if (err) + goto release_firmware; + err = spu_write_u16(card, IF_SPI_CARD_INT_CAUSE_REG, + IF_SPI_CIC_CMD_DOWNLOAD_OVER); + goto release_firmware; + + lbs_deb_spi("waiting for helper to boot...\n"); + +release_firmware: + release_firmware(firmware); +out: + if (err) + lbs_pr_err("failed to load helper firmware (err=%d)\n", err); + lbs_deb_leave_args(LBS_DEB_SPI, "err %d", err); + return err; +} + +/* Returns the length of the next packet the firmware expects us to send + * Sets crc_err if the previous transfer had a CRC error. */ +static int if_spi_prog_main_firmware_check_len(struct if_spi_card *card, + int *crc_err) +{ + u16 len; + int err = 0; + + /* wait until the host interrupt status register indicates + * that we are ready to download */ + err = spu_wait_for_u16(card, IF_SPI_HOST_INT_STATUS_REG, + IF_SPI_HIST_CMD_DOWNLOAD_RDY, + IF_SPI_HIST_CMD_DOWNLOAD_RDY); + if (err) { + lbs_pr_err("timed out waiting for host_int_status\n"); + return err; + } + + /* Ask the device how many bytes of firmware it wants. */ + err = spu_read_u16(card, IF_SPI_SCRATCH_1_REG, &len); + if (err) + return err; + + if (len > IF_SPI_CMD_BUF_SIZE) { + lbs_pr_err("firmware load device requested a larger " + "tranfer than we are prepared to " + "handle. 
(len = %d)\n", len); + return -EIO; + } + if (len & 0x1) { + lbs_deb_spi("%s: crc error\n", __func__); + len &= ~0x1; + *crc_err = 1; + } else + *crc_err = 0; + + return len; +} + +static int if_spi_prog_main_firmware(struct if_spi_card *card) +{ + int len, prev_len; + int bytes, crc_err = 0, err = 0; + const struct firmware *firmware = NULL; + const u8 *fw; + struct spi_device *spi = card->spi; + u16 num_crc_errs; + + lbs_deb_enter(LBS_DEB_SPI); + + err = spu_set_interrupt_mode(card, 1, 0); + if (err) + goto out; + + /* Get firmware image */ + err = request_firmware(&firmware, card->main_fw_name, &spi->dev); + if (err) { + lbs_pr_err("%s: can't get firmware '%s' from kernel. " + "err = %d\n", __func__, card->main_fw_name, err); + goto out; + } + + err = spu_wait_for_u16(card, IF_SPI_SCRATCH_1_REG, 0, 0); + if (err) { + lbs_pr_err("%s: timed out waiting for initial " + "scratch reg = 0\n", __func__); + goto release_firmware; + } + + num_crc_errs = 0; + prev_len = 0; + bytes = firmware->size; + fw = firmware->data; + while ((len = if_spi_prog_main_firmware_check_len(card, &crc_err))) { + if (len < 0) { + err = len; + goto release_firmware; + } + if (bytes < 0) { + /* If there are no more bytes left, we would normally + * expect to have terminated with len = 0 */ + lbs_pr_err("Firmware load wants more bytes " + "than we have to offer.\n"); + break; + } + if (crc_err) { + /* Previous transfer failed. */ + if (++num_crc_errs > MAX_MAIN_FW_LOAD_CRC_ERR) { + lbs_pr_err("Too many CRC errors encountered " + "in firmware load.\n"); + err = -EIO; + goto release_firmware; + } + } else { + /* Previous transfer succeeded. Advance counters. */ + bytes -= prev_len; + fw += prev_len; + } + if (bytes < len) { + memset(card->cmd_buffer, 0, len); + memcpy(card->cmd_buffer, fw, bytes); + } else + memcpy(card->cmd_buffer, fw, len); + + err = spu_write_u16(card, IF_SPI_HOST_INT_STATUS_REG, 0); + if (err) + goto release_firmware; + err = spu_write(card, IF_SPI_CMD_RDWRPORT_REG, + card->cmd_buffer, len); + if (err) + goto release_firmware; + err = spu_write_u16(card, IF_SPI_CARD_INT_CAUSE_REG , + IF_SPI_CIC_CMD_DOWNLOAD_OVER); + if (err) + goto release_firmware; + prev_len = len; + } + if (bytes > prev_len) { + lbs_pr_err("firmware load wants fewer bytes than " + "we have to offer.\n"); + } + + /* Confirm firmware download */ + err = spu_wait_for_u32(card, IF_SPI_SCRATCH_4_REG, + SUCCESSFUL_FW_DOWNLOAD_MAGIC); + if (err) { + lbs_pr_err("failed to confirm the firmware download\n"); + goto release_firmware; + } + +release_firmware: + release_firmware(firmware); + +out: + if (err) + lbs_pr_err("failed to load firmware (err=%d)\n", err); + lbs_deb_leave_args(LBS_DEB_SPI, "err %d", err); + return err; +} + +/* + * SPI Transfer Thread + * + * The SPI thread handles all SPI transfers, so there is no need for a lock. + */ + +/* Move a command from the card to the host */ +static int if_spi_c2h_cmd(struct if_spi_card *card) +{ + struct lbs_private *priv = card->priv; + unsigned long flags; + int err = 0; + u16 len; + u8 i; + + /* We need a buffer big enough to handle whatever people send to + * hw_host_to_card */ + BUILD_BUG_ON(IF_SPI_CMD_BUF_SIZE < LBS_CMD_BUFFER_SIZE); + BUILD_BUG_ON(IF_SPI_CMD_BUF_SIZE < LBS_UPLD_SIZE); + + /* It's just annoying if the buffer size isn't a multiple of 4, because + * then we might have len < IF_SPI_CMD_BUF_SIZE but + * ALIGN(len, 4) > IF_SPI_CMD_BUF_SIZE */ + BUILD_BUG_ON(IF_SPI_CMD_BUF_SIZE % 4 != 0); + + lbs_deb_enter(LBS_DEB_SPI); + + /* How many bytes are there to read? 
*/ + err = spu_read_u16(card, IF_SPI_SCRATCH_2_REG, &len); + if (err) + goto out; + if (!len) { + lbs_pr_err("%s: error: card has no data for host\n", + __func__); + err = -EINVAL; + goto out; + } else if (len > IF_SPI_CMD_BUF_SIZE) { + lbs_pr_err("%s: error: response packet too large: " + "%d bytes, but maximum is %d\n", + __func__, len, IF_SPI_CMD_BUF_SIZE); + err = -EINVAL; + goto out; + } + + /* Read the data from the WLAN module into our command buffer */ + err = spu_read(card, IF_SPI_CMD_RDWRPORT_REG, + card->cmd_buffer, ALIGN(len, 4)); + if (err) + goto out; + + spin_lock_irqsave(&priv->driver_lock, flags); + i = (priv->resp_idx == 0) ? 1 : 0; + BUG_ON(priv->resp_len[i]); + priv->resp_len[i] = len; + memcpy(priv->resp_buf[i], card->cmd_buffer, len); + lbs_notify_command_response(priv, i); + spin_unlock_irqrestore(&priv->driver_lock, flags); + +out: + if (err) + lbs_pr_err("%s: err=%d\n", __func__, err); + lbs_deb_leave(LBS_DEB_SPI); + return err; +} + +/* Move data from the card to the host */ +static int if_spi_c2h_data(struct if_spi_card *card) +{ + struct sk_buff *skb; + char *data; + u16 len; + int err = 0; + + lbs_deb_enter(LBS_DEB_SPI); + + /* How many bytes are there to read? */ + err = spu_read_u16(card, IF_SPI_SCRATCH_1_REG, &len); + if (err) + goto out; + if (!len) { + lbs_pr_err("%s: error: card has no data for host\n", + __func__); + err = -EINVAL; + goto out; + } else if (len > MRVDRV_ETH_RX_PACKET_BUFFER_SIZE) { + lbs_pr_err("%s: error: card has %d bytes of data, but " + "our maximum skb size is %u\n", + __func__, len, MRVDRV_ETH_RX_PACKET_BUFFER_SIZE); + err = -EINVAL; + goto out; + } + + /* TODO: should we allocate a smaller skb if we have less data? */ + skb = dev_alloc_skb(MRVDRV_ETH_RX_PACKET_BUFFER_SIZE); + if (!skb) { + err = -ENOBUFS; + goto out; + } + skb_reserve(skb, IPFIELD_ALIGN_OFFSET); + data = skb_put(skb, len); + + /* Read the data from the WLAN module into our skb... */ + err = spu_read(card, IF_SPI_DATA_RDWRPORT_REG, data, ALIGN(len, 4)); + if (err) + goto free_skb; + + /* pass the SKB to libertas */ + err = lbs_process_rxed_packet(card->priv, skb); + if (err) + goto free_skb; + + /* success */ + goto out; + +free_skb: + dev_kfree_skb(skb); +out: + if (err) + lbs_pr_err("%s: err=%d\n", __func__, err); + lbs_deb_leave(LBS_DEB_SPI); + return err; +} + +/* Move data or a command from the host to the card. 
*/ +static void if_spi_h2c(struct if_spi_card *card, + struct if_spi_packet *packet, int type) +{ + int err = 0; + u16 int_type, port_reg; + + switch (type) { + case MVMS_DAT: + int_type = IF_SPI_CIC_TX_DOWNLOAD_OVER; + port_reg = IF_SPI_DATA_RDWRPORT_REG; + break; + case MVMS_CMD: + int_type = IF_SPI_CIC_CMD_DOWNLOAD_OVER; + port_reg = IF_SPI_CMD_RDWRPORT_REG; + break; + default: + lbs_pr_err("can't transfer buffer of type %d\n", type); + err = -EINVAL; + goto out; + } + + /* Write the data to the card */ + err = spu_write(card, port_reg, packet->buffer, packet->blen); + if (err) + goto out; + +out: + kfree(packet); + + if (err) + lbs_pr_err("%s: error %d\n", __func__, err); +} + +/* Inform the host about a card event */ +static void if_spi_e2h(struct if_spi_card *card) +{ + int err = 0; + unsigned long flags; + u32 cause; + struct lbs_private *priv = card->priv; + + err = spu_read_u32(card, IF_SPI_SCRATCH_3_REG, &cause); + if (err) + goto out; + + spin_lock_irqsave(&priv->driver_lock, flags); + lbs_queue_event(priv, cause & 0xff); + spin_unlock_irqrestore(&priv->driver_lock, flags); + +out: + if (err) + lbs_pr_err("%s: error %d\n", __func__, err); +} + +static int lbs_spi_thread(void *data) +{ + int err; + struct if_spi_card *card = data; + u16 hiStatus; + unsigned long flags; + struct if_spi_packet *packet; + + while (1) { + /* Wait to be woken up by one of two things. First, our ISR + * could tell us that something happened on the WLAN. + * Secondly, libertas could call hw_host_to_card with more + * data, which we might be able to send. + */ + do { + err = down_interruptible(&card->spi_ready); + if (!card->run_thread) { + up(&card->spi_thread_terminated); + do_exit(0); + } + } while (err == EINTR); + + /* Read the host interrupt status register to see what we + * can do. */ + err = spu_read_u16(card, IF_SPI_HOST_INT_STATUS_REG, + &hiStatus); + if (err) { + lbs_pr_err("I/O error\n"); + goto err; + } + + if (hiStatus & IF_SPI_HIST_CMD_UPLOAD_RDY) + err = if_spi_c2h_cmd(card); + if (err) + goto err; + if (hiStatus & IF_SPI_HIST_RX_UPLOAD_RDY) + err = if_spi_c2h_data(card); + if (err) + goto err; + if (hiStatus & IF_SPI_HIST_CMD_DOWNLOAD_RDY) { + /* This means two things. First of all, + * if there was a previous command sent, the card has + * successfully received it. + * Secondly, it is now ready to download another + * command. + */ + lbs_host_to_card_done(card->priv); + + /* Do we have any command packets from the host to + * send? */ + packet = NULL; + spin_lock_irqsave(&card->buffer_lock, flags); + if (!list_empty(&card->cmd_packet_list)) { + packet = (struct if_spi_packet *)(card-> + cmd_packet_list.next); + list_del(&packet->list); + } + spin_unlock_irqrestore(&card->buffer_lock, flags); + + if (packet) + if_spi_h2c(card, packet, MVMS_CMD); + } + if (hiStatus & IF_SPI_HIST_TX_DOWNLOAD_RDY) { + /* Do we have any data packets from the host to + * send? 
*/ + packet = NULL; + spin_lock_irqsave(&card->buffer_lock, flags); + if (!list_empty(&card->data_packet_list)) { + packet = (struct if_spi_packet *)(card-> + data_packet_list.next); + list_del(&packet->list); + } + spin_unlock_irqrestore(&card->buffer_lock, flags); + + if (packet) + if_spi_h2c(card, packet, MVMS_DAT); + } + if (hiStatus & IF_SPI_HIST_CARD_EVENT) + if_spi_e2h(card); + +err: + if (err) + lbs_pr_err("%s: got error %d\n", __func__, err); + } +} + +/* Block until lbs_spi_thread thread has terminated */ +static void if_spi_terminate_spi_thread(struct if_spi_card *card) +{ + /* It would be nice to use kthread_stop here, but that function + * can't wake threads waiting for a semaphore. */ + card->run_thread = 0; + up(&card->spi_ready); + down(&card->spi_thread_terminated); +} + +/* + * Host to Card + * + * Called from Libertas to transfer some data to the WLAN device + * We can't sleep here. */ +static int if_spi_host_to_card(struct lbs_private *priv, + u8 type, u8 *buf, u16 nb) +{ + int err = 0; + unsigned long flags; + struct if_spi_card *card = priv->card; + struct if_spi_packet *packet; + u16 blen; + + lbs_deb_enter_args(LBS_DEB_SPI, "type %d, bytes %d", type, nb); + + if (nb == 0) { + lbs_pr_err("%s: invalid size requested: %d\n", __func__, nb); + err = -EINVAL; + goto out; + } + blen = ALIGN(nb, 4); + packet = kzalloc(sizeof(struct if_spi_packet) + blen, GFP_ATOMIC); + if (!packet) { + err = -ENOMEM; + goto out; + } + packet->blen = blen; + memcpy(packet->buffer, buf, nb); + memset(packet->buffer + nb, 0, blen - nb); + + switch (type) { + case MVMS_CMD: + priv->dnld_sent = DNLD_CMD_SENT; + spin_lock_irqsave(&card->buffer_lock, flags); + list_add_tail(&packet->list, &card->cmd_packet_list); + spin_unlock_irqrestore(&card->buffer_lock, flags); + break; + case MVMS_DAT: + priv->dnld_sent = DNLD_DATA_SENT; + spin_lock_irqsave(&card->buffer_lock, flags); + list_add_tail(&packet->list, &card->data_packet_list); + spin_unlock_irqrestore(&card->buffer_lock, flags); + break; + default: + lbs_pr_err("can't transfer buffer of type %d", type); + err = -EINVAL; + break; + } + + /* Wake up the spi thread */ + up(&card->spi_ready); +out: + lbs_deb_leave_args(LBS_DEB_SPI, "err=%d", err); + return err; +} + +/* + * Host Interrupts + * + * Service incoming interrupts from the WLAN device. We can't sleep here, so + * don't try to talk on the SPI bus, just wake up the SPI thread. 
+ */ +static irqreturn_t if_spi_host_interrupt(int irq, void *dev_id) +{ + struct if_spi_card *card = dev_id; + + up(&card->spi_ready); + return IRQ_HANDLED; +} + +/* + * SPI callbacks + */ + +static int if_spi_calculate_fw_names(u16 card_id, + char *helper_fw, char *main_fw) +{ + int i; + for (i = 0; i < ARRAY_SIZE(chip_id_to_device_name); ++i) { + if (card_id == chip_id_to_device_name[i].chip_id) + break; + } + if (i == ARRAY_SIZE(chip_id_to_device_name)) { + lbs_pr_err("Unsupported chip_id: 0x%02x\n", card_id); + return -EAFNOSUPPORT; + } + snprintf(helper_fw, FIRMWARE_NAME_MAX, "libertas/gspi%d_hlp.bin", + chip_id_to_device_name[i].name); + snprintf(main_fw, FIRMWARE_NAME_MAX, "libertas/gspi%d.bin", + chip_id_to_device_name[i].name); + return 0; +} + +static int __devinit if_spi_probe(struct spi_device *spi) +{ + struct if_spi_card *card; + struct lbs_private *priv = NULL; + struct libertas_spi_platform_data *pdata = spi->dev.platform_data; + int err = 0; + u32 scratch; + + lbs_deb_enter(LBS_DEB_SPI); + + /* Allocate card structure to represent this specific device */ + card = kzalloc(sizeof(struct if_spi_card), GFP_KERNEL); + if (!card) { + err = -ENOMEM; + goto out; + } + spi_set_drvdata(spi, card); + card->spi = spi; + card->gpio_cs = pdata->gpio_cs; + card->prev_xfer_time = jiffies; + + sema_init(&card->spi_ready, 0); + sema_init(&card->spi_thread_terminated, 0); + INIT_LIST_HEAD(&card->cmd_packet_list); + INIT_LIST_HEAD(&card->data_packet_list); + spin_lock_init(&card->buffer_lock); + + /* set up GPIO CS line. TODO: use regular CS line */ + err = gpio_request(card->gpio_cs, "if_spi_gpio_chip_select"); + if (err) + goto free_card; + err = gpio_direction_output(card->gpio_cs, 1); + if (err) + goto free_gpio; + + /* Initialize the SPI Interface Unit */ + err = spu_init(card, pdata->use_dummy_writes); + if (err) + goto free_gpio; + err = spu_get_chip_revision(card, &card->card_id, &card->card_rev); + if (err) + goto free_gpio; + + /* Firmware load */ + err = spu_read_u32(card, IF_SPI_SCRATCH_4_REG, &scratch); + if (err) + goto free_gpio; + if (scratch == SUCCESSFUL_FW_DOWNLOAD_MAGIC) + lbs_deb_spi("Firmware is already loaded for " + "Marvell WLAN 802.11 adapter\n"); + else { + err = if_spi_calculate_fw_names(card->card_id, + card->helper_fw_name, card->main_fw_name); + if (err) + goto free_gpio; + + lbs_deb_spi("Initializing FW for Marvell WLAN 802.11 adapter " + "(chip_id = 0x%04x, chip_rev = 0x%02x) " + "attached to SPI bus_num %d, chip_select %d. " + "spi->max_speed_hz=%d\n", + card->card_id, card->card_rev, + spi->master->bus_num, spi->chip_select, + spi->max_speed_hz); + err = if_spi_prog_helper_firmware(card); + if (err) + goto free_gpio; + err = if_spi_prog_main_firmware(card); + if (err) + goto free_gpio; + lbs_deb_spi("loaded FW for Marvell WLAN 802.11 adapter\n"); + } + + err = spu_set_interrupt_mode(card, 0, 1); + if (err) + goto free_gpio; + + /* Register our card with libertas. + * This will call alloc_etherdev */ + priv = lbs_add_card(card, &spi->dev); + if (!priv) { + err = -ENOMEM; + goto free_gpio; + } + card->priv = priv; + priv->card = card; + priv->hw_host_to_card = if_spi_host_to_card; + priv->fw_ready = 1; + priv->ps_supported = 1; + + /* Initialize interrupt handling stuff. 
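A hypothetical board-side sketch, not part of this patch, of where the spi->dev.platform_data consumed by the probe routine would come from: only the "libertas_spi" modalias and the two libertas_spi_platform_data fields are taken from the patch, while the bus number, chip select, GPIO, IRQ and clock rate below are invented placeholders.

#include <linux/spi/spi.h>
#include <linux/spi/libertas_spi.h>

static struct libertas_spi_platform_data board_libertas_pdata = {
	.use_dummy_writes = 1,	/* usually preferred; see libertas_spi.h */
	.gpio_cs          = 5,	/* GPIO the driver toggles as chip select */
};

static struct spi_board_info board_spi_devs[] = {
	{
		.modalias       = "libertas_spi",
		.platform_data  = &board_libertas_pdata,
		.max_speed_hz   = 16000000,	/* placeholder */
		.bus_num        = 1,		/* placeholder */
		.chip_select    = 0,		/* placeholder */
		.irq            = 42,		/* line wired to the module's interrupt */
	},
};

The board's init code would then hand this to the SPI core with spi_register_board_info(board_spi_devs, ARRAY_SIZE(board_spi_devs)) before the SPI master is registered.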
*/ + card->run_thread = 1; + card->spi_thread = kthread_run(lbs_spi_thread, card, "lbs_spi_thread"); + if (IS_ERR(card->spi_thread)) { + card->run_thread = 0; + err = PTR_ERR(card->spi_thread); + lbs_pr_err("error creating SPI thread: err=%d\n", err); + goto remove_card; + } + err = request_irq(spi->irq, if_spi_host_interrupt, + IRQF_TRIGGER_FALLING, "libertas_spi", card); + if (err) { + lbs_pr_err("can't get host irq line-- request_irq failed\n"); + goto terminate_thread; + } + + /* Start the card. + * This will call register_netdev, and we'll start + * getting interrupts... */ + err = lbs_start_card(priv); + if (err) + goto release_irq; + + lbs_deb_spi("Finished initializing WLAN module.\n"); + + /* successful exit */ + goto out; + +release_irq: + free_irq(spi->irq, card); +terminate_thread: + if_spi_terminate_spi_thread(card); +remove_card: + lbs_remove_card(priv); /* will call free_netdev */ +free_gpio: + gpio_free(card->gpio_cs); +free_card: + free_if_spi_card(card); +out: + lbs_deb_leave_args(LBS_DEB_SPI, "err %d\n", err); + return err; +} + +static int __devexit libertas_spi_remove(struct spi_device *spi) +{ + struct if_spi_card *card = spi_get_drvdata(spi); + struct lbs_private *priv = card->priv; + + lbs_deb_spi("libertas_spi_remove\n"); + lbs_deb_enter(LBS_DEB_SPI); + priv->surpriseremoved = 1; + + lbs_stop_card(priv); + free_irq(spi->irq, card); + if_spi_terminate_spi_thread(card); + lbs_remove_card(priv); /* will call free_netdev */ + gpio_free(card->gpio_cs); + free_if_spi_card(card); + lbs_deb_leave(LBS_DEB_SPI); + return 0; +} + +static struct spi_driver libertas_spi_driver = { + .probe = if_spi_probe, + .remove = __devexit_p(libertas_spi_remove), + .driver = { + .name = "libertas_spi", + .bus = &spi_bus_type, + .owner = THIS_MODULE, + }, +}; + +/* + * Module functions + */ + +static int __init if_spi_init_module(void) +{ + int ret = 0; + lbs_deb_enter(LBS_DEB_SPI); + printk(KERN_INFO "libertas_spi: Libertas SPI driver\n"); + ret = spi_register_driver(&libertas_spi_driver); + lbs_deb_leave(LBS_DEB_SPI); + return ret; +} + +static void __exit if_spi_exit_module(void) +{ + lbs_deb_enter(LBS_DEB_SPI); + spi_unregister_driver(&libertas_spi_driver); + lbs_deb_leave(LBS_DEB_SPI); +} + +module_init(if_spi_init_module); +module_exit(if_spi_exit_module); + +MODULE_DESCRIPTION("Libertas SPI WLAN Driver"); +MODULE_AUTHOR("Andrey Yurovsky , " + "Colin McCabe "); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/wireless/libertas/if_spi.h b/drivers/net/wireless/libertas/if_spi.h new file mode 100644 index 000000000000..2103869cc5b0 --- /dev/null +++ b/drivers/net/wireless/libertas/if_spi.h @@ -0,0 +1,208 @@ +/* + * linux/drivers/net/wireless/libertas/if_spi.c + * + * Driver for Marvell SPI WLAN cards. + * + * Copyright 2008 Analog Devices Inc. + * + * Authors: + * Andrey Yurovsky + * Colin McCabe + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef _LBS_IF_SPI_H_ +#define _LBS_IF_SPI_H_ + +#define IPFIELD_ALIGN_OFFSET 2 +#define IF_SPI_CMD_BUF_SIZE 2400 + +/***************** Firmware *****************/ +struct chip_ident { + u16 chip_id; + u16 name; +}; + +#define MAX_MAIN_FW_LOAD_CRC_ERR 10 + +/* Chunk size when loading the helper firmware */ +#define HELPER_FW_LOAD_CHUNK_SZ 64 + +/* Value to write to indicate end of helper firmware dnld */ +#define FIRMWARE_DNLD_OK 0x0000 + +/* Value to check once the main firmware is downloaded */ +#define SUCCESSFUL_FW_DOWNLOAD_MAGIC 0x88888888 + +/***************** SPI Interface Unit *****************/ +/* Masks used in SPI register read/write operations */ +#define IF_SPI_READ_OPERATION_MASK 0x0 +#define IF_SPI_WRITE_OPERATION_MASK 0x8000 + +/* SPI register offsets. 4-byte aligned. */ +#define IF_SPI_DEVICEID_CTRL_REG 0x00 /* DeviceID controller reg */ +#define IF_SPI_IO_READBASE_REG 0x04 /* Read I/O base reg */ +#define IF_SPI_IO_WRITEBASE_REG 0x08 /* Write I/O base reg */ +#define IF_SPI_IO_RDWRPORT_REG 0x0C /* Read/Write I/O port reg */ + +#define IF_SPI_CMD_READBASE_REG 0x10 /* Read command base reg */ +#define IF_SPI_CMD_WRITEBASE_REG 0x14 /* Write command base reg */ +#define IF_SPI_CMD_RDWRPORT_REG 0x18 /* Read/Write command port reg */ + +#define IF_SPI_DATA_READBASE_REG 0x1C /* Read data base reg */ +#define IF_SPI_DATA_WRITEBASE_REG 0x20 /* Write data base reg */ +#define IF_SPI_DATA_RDWRPORT_REG 0x24 /* Read/Write data port reg */ + +#define IF_SPI_SCRATCH_1_REG 0x28 /* Scratch reg 1 */ +#define IF_SPI_SCRATCH_2_REG 0x2C /* Scratch reg 2 */ +#define IF_SPI_SCRATCH_3_REG 0x30 /* Scratch reg 3 */ +#define IF_SPI_SCRATCH_4_REG 0x34 /* Scratch reg 4 */ + +#define IF_SPI_TX_FRAME_SEQ_NUM_REG 0x38 /* Tx frame sequence number reg */ +#define IF_SPI_TX_FRAME_STATUS_REG 0x3C /* Tx frame status reg */ + +#define IF_SPI_HOST_INT_CTRL_REG 0x40 /* Host interrupt controller reg */ + +#define IF_SPI_CARD_INT_CAUSE_REG 0x44 /* Card interrupt cause reg */ +#define IF_SPI_CARD_INT_STATUS_REG 0x48 /* Card interupt status reg */ +#define IF_SPI_CARD_INT_EVENT_MASK_REG 0x4C /* Card interrupt event mask */ +#define IF_SPI_CARD_INT_STATUS_MASK_REG 0x50 /* Card interrupt status mask */ + +#define IF_SPI_CARD_INT_RESET_SELECT_REG 0x54 /* Card interrupt reset select */ + +#define IF_SPI_HOST_INT_CAUSE_REG 0x58 /* Host interrupt cause reg */ +#define IF_SPI_HOST_INT_STATUS_REG 0x5C /* Host interrupt status reg */ +#define IF_SPI_HOST_INT_EVENT_MASK_REG 0x60 /* Host interrupt event mask */ +#define IF_SPI_HOST_INT_STATUS_MASK_REG 0x64 /* Host interrupt status mask */ +#define IF_SPI_HOST_INT_RESET_SELECT_REG 0x68 /* Host interrupt reset select */ + +#define IF_SPI_DELAY_READ_REG 0x6C /* Delay read reg */ +#define IF_SPI_SPU_BUS_MODE_REG 0x70 /* SPU BUS mode reg */ + +/***************** IF_SPI_DEVICEID_CTRL_REG *****************/ +#define IF_SPI_DEVICEID_CTRL_REG_TO_CARD_ID(dc) ((dc & 0xffff0000)>>16) +#define IF_SPI_DEVICEID_CTRL_REG_TO_CARD_REV(dc) (dc & 0x000000ff) + +/***************** IF_SPI_HOST_INT_CTRL_REG *****************/ +/** Host Interrupt Control bit : Wake up */ +#define IF_SPI_HICT_WAKE_UP (1<<0) +/** Host Interrupt Control bit : WLAN ready */ +#define IF_SPI_HICT_WLAN_READY (1<<1) +/*#define IF_SPI_HICT_FIFO_FIRST_HALF_EMPTY (1<<2) */ +/*#define IF_SPI_HICT_FIFO_SECOND_HALF_EMPTY (1<<3) */ +/*#define IF_SPI_HICT_IRQSRC_WLAN (1<<4) */ +/** Host Interrupt Control bit : Tx auto download */ +#define IF_SPI_HICT_TX_DOWNLOAD_OVER_AUTO (1<<5) +/** Host Interrupt Control bit : Rx auto 
upload */ +#define IF_SPI_HICT_RX_UPLOAD_OVER_AUTO (1<<6) +/** Host Interrupt Control bit : Command auto download */ +#define IF_SPI_HICT_CMD_DOWNLOAD_OVER_AUTO (1<<7) +/** Host Interrupt Control bit : Command auto upload */ +#define IF_SPI_HICT_CMD_UPLOAD_OVER_AUTO (1<<8) + +/***************** IF_SPI_CARD_INT_CAUSE_REG *****************/ +/** Card Interrupt Case bit : Tx download over */ +#define IF_SPI_CIC_TX_DOWNLOAD_OVER (1<<0) +/** Card Interrupt Case bit : Rx upload over */ +#define IF_SPI_CIC_RX_UPLOAD_OVER (1<<1) +/** Card Interrupt Case bit : Command download over */ +#define IF_SPI_CIC_CMD_DOWNLOAD_OVER (1<<2) +/** Card Interrupt Case bit : Host event */ +#define IF_SPI_CIC_HOST_EVENT (1<<3) +/** Card Interrupt Case bit : Command upload over */ +#define IF_SPI_CIC_CMD_UPLOAD_OVER (1<<4) +/** Card Interrupt Case bit : Power down */ +#define IF_SPI_CIC_POWER_DOWN (1<<5) + +/***************** IF_SPI_CARD_INT_STATUS_REG *****************/ +#define IF_SPI_CIS_TX_DOWNLOAD_OVER (1<<0) +#define IF_SPI_CIS_RX_UPLOAD_OVER (1<<1) +#define IF_SPI_CIS_CMD_DOWNLOAD_OVER (1<<2) +#define IF_SPI_CIS_HOST_EVENT (1<<3) +#define IF_SPI_CIS_CMD_UPLOAD_OVER (1<<4) +#define IF_SPI_CIS_POWER_DOWN (1<<5) + +/***************** IF_SPI_HOST_INT_CAUSE_REG *****************/ +#define IF_SPI_HICU_TX_DOWNLOAD_RDY (1<<0) +#define IF_SPI_HICU_RX_UPLOAD_RDY (1<<1) +#define IF_SPI_HICU_CMD_DOWNLOAD_RDY (1<<2) +#define IF_SPI_HICU_CARD_EVENT (1<<3) +#define IF_SPI_HICU_CMD_UPLOAD_RDY (1<<4) +#define IF_SPI_HICU_IO_WR_FIFO_OVERFLOW (1<<5) +#define IF_SPI_HICU_IO_RD_FIFO_UNDERFLOW (1<<6) +#define IF_SPI_HICU_DATA_WR_FIFO_OVERFLOW (1<<7) +#define IF_SPI_HICU_DATA_RD_FIFO_UNDERFLOW (1<<8) +#define IF_SPI_HICU_CMD_WR_FIFO_OVERFLOW (1<<9) +#define IF_SPI_HICU_CMD_RD_FIFO_UNDERFLOW (1<<10) + +/***************** IF_SPI_HOST_INT_STATUS_REG *****************/ +/** Host Interrupt Status bit : Tx download ready */ +#define IF_SPI_HIST_TX_DOWNLOAD_RDY (1<<0) +/** Host Interrupt Status bit : Rx upload ready */ +#define IF_SPI_HIST_RX_UPLOAD_RDY (1<<1) +/** Host Interrupt Status bit : Command download ready */ +#define IF_SPI_HIST_CMD_DOWNLOAD_RDY (1<<2) +/** Host Interrupt Status bit : Card event */ +#define IF_SPI_HIST_CARD_EVENT (1<<3) +/** Host Interrupt Status bit : Command upload ready */ +#define IF_SPI_HIST_CMD_UPLOAD_RDY (1<<4) +/** Host Interrupt Status bit : I/O write FIFO overflow */ +#define IF_SPI_HIST_IO_WR_FIFO_OVERFLOW (1<<5) +/** Host Interrupt Status bit : I/O read FIFO underflow */ +#define IF_SPI_HIST_IO_RD_FIFO_UNDRFLOW (1<<6) +/** Host Interrupt Status bit : Data write FIFO overflow */ +#define IF_SPI_HIST_DATA_WR_FIFO_OVERFLOW (1<<7) +/** Host Interrupt Status bit : Data read FIFO underflow */ +#define IF_SPI_HIST_DATA_RD_FIFO_UNDERFLOW (1<<8) +/** Host Interrupt Status bit : Command write FIFO overflow */ +#define IF_SPI_HIST_CMD_WR_FIFO_OVERFLOW (1<<9) +/** Host Interrupt Status bit : Command read FIFO underflow */ +#define IF_SPI_HIST_CMD_RD_FIFO_UNDERFLOW (1<<10) + +/***************** IF_SPI_HOST_INT_STATUS_MASK_REG *****************/ +/** Host Interrupt Status Mask bit : Tx download ready */ +#define IF_SPI_HISM_TX_DOWNLOAD_RDY (1<<0) +/** Host Interrupt Status Mask bit : Rx upload ready */ +#define IF_SPI_HISM_RX_UPLOAD_RDY (1<<1) +/** Host Interrupt Status Mask bit : Command download ready */ +#define IF_SPI_HISM_CMD_DOWNLOAD_RDY (1<<2) +/** Host Interrupt Status Mask bit : Card event */ +#define IF_SPI_HISM_CARDEVENT (1<<3) +/** Host Interrupt Status Mask bit : Command upload ready */ +#define 
IF_SPI_HISM_CMD_UPLOAD_RDY (1<<4) +/** Host Interrupt Status Mask bit : I/O write FIFO overflow */ +#define IF_SPI_HISM_IO_WR_FIFO_OVERFLOW (1<<5) +/** Host Interrupt Status Mask bit : I/O read FIFO underflow */ +#define IF_SPI_HISM_IO_RD_FIFO_UNDERFLOW (1<<6) +/** Host Interrupt Status Mask bit : Data write FIFO overflow */ +#define IF_SPI_HISM_DATA_WR_FIFO_OVERFLOW (1<<7) +/** Host Interrupt Status Mask bit : Data write FIFO underflow */ +#define IF_SPI_HISM_DATA_RD_FIFO_UNDERFLOW (1<<8) +/** Host Interrupt Status Mask bit : Command write FIFO overflow */ +#define IF_SPI_HISM_CMD_WR_FIFO_OVERFLOW (1<<9) +/** Host Interrupt Status Mask bit : Command write FIFO underflow */ +#define IF_SPI_HISM_CMD_RD_FIFO_UNDERFLOW (1<<10) + +/***************** IF_SPI_SPU_BUS_MODE_REG *****************/ +/* SCK edge on which the WLAN module outputs data on MISO */ +#define IF_SPI_BUS_MODE_SPI_CLOCK_PHASE_FALLING 0x8 +#define IF_SPI_BUS_MODE_SPI_CLOCK_PHASE_RISING 0x0 + +/* In a SPU read operation, there is a delay between writing the SPU + * register name and getting back data from the WLAN module. + * This can be specified in terms of nanoseconds or in terms of dummy + * clock cycles which the master must output before receiving a response. */ +#define IF_SPI_BUS_MODE_DELAY_METHOD_DUMMY_CLOCK 0x4 +#define IF_SPI_BUS_MODE_DELAY_METHOD_TIMED 0x0 + +/* Some different modes of SPI operation */ +#define IF_SPI_BUS_MODE_8_BIT_ADDRESS_16_BIT_DATA 0x00 +#define IF_SPI_BUS_MODE_8_BIT_ADDRESS_32_BIT_DATA 0x01 +#define IF_SPI_BUS_MODE_16_BIT_ADDRESS_16_BIT_DATA 0x02 +#define IF_SPI_BUS_MODE_16_BIT_ADDRESS_32_BIT_DATA 0x03 + +#endif diff --git a/include/linux/spi/libertas_spi.h b/include/linux/spi/libertas_spi.h new file mode 100644 index 000000000000..ada71b4f3788 --- /dev/null +++ b/include/linux/spi/libertas_spi.h @@ -0,0 +1,25 @@ +/* + * board-specific data for the libertas_spi driver. + * + * Copyright 2008 Analog Devices Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ +#ifndef _LIBERTAS_SPI_H_ +#define _LIBERTAS_SPI_H_ +struct libertas_spi_platform_data { + /* There are two ways to read data from the WLAN module's SPI + * interface. Setting 0 or 1 here controls which one is used. + * + * Usually you want to set use_dummy_writes = 1. + * However, if that doesn't work or if you are using a slow SPI clock + * speed, you may want to use 0 here. */ + u16 use_dummy_writes; + + /* GPIO number to use as chip select */ + u16 gpio_cs; +}; +#endif -- cgit v1.2.3-71-gd317 From d03415e6771cd709b2b2ec64d3e6315cc3ebfa74 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Mon, 12 Jan 2009 14:24:40 +0200 Subject: nl80211: Fix documentation errors Couple of '_ATTR's were missing and SEC_CHAN_OFFSET to CHANNEL_TYPE rename was missed in couple of places. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/nl80211.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index ee742bc9761e..4e7a7986a521 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -47,7 +47,7 @@ * @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME, * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, and/or - * %NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET. + * %NL80211_ATTR_WIPHY_CHANNEL_TYPE. * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request * or rename notification. Has attributes %NL80211_ATTR_WIPHY and * %NL80211_ATTR_WIPHY_NAME. @@ -84,7 +84,7 @@ * %NL80222_CMD_NEW_BEACON message) * @NL80211_CMD_SET_BEACON: set the beacon on an access point interface * using the %NL80211_ATTR_BEACON_INTERVAL, %NL80211_ATTR_DTIM_PERIOD, - * %NL80211_BEACON_HEAD and %NL80211_BEACON_TAIL attributes. + * %NL80211_ATTR_BEACON_HEAD and %NL80211_ATTR_BEACON_TAIL attributes. * @NL80211_CMD_NEW_BEACON: add a new beacon to an access point interface, * parameters are like for %NL80211_CMD_SET_BEACON. * @NL80211_CMD_DEL_BEACON: remove the beacon, stop sending it @@ -362,7 +362,7 @@ enum nl80211_attrs { #define NL80211_ATTR_BSS_BASIC_RATES NL80211_ATTR_BSS_BASIC_RATES #define NL80211_ATTR_WIPHY_TXQ_PARAMS NL80211_ATTR_WIPHY_TXQ_PARAMS #define NL80211_ATTR_WIPHY_FREQ NL80211_ATTR_WIPHY_FREQ -#define NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET +#define NL80211_ATTR_WIPHY_CHANNEL_TYPE NL80211_ATTR_WIPHY_CHANNEL_TYPE #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_REG_RULES 32 -- cgit v1.2.3-71-gd317 From 9dbeb91a8b97e2892c04461e28d2bdd0198b719d Mon Sep 17 00:00:00 2001 From: Gabor Juhos Date: Wed, 14 Jan 2009 20:17:08 +0100 Subject: ath9k: get EEPROM contents from platform data on AHB bus On the AR913x SOCs we have to provide EEPROM contents via platform_data, because accessing the flash via MMIO is not safe. Additionally different boards may store the radio calibration data at different locations. Changes-licensed-under: ISC Signed-off-by: Gabor Juhos Signed-off-by: Imre Kaloz Tested-by: Pavel Roskin Signed-off-by: John W. 
Linville --- drivers/net/wireless/ath9k/ahb.c | 27 ++++++++++++++++++++ drivers/net/wireless/ath9k/core.h | 1 + drivers/net/wireless/ath9k/eeprom.c | 51 +++---------------------------------- drivers/net/wireless/ath9k/pci.c | 18 +++++++++++++ include/linux/ath9k_platform.h | 28 ++++++++++++++++++++ 5 files changed, 77 insertions(+), 48 deletions(-) create mode 100644 include/linux/ath9k_platform.h (limited to 'include/linux') diff --git a/drivers/net/wireless/ath9k/ahb.c b/drivers/net/wireless/ath9k/ahb.c index 8cbd4c2a7fa0..7f2c3a09bcac 100644 --- a/drivers/net/wireless/ath9k/ahb.c +++ b/drivers/net/wireless/ath9k/ahb.c @@ -18,6 +18,7 @@ #include #include +#include #include "core.h" #include "reg.h" #include "hw.h" @@ -33,9 +34,29 @@ static void ath_ahb_cleanup(struct ath_softc *sc) iounmap(sc->mem); } +static bool ath_ahb_eeprom_read(struct ath_hal *ah, u32 off, u16 *data) +{ + struct ath_softc *sc = ah->ah_sc; + struct platform_device *pdev = to_platform_device(sc->dev); + struct ath9k_platform_data *pdata; + + pdata = (struct ath9k_platform_data *) pdev->dev.platform_data; + if (off >= (ARRAY_SIZE(pdata->eeprom_data))) { + DPRINTF(ah->ah_sc, ATH_DBG_FATAL, + "%s: flash read failed, offset %08x is out of range\n", + __func__, off); + return false; + } + + *data = pdata->eeprom_data[off]; + return true; +} + static struct ath_bus_ops ath_ahb_bus_ops = { .read_cachesize = ath_ahb_read_cachesize, .cleanup = ath_ahb_cleanup, + + .eeprom_read = ath_ahb_eeprom_read, }; static int ath_ahb_probe(struct platform_device *pdev) @@ -48,6 +69,12 @@ static int ath_ahb_probe(struct platform_device *pdev) int ret = 0; struct ath_hal *ah; + if (!pdev->dev.platform_data) { + dev_err(&pdev->dev, "no platform data specified\n"); + ret = -EINVAL; + goto err_out; + } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (res == NULL) { dev_err(&pdev->dev, "no memory resource found\n"); diff --git a/drivers/net/wireless/ath9k/core.h b/drivers/net/wireless/ath9k/core.h index c5dae11d6086..b687ae9c9e17 100644 --- a/drivers/net/wireless/ath9k/core.h +++ b/drivers/net/wireless/ath9k/core.h @@ -696,6 +696,7 @@ enum PROT_MODE { struct ath_bus_ops { void (*read_cachesize)(struct ath_softc *sc, int *csz); void (*cleanup)(struct ath_softc *sc); + bool (*eeprom_read)(struct ath_hal *ah, u32 off, u16 *data); }; struct ath_softc { diff --git a/drivers/net/wireless/ath9k/eeprom.c b/drivers/net/wireless/ath9k/eeprom.c index 1ef8b5a70e5b..50cb3883416a 100644 --- a/drivers/net/wireless/ath9k/eeprom.c +++ b/drivers/net/wireless/ath9k/eeprom.c @@ -91,53 +91,11 @@ static inline bool ath9k_hw_get_lower_upper_index(u8 target, u8 *pList, return false; } -static bool ath9k_hw_eeprom_read(struct ath_hal *ah, u32 off, u16 *data) -{ - (void)REG_READ(ah, AR5416_EEPROM_OFFSET + (off << AR5416_EEPROM_S)); - - if (!ath9k_hw_wait(ah, - AR_EEPROM_STATUS_DATA, - AR_EEPROM_STATUS_DATA_BUSY | - AR_EEPROM_STATUS_DATA_PROT_ACCESS, 0)) { - return false; - } - - *data = MS(REG_READ(ah, AR_EEPROM_STATUS_DATA), - AR_EEPROM_STATUS_DATA_VAL); - - return true; -} - -static int ath9k_hw_flash_map(struct ath_hal *ah) -{ - struct ath_hal_5416 *ahp = AH5416(ah); - - ahp->ah_cal_mem = ioremap(AR5416_EEPROM_START_ADDR, AR5416_EEPROM_MAX); - - if (!ahp->ah_cal_mem) { - DPRINTF(ah->ah_sc, ATH_DBG_EEPROM, - "cannot remap eeprom region \n"); - return -EIO; - } - - return 0; -} - -static bool ath9k_hw_flash_read(struct ath_hal *ah, u32 off, u16 *data) -{ - struct ath_hal_5416 *ahp = AH5416(ah); - - *data = ioread16(ahp->ah_cal_mem + off); - - return true; -} 
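A hypothetical board-support sketch, not part of this patch, of how an AR913x board might hand the calibration data to the driver: only struct ath9k_platform_data and its eeprom_data[] member come from the patch; the "ath9k" device name, register window and IRQ number are assumed placeholders that would have to match the real AHB driver and SoC.

#include <linux/kernel.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>
#include <linux/ath9k_platform.h>

/* eeprom_data[] gets filled from the board's flash during early init */
static struct ath9k_platform_data board_wmac_data;

static struct resource board_wmac_res[] = {
	{
		.start = 0x180c0000,		/* placeholder MMIO base */
		.end   = 0x180c0000 + 0x1ffff,
		.flags = IORESOURCE_MEM,
	},
	{
		.start = 2,			/* placeholder IRQ line */
		.end   = 2,
		.flags = IORESOURCE_IRQ,
	},
};

static struct platform_device board_wmac_device = {
	.name          = "ath9k",		/* assumed AHB driver name */
	.id            = -1,
	.resource      = board_wmac_res,
	.num_resources = ARRAY_SIZE(board_wmac_res),
	.dev = {
		.platform_data = &board_wmac_data,
	},
};

After copying the radio calibration words into board_wmac_data.eeprom_data[], the board registers the device with platform_device_register(&board_wmac_device), and ath_ahb_eeprom_read() above then serves EEPROM reads out of that array.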
- static inline bool ath9k_hw_nvram_read(struct ath_hal *ah, u32 off, u16 *data) { - if (ath9k_hw_use_flash(ah)) - return ath9k_hw_flash_read(ah, off, data); - else - return ath9k_hw_eeprom_read(ah, off, data); + struct ath_softc *sc = ah->ah_sc; + + return sc->bus_ops->eeprom_read(ah, off, data); } static bool ath9k_hw_fill_4k_eeprom(struct ath_hal *ah) @@ -2825,9 +2783,6 @@ int ath9k_hw_eeprom_attach(struct ath_hal *ah) int status; struct ath_hal_5416 *ahp = AH5416(ah); - if (ath9k_hw_use_flash(ah)) - ath9k_hw_flash_map(ah); - if (AR_SREV_9285(ah)) ahp->ah_eep_map = EEP_MAP_4KBITS; else diff --git a/drivers/net/wireless/ath9k/pci.c b/drivers/net/wireless/ath9k/pci.c index 4ff1caa9ba99..05612bf28360 100644 --- a/drivers/net/wireless/ath9k/pci.c +++ b/drivers/net/wireless/ath9k/pci.c @@ -58,9 +58,27 @@ static void ath_pci_cleanup(struct ath_softc *sc) pci_disable_device(pdev); } +static bool ath_pci_eeprom_read(struct ath_hal *ah, u32 off, u16 *data) +{ + (void)REG_READ(ah, AR5416_EEPROM_OFFSET + (off << AR5416_EEPROM_S)); + + if (!ath9k_hw_wait(ah, + AR_EEPROM_STATUS_DATA, + AR_EEPROM_STATUS_DATA_BUSY | + AR_EEPROM_STATUS_DATA_PROT_ACCESS, 0)) { + return false; + } + + *data = MS(REG_READ(ah, AR_EEPROM_STATUS_DATA), + AR_EEPROM_STATUS_DATA_VAL); + + return true; +} + static struct ath_bus_ops ath_pci_bus_ops = { .read_cachesize = ath_pci_read_cachesize, .cleanup = ath_pci_cleanup, + .eeprom_read = ath_pci_eeprom_read, }; static int ath_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h new file mode 100644 index 000000000000..b847fc7b93f9 --- /dev/null +++ b/include/linux/ath9k_platform.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2008 Atheros Communications Inc. + * Copyright (c) 2009 Gabor Juhos + * Copyright (c) 2009 Imre Kaloz + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _LINUX_ATH9K_PLATFORM_H +#define _LINUX_ATH9K_PLATFORM_H + +#define ATH9K_PLAT_EEP_MAX_WORDS 2048 + +struct ath9k_platform_data { + u16 eeprom_data[ATH9K_PLAT_EEP_MAX_WORDS]; +}; + +#endif /* _LINUX_ATH9K_PLATFORM_H */ -- cgit v1.2.3-71-gd317 From 9aed3cc124343d92be6697e9af3928bdfe8eb03e Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 13 Jan 2009 16:03:29 +0200 Subject: nl80211: New command for adding extra IE(s) into management frames A new nl80211 command, NL80211_CMD_SET_MGMT_EXTRA_IE, can be used to add arbitrary IE data into the end of management frames. The interface allows extra IEs to be configured for each management frame subtype, but only some of them (ProbeReq, ProbeResp, Auth, (Re)AssocReq, Deauth, Disassoc) are currently accepted in mac80211 implementation. This makes it easier to implement IEEE 802.11 extensions like WPS and FT that add IE(s) into some management frames. 
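As a purely illustrative userspace sketch (assuming libnl-3 style calls; the interface name, the vendor IE bytes and the lack of error handling are all placeholders), a configuration tool could push an extra IE into Probe Request frames roughly like this; subtype 4 corresponds to IEEE80211_STYPE_PROBE_REQ >> 4, the value the mac80211 code below switches on.

#include <net/if.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/nl80211.h>

/* a made-up vendor-specific IE: element ID 221, length 4, then payload */
static const unsigned char extra_ie[] = { 0xdd, 0x04, 0x00, 0x50, 0xf2, 0x04 };

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg = nlmsg_alloc();
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, "nl80211");

	genlmsg_put(msg, 0, 0, family, 0, 0,
		    NL80211_CMD_SET_MGMT_EXTRA_IE, 0);
	nla_put_u32(msg, NL80211_ATTR_IFINDEX, if_nametoindex("wlan0"));
	nla_put_u8(msg, NL80211_ATTR_MGMT_SUBTYPE, 4);	/* ProbeReq: stype >> 4 */
	nla_put(msg, NL80211_ATTR_IE, sizeof(extra_ie), extra_ie);

	nl_send_auto_complete(sk, msg);		/* error checking omitted */
	nl_recvmsgs_default(sk);		/* pick up the kernel's ACK */

	nlmsg_free(msg);
	nl_socket_free(sk);
	return 0;
}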
In addition, this can be useful for testing and experimentation purposes. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 22 +++++++++++++ include/net/cfg80211.h | 26 +++++++++++++++ net/mac80211/cfg.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 16 +++++++++ net/mac80211/iface.c | 7 ++++ net/mac80211/mlme.c | 52 +++++++++++++++++++++++++---- net/wireless/nl80211.c | 47 ++++++++++++++++++++++++++ 7 files changed, 246 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 4e7a7986a521..76aae3d8e97e 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -133,6 +133,14 @@ * @NL80211_CMD_SET_MESH_PARAMS: Set mesh networking properties for the * interface identified by %NL80211_ATTR_IFINDEX * + * @NL80211_CMD_SET_MGMT_EXTRA_IE: Set extra IEs for management frames. The + * interface is identified with %NL80211_ATTR_IFINDEX and the management + * frame subtype with %NL80211_ATTR_MGMT_SUBTYPE. The extra IE data to be + * added to the end of the specified management frame is specified with + * %NL80211_ATTR_IE. If the command succeeds, the requested data will be + * added to all specified management frames generated by + * kernel/firmware/driver. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -178,6 +186,8 @@ enum nl80211_commands { NL80211_CMD_GET_MESH_PARAMS, NL80211_CMD_SET_MESH_PARAMS, + NL80211_CMD_SET_MGMT_EXTRA_IE, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -190,6 +200,7 @@ enum nl80211_commands { * here */ #define NL80211_CMD_SET_BSS NL80211_CMD_SET_BSS +#define NL80211_CMD_SET_MGMT_EXTRA_IE NL80211_CMD_SET_MGMT_EXTRA_IE /** * enum nl80211_attrs - nl80211 netlink attributes @@ -284,6 +295,12 @@ enum nl80211_commands { * supported interface types, each a flag attribute with the number * of the interface mode. * + * @NL80211_ATTR_MGMT_SUBTYPE: Management frame subtype for + * %NL80211_CMD_SET_MGMT_EXTRA_IE. + * + * @NL80211_ATTR_IE: Information element(s) data (used, e.g., with + * %NL80211_CMD_SET_MGMT_EXTRA_IE). + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -348,6 +365,9 @@ enum nl80211_attrs { NL80211_ATTR_KEY_DEFAULT_MGMT, + NL80211_ATTR_MGMT_SUBTYPE, + NL80211_ATTR_IE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -363,6 +383,8 @@ enum nl80211_attrs { #define NL80211_ATTR_WIPHY_TXQ_PARAMS NL80211_ATTR_WIPHY_TXQ_PARAMS #define NL80211_ATTR_WIPHY_FREQ NL80211_ATTR_WIPHY_FREQ #define NL80211_ATTR_WIPHY_CHANNEL_TYPE NL80211_ATTR_WIPHY_CHANNEL_TYPE +#define NL80211_ATTR_MGMT_SUBTYPE NL80211_ATTR_MGMT_SUBTYPE +#define NL80211_ATTR_IE NL80211_ATTR_IE #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_REG_RULES 32 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index df78abc496f1..c7da88fb15b7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -433,6 +433,26 @@ struct ieee80211_txq_params { u8 aifs; }; +/** + * struct mgmt_extra_ie_params - Extra management frame IE parameters + * + * Used to add extra IE(s) into management frames. If the driver cannot add the + * requested data into all management frames of the specified subtype that are + * generated in kernel or firmware/hardware, it must reject the configuration + * call. 
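Each IE in the buffer is expected in its over-the-air form (element ID octet, length octet, then the payload), and several IEs may simply be concatenated.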
The IE data buffer is added to the end of the specified management + * frame body after all other IEs. This addition is not applied to frames that + * are injected through a monitor interface. + * + * @subtype: Management frame subtype + * @ies: IE data buffer or %NULL to remove previous data + * @ies_len: Length of @ies in octets + */ +struct mgmt_extra_ie_params { + u8 subtype; + u8 *ies; + int ies_len; +}; + /* from net/wireless.h */ struct wiphy; @@ -501,6 +521,8 @@ struct ieee80211_channel; * @set_txq_params: Set TX queue parameters * * @set_channel: Set channel + * + * @set_mgmt_extra_ie: Set extra IE data for management frames */ struct cfg80211_ops { int (*add_virtual_intf)(struct wiphy *wiphy, char *name, @@ -571,6 +593,10 @@ struct cfg80211_ops { int (*set_channel)(struct wiphy *wiphy, struct ieee80211_channel *chan, enum nl80211_channel_type channel_type); + + int (*set_mgmt_extra_ie)(struct wiphy *wiphy, + struct net_device *dev, + struct mgmt_extra_ie_params *params); }; /* temporary wext handlers */ diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 72c106915433..d1ac3ab2c515 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1175,6 +1175,87 @@ static int ieee80211_set_channel(struct wiphy *wiphy, return ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } +static int set_mgmt_extra_ie_sta(struct ieee80211_if_sta *ifsta, u8 subtype, + u8 *ies, size_t ies_len) +{ + switch (subtype) { + case IEEE80211_STYPE_PROBE_REQ >> 4: + kfree(ifsta->ie_probereq); + ifsta->ie_probereq = ies; + ifsta->ie_probereq_len = ies_len; + return 0; + case IEEE80211_STYPE_PROBE_RESP >> 4: + kfree(ifsta->ie_proberesp); + ifsta->ie_proberesp = ies; + ifsta->ie_proberesp_len = ies_len; + return 0; + case IEEE80211_STYPE_AUTH >> 4: + kfree(ifsta->ie_auth); + ifsta->ie_auth = ies; + ifsta->ie_auth_len = ies_len; + return 0; + case IEEE80211_STYPE_ASSOC_REQ >> 4: + kfree(ifsta->ie_assocreq); + ifsta->ie_assocreq = ies; + ifsta->ie_assocreq_len = ies_len; + return 0; + case IEEE80211_STYPE_REASSOC_REQ >> 4: + kfree(ifsta->ie_reassocreq); + ifsta->ie_reassocreq = ies; + ifsta->ie_reassocreq_len = ies_len; + return 0; + case IEEE80211_STYPE_DEAUTH >> 4: + kfree(ifsta->ie_deauth); + ifsta->ie_deauth = ies; + ifsta->ie_deauth_len = ies_len; + return 0; + case IEEE80211_STYPE_DISASSOC >> 4: + kfree(ifsta->ie_disassoc); + ifsta->ie_disassoc = ies; + ifsta->ie_disassoc_len = ies_len; + return 0; + } + + return -EOPNOTSUPP; +} + +static int ieee80211_set_mgmt_extra_ie(struct wiphy *wiphy, + struct net_device *dev, + struct mgmt_extra_ie_params *params) +{ + struct ieee80211_sub_if_data *sdata; + u8 *ies; + size_t ies_len; + int ret = -EOPNOTSUPP; + + if (params->ies) { + ies = kmemdup(params->ies, params->ies_len, GFP_KERNEL); + if (ies == NULL) + return -ENOMEM; + ies_len = params->ies_len; + } else { + ies = NULL; + ies_len = 0; + } + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + ret = set_mgmt_extra_ie_sta(&sdata->u.sta, params->subtype, + ies, ies_len); + break; + default: + ret = -EOPNOTSUPP; + break; + } + + if (ret) + kfree(ies); + return ret; +} + struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -1204,4 +1285,5 @@ struct cfg80211_ops mac80211_config_ops = { .change_bss = ieee80211_change_bss, .set_txq_params = ieee80211_set_txq_params, .set_channel = ieee80211_set_channel, + .set_mgmt_extra_ie = 
ieee80211_set_mgmt_extra_ie, }; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c9ffadb55d36..5eafd3affe27 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -331,6 +331,22 @@ struct ieee80211_if_sta { u32 supp_rates_bits[IEEE80211_NUM_BANDS]; int wmm_last_param_set; + + /* Extra IE data for management frames */ + u8 *ie_probereq; + size_t ie_probereq_len; + u8 *ie_proberesp; + size_t ie_proberesp_len; + u8 *ie_auth; + size_t ie_auth_len; + u8 *ie_assocreq; + size_t ie_assocreq_len; + u8 *ie_reassocreq; + size_t ie_reassocreq_len; + u8 *ie_deauth; + size_t ie_deauth_len; + u8 *ie_disassoc; + size_t ie_disassoc_len; }; struct ieee80211_if_mesh { diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5d5a029228be..8dc2c2188d92 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -632,6 +632,13 @@ static void ieee80211_teardown_sdata(struct net_device *dev) kfree(sdata->u.sta.assocreq_ies); kfree(sdata->u.sta.assocresp_ies); kfree_skb(sdata->u.sta.probe_resp); + kfree(sdata->u.sta.ie_probereq); + kfree(sdata->u.sta.ie_proberesp); + kfree(sdata->u.sta.ie_auth); + kfree(sdata->u.sta.ie_assocreq); + kfree(sdata->u.sta.ie_reassocreq); + kfree(sdata->u.sta.ie_deauth); + kfree(sdata->u.sta.ie_disassoc); break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_AP_VLAN: diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f0d42498c257..43da6227b37c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -131,6 +131,12 @@ u64 ieee80211_sta_get_rates(struct ieee80211_local *local, /* frame sending functions */ +static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len) +{ + if (ies) + memcpy(skb_put(skb, ies_len), ies, ies_len); +} + /* also used by scanning code */ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, u8 *ssid, size_t ssid_len) @@ -142,7 +148,8 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, u8 *pos, *supp_rates, *esupp_rates = NULL; int i; - skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200); + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 + + sdata->u.sta.ie_probereq_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for probe " "request\n", sdata->dev->name); @@ -189,6 +196,9 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, *pos = rate->bitrate / 5; } + add_extra_ies(skb, sdata->u.sta.ie_probereq, + sdata->u.sta.ie_probereq_len); + ieee80211_tx_skb(sdata, skb, 0); } @@ -202,7 +212,8 @@ static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; skb = dev_alloc_skb(local->hw.extra_tx_headroom + - sizeof(*mgmt) + 6 + extra_len); + sizeof(*mgmt) + 6 + extra_len + + sdata->u.sta.ie_auth_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for auth " "frame\n", sdata->dev->name); @@ -225,6 +236,7 @@ static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, mgmt->u.auth.status_code = cpu_to_le16(0); if (extra) memcpy(skb_put(skb, extra_len), extra, extra_len); + add_extra_ies(skb, sdata->u.sta.ie_auth, sdata->u.sta.ie_auth_len); ieee80211_tx_skb(sdata, skb, encrypt); } @@ -235,17 +247,26 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - u8 *pos, *ies, *ht_ie; + u8 *pos, *ies, *ht_ie, *e_ies; int i, len, count, rates_len, supp_rates_len; u16 capab; struct ieee80211_bss *bss; 
int wmm = 0; struct ieee80211_supported_band *sband; u64 rates = 0; + size_t e_ies_len; + + if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) { + e_ies = sdata->u.sta.ie_reassocreq; + e_ies_len = sdata->u.sta.ie_reassocreq_len; + } else { + e_ies = sdata->u.sta.ie_assocreq; + e_ies_len = sdata->u.sta.ie_assocreq_len; + } skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 + ifsta->extra_ie_len + - ifsta->ssid_len); + ifsta->ssid_len + e_ies_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for assoc " "frame\n", sdata->dev->name); @@ -436,6 +457,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs)); } + add_extra_ies(skb, e_ies, e_ies_len); + kfree(ifsta->assocreq_ies); ifsta->assocreq_ies_len = (skb->data + skb->len) - ies; ifsta->assocreq_ies = kmalloc(ifsta->assocreq_ies_len, GFP_KERNEL); @@ -453,8 +476,19 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_sta *ifsta = &sdata->u.sta; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; + u8 *ies; + size_t ies_len; - skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt)); + if (stype == IEEE80211_STYPE_DEAUTH) { + ies = sdata->u.sta.ie_deauth; + ies_len = sdata->u.sta.ie_deauth_len; + } else { + ies = sdata->u.sta.ie_disassoc; + ies_len = sdata->u.sta.ie_disassoc_len; + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + + ies_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for " "deauth/disassoc frame\n", sdata->dev->name); @@ -472,6 +506,8 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, /* u.deauth.reason_code == u.disassoc.reason_code */ mgmt->u.deauth.reason_code = cpu_to_le16(reason); + add_extra_ies(skb, ies, ies_len); + ieee80211_tx_skb(sdata, skb, ifsta->flags & IEEE80211_STA_MFP_ENABLED); } @@ -1473,7 +1509,8 @@ static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband; union iwreq_data wrqu; - skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400); + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400 + + sdata->u.sta.ie_proberesp_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for probe " "response\n", sdata->dev->name); @@ -1556,6 +1593,9 @@ static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, memcpy(pos, &bss->supp_rates[8], rates); } + add_extra_ies(skb, sdata->u.sta.ie_proberesp, + sdata->u.sta.ie_proberesp_len); + ifsta->probe_resp = skb; ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 123d3b160fad..09a5d0f1d6dc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -105,6 +105,10 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_HT_CAPABILITY] = { .type = NLA_BINARY, .len = NL80211_HT_CAPABILITY_LEN }, + + [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, + [NL80211_ATTR_IE] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, }; /* message building helper */ @@ -2149,6 +2153,43 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } +static int nl80211_set_mgmt_extra_ie(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + struct mgmt_extra_ie_params params; + + memset(¶ms, 0, sizeof(params)); + + if (!info->attrs[NL80211_ATTR_MGMT_SUBTYPE]) + 
return -EINVAL; + params.subtype = nla_get_u8(info->attrs[NL80211_ATTR_MGMT_SUBTYPE]); + if (params.subtype > 15) + return -EINVAL; /* FC Subtype field is 4 bits (0..15) */ + + if (info->attrs[NL80211_ATTR_IE]) { + params.ies = nla_data(info->attrs[NL80211_ATTR_IE]); + params.ies_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + return err; + + if (drv->ops->set_mgmt_extra_ie) { + rtnl_lock(); + err = drv->ops->set_mgmt_extra_ie(&drv->wiphy, dev, ¶ms); + rtnl_unlock(); + } else + err = -EOPNOTSUPP; + + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -2310,6 +2351,12 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_SET_MGMT_EXTRA_IE, + .doit = nl80211_set_mgmt_extra_ie, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; /* multicast groups */ -- cgit v1.2.3-71-gd317 From f797eb7e2903571e9c0e7e5d64113f51209f8dc4 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Mon, 19 Jan 2009 18:48:46 +0200 Subject: mac80211: Fix MFP Association Comeback to use Timeout Interval IE The separate Association Comeback Time IE was removed from IEEE 802.11w and the Timeout Interval IE (from IEEE 802.11r) is used instead. The editing on this is still somewhat incomplete in IEEE 802.11w/D7.0, but still, the use of Timeout Interval IE is the expected mechanism. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 8 +++++++- net/mac80211/ieee80211_i.h | 4 ++-- net/mac80211/mlme.c | 5 +++-- net/mac80211/util.c | 6 +++--- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 7800e20f197f..b1bb817d1427 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1036,8 +1036,8 @@ enum ieee80211_eid { WLAN_EID_HT_INFORMATION = 61, /* 802.11i */ WLAN_EID_RSN = 48, + WLAN_EID_TIMEOUT_INTERVAL = 56, WLAN_EID_MMIE = 76 /* 802.11w */, - WLAN_EID_ASSOC_COMEBACK_TIME = 77, WLAN_EID_WPA = 221, WLAN_EID_GENERIC = 221, WLAN_EID_VENDOR_SPECIFIC = 221, @@ -1126,6 +1126,12 @@ struct ieee80211_country_ie_triplet { }; } __attribute__ ((packed)); +enum ieee80211_timeout_interval_type { + WLAN_TIMEOUT_REASSOC_DEADLINE = 1 /* 802.11r */, + WLAN_TIMEOUT_KEY_LIFETIME = 2 /* 802.11r */, + WLAN_TIMEOUT_ASSOC_COMEBACK = 3 /* 802.11w */, +}; + /* BACK action code */ enum ieee80211_back_actioncode { WLAN_ACTION_ADDBA_REQ = 0, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index faa2476a2451..a8c72742a8b1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -837,7 +837,7 @@ struct ieee802_11_elems { u8 *country_elem; u8 *pwr_constr_elem; u8 *quiet_elem; /* first quite element */ - u8 *assoc_comeback; + u8 *timeout_int; /* length of them, respectively */ u8 ssid_len; @@ -865,7 +865,7 @@ struct ieee802_11_elems { u8 pwr_constr_elem_len; u8 quiet_elem_len; u8 num_of_quiet_elem; /* can be more the one */ - u8 assoc_comeback_len; + u8 timeout_int_len; }; static inline struct ieee80211_local *hw_to_local( diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 43da6227b37c..b9e4b93089c4 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1317,9 +1317,10 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); 
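The Timeout Interval element handled in the hunk below carries a five-octet body: a one-octet timeout interval type (WLAN_TIMEOUT_ASSOC_COMEBACK for the MFP comeback case) followed by a little-endian 32-bit timeout value in TUs, which is why the length check becomes 5 and the value is read at offset 1. A minimal sketch of that layout, for illustration only (mac80211 reads the raw bytes rather than defining such a struct):

/* Illustrative only: body of the Timeout Interval element (802.11r/802.11w). */
#include <linux/types.h>
#include <asm/unaligned.h>

struct timeout_interval_body {
	u8 type;		/* e.g. WLAN_TIMEOUT_ASSOC_COMEBACK == 3 */
	__le32 value;		/* timeout in TUs; 1 TU = 1024 microseconds */
} __attribute__ ((packed));

/* Extracting the TU value, as the hunk below does with raw pointers: */
static inline u32 timeout_int_tu(const u8 *timeout_int)
{
	return get_unaligned_le32(timeout_int + 1);
}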
if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && - elems.assoc_comeback && elems.assoc_comeback_len == 4) { + elems.timeout_int && elems.timeout_int_len == 5 && + elems.timeout_int[0] == WLAN_TIMEOUT_ASSOC_COMEBACK) { u32 tu, ms; - tu = get_unaligned_le32(elems.assoc_comeback); + tu = get_unaligned_le32(elems.timeout_int + 1); ms = tu * 1024 / 1000; printk(KERN_DEBUG "%s: AP rejected association temporarily; " "comeback duration %u TU (%u ms)\n", diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 963e0473205c..3f559e3d0a7c 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -653,9 +653,9 @@ void ieee802_11_parse_elems(u8 *start, size_t len, elems->pwr_constr_elem = pos; elems->pwr_constr_elem_len = elen; break; - case WLAN_EID_ASSOC_COMEBACK_TIME: - elems->assoc_comeback = pos; - elems->assoc_comeback_len = elen; + case WLAN_EID_TIMEOUT_INTERVAL: + elems->timeout_int = pos; + elems->timeout_int_len = elen; break; default: break; -- cgit v1.2.3-71-gd317 From f677d7702d48b7b3dfcce3b2c0db601dbee0aa24 Mon Sep 17 00:00:00 2001 From: Tulio Magno Quites Machado Filho Date: Sun, 25 Jan 2009 23:54:25 +0100 Subject: ath5k: support LED's on emachines E510 notebook Add vendor ID for AMBIT and use it to set the ath5k LED gpio. base.c: Changes-licensed-under: 3-Clause-BSD Signed-off-by: Tulio Magno Quites Machado Filho Acked-by: Bob Copeland Signed-off-by: John W. Linville --- drivers/net/wireless/ath5k/base.c | 8 ++++++-- include/linux/pci_ids.h | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath5k/base.c b/drivers/net/wireless/ath5k/base.c index b3f41acb9065..368db944f11f 100644 --- a/drivers/net/wireless/ath5k/base.c +++ b/drivers/net/wireless/ath5k/base.c @@ -2626,8 +2626,12 @@ ath5k_init_leds(struct ath5k_softc *sc) sc->led_pin = 1; sc->led_on = 1; /* active high */ } - /* Pin 3 on Foxconn chips used in Acer Aspire One (0x105b:e008) */ - if (pdev->subsystem_vendor == PCI_VENDOR_ID_FOXCONN) { + /* + * Pin 3 on Foxconn chips used in Acer Aspire One (0x105b:e008) and + * in emachines notebooks with AMBIT subsystem. + */ + if (pdev->subsystem_vendor == PCI_VENDOR_ID_FOXCONN || + pdev->subsystem_vendor == PCI_VENDOR_ID_AMBIT) { __set_bit(ATH_STAT_LEDSOFT, sc->status); sc->led_pin = 3; sc->led_on = 0; /* active low */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 5b7a48c1d616..b7697c934a49 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1967,6 +1967,8 @@ #define PCI_VENDOR_ID_SAMSUNG 0x144d +#define PCI_VENDOR_ID_AMBIT 0x1468 + #define PCI_VENDOR_ID_MYRICOM 0x14c1 #define PCI_VENDOR_ID_TITAN 0x14D2 -- cgit v1.2.3-71-gd317 From 5d0d9be8ef456afc6c3fb5f8aad06ef19b704b05 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 14:19:48 +0000 Subject: gro: Move common completion code into helpers Currently VLAN still has a bit of common code handling the aftermath of GRO that's shared with the common path. This patch moves them into shared helpers to reduce code duplication. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 ++ net/8021q/vlan_core.c | 39 +++--------------------- net/core/dev.c | 76 ++++++++++++++++++++++++++++++++--------------- 3 files changed, 59 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd8a35b3e8b2..20419508eec1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1375,12 +1375,15 @@ extern int netif_receive_skb(struct sk_buff *skb); extern void napi_gro_flush(struct napi_struct *napi); extern int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb); +extern int napi_skb_finish(int ret, struct sk_buff *skb); extern int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); extern void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb); extern struct sk_buff * napi_fraginfo_skb(struct napi_struct *napi, struct napi_gro_fraginfo *info); +extern int napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, int ret); extern int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info); extern void netif_nit_deliver(struct sk_buff *skb); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index e9db889d6222..2eb057a74654 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -98,22 +98,7 @@ drop: int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct sk_buff *skb) { - int err = NET_RX_SUCCESS; - - switch (vlan_gro_common(napi, grp, vlan_tci, skb)) { - case -1: - return netif_receive_skb(skb); - - case 2: - err = NET_RX_DROP; - /* fall through */ - - case 1: - kfree_skb(skb); - break; - } - - return err; + return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); } EXPORT_SYMBOL(vlan_gro_receive); @@ -121,27 +106,11 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct napi_gro_fraginfo *info) { struct sk_buff *skb = napi_fraginfo_skb(napi, info); - int err = NET_RX_DROP; if (!skb) - goto out; - - err = NET_RX_SUCCESS; - - switch (vlan_gro_common(napi, grp, vlan_tci, skb)) { - case -1: - return netif_receive_skb(skb); - - case 2: - err = NET_RX_DROP; - /* fall through */ - - case 1: - napi_reuse_skb(napi, skb); - break; - } + return NET_RX_DROP; -out: - return err; + return napi_frags_finish(napi, skb, + vlan_gro_common(napi, grp, vlan_tci, skb)); } EXPORT_SYMBOL(vlan_gro_frags); diff --git a/net/core/dev.c b/net/core/dev.c index e61b95c11fc0..cd23ae15a1d5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,6 +135,14 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) +enum { + GRO_MERGED, + GRO_MERGED_FREE, + GRO_HELD, + GRO_NORMAL, + GRO_DROP, +}; + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -2369,7 +2377,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) int count = 0; int same_flow; int mac_len; - int free; + int ret; if (!(skb->dev->features & NETIF_F_GRO)) goto normal; @@ -2412,7 +2420,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) goto normal; same_flow = NAPI_GRO_CB(skb)->same_flow; - free = NAPI_GRO_CB(skb)->free; + ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; if (pp) { struct sk_buff *nskb = *pp; @@ -2435,12 +2443,13 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_shinfo(skb)->gso_size = skb->len; skb->next = napi->gro_list; napi->gro_list = skb; + ret = GRO_HELD; ok: - return free; + return ret; normal: - return -1; + return GRO_NORMAL; } EXPORT_SYMBOL(dev_gro_receive); @@ -2456,18 +2465,30 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return dev_gro_receive(napi, skb); } -int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +int napi_skb_finish(int ret, struct sk_buff *skb) { - switch (__napi_gro_receive(napi, skb)) { - case -1: + int err = NET_RX_SUCCESS; + + switch (ret) { + case GRO_NORMAL: return netif_receive_skb(skb); - case 1: + case GRO_DROP: + err = NET_RX_DROP; + /* fall through */ + + case GRO_MERGED_FREE: kfree_skb(skb); break; } - return NET_RX_SUCCESS; + return err; +} +EXPORT_SYMBOL(napi_skb_finish); + +int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + return napi_skb_finish(__napi_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -2520,29 +2541,36 @@ out: } EXPORT_SYMBOL(napi_fraginfo_skb); -int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { - struct sk_buff *skb = napi_fraginfo_skb(napi, info); - int err = NET_RX_DROP; - - if (!skb) - goto out; + int err = NET_RX_SUCCESS; - err = NET_RX_SUCCESS; - - switch (__napi_gro_receive(napi, skb)) { - case -1: + switch (ret) { + case GRO_NORMAL: return netif_receive_skb(skb); - case 0: - goto out; - } + case GRO_DROP: + err = NET_RX_DROP; + /* fall through */ - napi_reuse_skb(napi, skb); + case GRO_MERGED_FREE: + napi_reuse_skb(napi, skb); + break; + } -out: return err; } +EXPORT_SYMBOL(napi_frags_finish); + +int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +{ + struct sk_buff *skb = napi_fraginfo_skb(napi, info); + + if (!skb) + return NET_RX_DROP; + + return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); +} EXPORT_SYMBOL(napi_gro_frags); static int process_backlog(struct napi_struct *napi, int quota) -- cgit v1.2.3-71-gd317 From 86911732d3996a9da07914b280621450111bb6da Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 14:19:50 +0000 Subject: gro: Avoid copying headers of unmerged packets Unfortunately simplicity isn't always the best. The fraginfo interface turned out to be suboptimal. The problem was quite obvious. For every packet, we have to copy the headers from the frags structure into skb->head, even though for 99% of the packets this part is immediately thrown away after the merge. LRO didn't have this problem because it directly read the headers from the frags structure. This patch attempts to address this by creating an interface that allows GRO to access the headers in the first frag without having to copy it. Because all drivers that use frags place the headers in the first frag this optimisation should be enough. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 26 ++++++++++++++++++ include/linux/skbuff.h | 2 -- net/8021q/vlan_core.c | 2 ++ net/core/dev.c | 70 +++++++++++++++++++++++++++++++++++++++-------- net/core/skbuff.c | 23 ++++++++++------ net/ipv4/af_inet.c | 10 +++---- net/ipv4/tcp.c | 16 +++++------ net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/af_inet6.c | 30 +++++++++++++------- net/ipv6/tcp_ipv6.c | 2 +- 10 files changed, 137 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 20419508eec1..7a5057fbb7cd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -984,6 +984,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, void netif_napi_del(struct napi_struct *napi); struct napi_gro_cb { + /* This indicates where we are processing relative to skb->data. */ + int data_offset; + /* This is non-zero if the packet may be of the same flow. */ int same_flow; @@ -1087,6 +1090,29 @@ extern int dev_restart(struct net_device *dev); #ifdef CONFIG_NETPOLL_TRAP extern int netpoll_trap(void); #endif +extern void *skb_gro_header(struct sk_buff *skb, unsigned int hlen); +extern int skb_gro_receive(struct sk_buff **head, + struct sk_buff *skb); + +static inline unsigned int skb_gro_offset(const struct sk_buff *skb) +{ + return NAPI_GRO_CB(skb)->data_offset; +} + +static inline unsigned int skb_gro_len(const struct sk_buff *skb) +{ + return skb->len - NAPI_GRO_CB(skb)->data_offset; +} + +static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) +{ + NAPI_GRO_CB(skb)->data_offset += len; +} + +static inline void skb_gro_reset_offset(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->data_offset = 0; +} static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a2c2378a9c58..08670d017479 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1687,8 +1687,6 @@ extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); extern struct sk_buff *skb_segment(struct sk_buff *skb, int features); -extern int skb_gro_receive(struct sk_buff **head, - struct sk_buff *skb); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 2eb057a74654..378fa69d625a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -98,6 +98,8 @@ drop: int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct sk_buff *skb) { + skb_gro_reset_offset(skb); + return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); } EXPORT_SYMBOL(vlan_gro_receive); diff --git a/net/core/dev.c b/net/core/dev.c index cd23ae15a1d5..df406dcf7482 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -215,6 +215,13 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; } +static inline void *skb_gro_mac_header(struct sk_buff *skb) +{ + return skb_headlen(skb) ? 
skb_mac_header(skb) : + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; +} + /* Device list insertion */ static int list_netdevice(struct net_device *dev) { @@ -2350,7 +2357,6 @@ static int napi_gro_complete(struct sk_buff *skb) out: skb_shinfo(skb)->gso_size = 0; - __skb_push(skb, -skb_network_offset(skb)); return netif_receive_skb(skb); } @@ -2368,6 +2374,25 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); +void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) +{ + unsigned int offset = skb_gro_offset(skb); + + hlen += offset; + if (hlen <= skb_headlen(skb)) + return skb->data + offset; + + if (unlikely(!skb_shinfo(skb)->nr_frags || + skb_shinfo(skb)->frags[0].size <= + hlen - skb_headlen(skb) || + PageHighMem(skb_shinfo(skb)->frags[0].page))) + return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; + + return page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset + offset; +} +EXPORT_SYMBOL(skb_gro_header); + int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; @@ -2388,11 +2413,13 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { struct sk_buff *p; + void *mac; if (ptype->type != type || ptype->dev || !ptype->gro_receive) continue; - skb_reset_network_header(skb); + skb_set_network_header(skb, skb_gro_offset(skb)); + mac = skb_gro_mac_header(skb); mac_len = skb->network_header - skb->mac_header; skb->mac_len = mac_len; NAPI_GRO_CB(skb)->same_flow = 0; @@ -2406,8 +2433,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) continue; if (p->mac_len != mac_len || - memcmp(skb_mac_header(p), skb_mac_header(skb), - mac_len)) + memcmp(skb_mac_header(p), mac, mac_len)) NAPI_GRO_CB(p)->same_flow = 0; } @@ -2434,13 +2460,11 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (same_flow) goto ok; - if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) { - __skb_push(skb, -skb_network_offset(skb)); + if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) goto normal; - } NAPI_GRO_CB(skb)->count = 1; - skb_shinfo(skb)->gso_size = skb->len; + skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; ret = GRO_HELD; @@ -2488,6 +2512,8 @@ EXPORT_SYMBOL(napi_skb_finish); int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + skb_gro_reset_offset(skb); + return napi_skb_finish(__napi_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -2506,6 +2532,7 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, { struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; + struct ethhdr *eth; napi->skb = NULL; @@ -2525,13 +2552,23 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, skb->len += info->len; skb->truesize += info->len; - if (!pskb_may_pull(skb, ETH_HLEN)) { + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + eth = skb_gro_header(skb, sizeof(*eth)); + if (!eth) { napi_reuse_skb(napi, skb); skb = NULL; goto out; } - skb->protocol = eth_type_trans(skb, dev); + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. 
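As an aside on the header-access helpers used throughout these hunks: a protocol gro_receive callback is expected to peek at its header with skb_gro_header(), account for consumed bytes with skb_gro_pull(), and compare lengths against skb_gro_len() instead of skb->len. A condensed sketch of a hypothetical handler built on the new helpers (the protocol name and header struct are invented; flow matching and merging are omitted):

/* Illustrative sketch only; "myproto" is hypothetical. */
#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct myproto_hdr {
	__be16 tot_len;
};

static struct sk_buff **myproto_gro_receive(struct sk_buff **head,
					    struct sk_buff *skb)
{
	struct myproto_hdr *mh;

	/* Peek at the header without copying it into skb->head. */
	mh = skb_gro_header(skb, sizeof(*mh));
	if (unlikely(!mh))
		goto flush;

	/* skb_gro_len() is the remaining length from the GRO cursor. */
	if (ntohs(mh->tot_len) != skb_gro_len(skb))
		goto flush;

	/* Advance past our header before handing off to the next layer. */
	skb_gro_pull(skb, sizeof(*mh));
	skb_set_transport_header(skb, skb_gro_offset(skb));

	return NULL;	/* flow lookup and merge handling omitted */

flush:
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}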
+ */ + skb->protocol = eth->h_proto; skb->ip_summed = info->ip_summed; skb->csum = info->csum; @@ -2544,10 +2581,21 @@ EXPORT_SYMBOL(napi_fraginfo_skb); int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { int err = NET_RX_SUCCESS; + int may; switch (ret) { case GRO_NORMAL: - return netif_receive_skb(skb); + case GRO_HELD: + may = pskb_may_pull(skb, skb_gro_offset(skb)); + BUG_ON(!may); + + skb->protocol = eth_type_trans(skb, napi->dev); + + if (ret == GRO_NORMAL) + return netif_receive_skb(skb); + + skb_gro_pull(skb, -ETH_HLEN); + break; case GRO_DROP: err = NET_RX_DROP; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2e5f2ca3bdcd..f9f4065a7e9b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2584,17 +2584,21 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct sk_buff *p = *head; struct sk_buff *nskb; unsigned int headroom; - unsigned int hlen = p->data - skb_mac_header(p); - unsigned int len = skb->len; + unsigned int len = skb_gro_len(skb); - if (hlen + p->len + len >= 65536) + if (p->len + len >= 65536) return -E2BIG; if (skb_shinfo(p)->frag_list) goto merge; - else if (!skb_headlen(p) && !skb_headlen(skb) && - skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags < + else if (skb_headlen(skb) <= skb_gro_offset(skb) && + skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags <= MAX_SKB_FRAGS) { + skb_shinfo(skb)->frags[0].page_offset += + skb_gro_offset(skb) - skb_headlen(skb); + skb_shinfo(skb)->frags[0].size -= + skb_gro_offset(skb) - skb_headlen(skb); + memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); @@ -2611,7 +2615,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } headroom = skb_headroom(p); - nskb = netdev_alloc_skb(p->dev, headroom); + nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); if (unlikely(!nskb)) return -ENOMEM; @@ -2619,12 +2623,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) nskb->mac_len = p->mac_len; skb_reserve(nskb, headroom); + __skb_put(nskb, skb_gro_offset(p)); - skb_set_mac_header(nskb, -hlen); + skb_set_mac_header(nskb, skb_mac_header(p) - p->data); skb_set_network_header(nskb, skb_network_offset(p)); skb_set_transport_header(nskb, skb_transport_offset(p)); - memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen); + __skb_pull(p, skb_gro_offset(p)); + memcpy(skb_mac_header(nskb), skb_mac_header(p), + p->data - skb_mac_header(p)); *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 743f5542d65a..d6770f295d5b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1253,10 +1253,10 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, int proto; int id; - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + iph = skb_gro_header(skb, sizeof(*iph)); + if (unlikely(!iph)) goto out; - iph = ip_hdr(skb); proto = iph->protocol & (MAX_INET_PROTOS - 1); rcu_read_lock(); @@ -1270,7 +1270,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto out_unlock; - flush = ntohs(iph->tot_len) != skb->len || + flush = ntohs(iph->tot_len) != skb_gro_len(skb) || iph->frag_off != htons(IP_DF); id = ntohs(iph->id); @@ -1298,8 +1298,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, } NAPI_GRO_CB(skb)->flush |= flush; - __skb_pull(skb, sizeof(*iph)); - skb_reset_transport_header(skb); + skb_gro_pull(skb, 
sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); pp = ops->gro_receive(head, skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0cd71b84e483..1cd608253940 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,19 +2481,19 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) unsigned int mss = 1; int flush = 1; - if (!pskb_may_pull(skb, sizeof(*th))) + th = skb_gro_header(skb, sizeof(*th)); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; - if (!pskb_may_pull(skb, thlen)) + th = skb_gro_header(skb, thlen); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); - __skb_pull(skb, thlen); + skb_gro_pull(skb, thlen); flags = tcp_flag_word(th); @@ -2521,10 +2521,10 @@ found: flush |= th->ack_seq != th2->ack_seq || th->window != th2->window; flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th)); - total = p->len; + total = skb_gro_len(p); mss = skb_shinfo(p)->gso_size; - flush |= skb->len > mss || skb->len <= 0; + flush |= skb_gro_len(skb) > mss || !skb_gro_len(skb); flush |= ntohl(th2->seq) + total != ntohl(th->seq); if (flush || skb_gro_receive(head, skb)) { @@ -2537,7 +2537,7 @@ found: tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: - flush = skb->len < mss; + flush = skb_gro_len(skb) < mss; flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 19d7b429a262..f6b962f56ab4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2355,7 +2355,7 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) switch (skb->ip_summed) { case CHECKSUM_COMPLETE: - if (!tcp_v4_check(skb->len, iph->saddr, iph->daddr, + if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c802bc1658a8..bd91eadcbe3f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -799,24 +799,34 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, int proto; __wsum csum; - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + iph = skb_gro_header(skb, sizeof(*iph)); + if (unlikely(!iph)) goto out; - iph = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*iph)); + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); - flush += ntohs(iph->payload_len) != skb->len; + flush += ntohs(iph->payload_len) != skb_gro_len(skb); rcu_read_lock(); - proto = ipv6_gso_pull_exthdrs(skb, iph->nexthdr); - iph = ipv6_hdr(skb); - IPV6_GRO_CB(skb)->proto = proto; + proto = iph->nexthdr; ops = rcu_dereference(inet6_protos[proto]); - if (!ops || !ops->gro_receive) - goto out_unlock; + if (!ops || !ops->gro_receive) { + __pskb_pull(skb, skb_gro_offset(skb)); + proto = ipv6_gso_pull_exthdrs(skb, proto); + skb_gro_pull(skb, -skb_transport_offset(skb)); + skb_reset_transport_header(skb); + __skb_push(skb, skb_gro_offset(skb)); + + if (!ops || !ops->gro_receive) + goto out_unlock; + + iph = ipv6_hdr(skb); + } + + IPV6_GRO_CB(skb)->proto = proto; flush--; - skb_reset_transport_header(skb); nlen = skb_network_header_len(skb); for (p = *head; p; p = p->next) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e5b85d45bee8..00f1269e11e9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -948,7 +948,7 @@ struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) switch (skb->ip_summed) { case 
CHECKSUM_COMPLETE: - if (!tcp_v6_check(skb->len, &iph->saddr, &iph->daddr, + if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; -- cgit v1.2.3-71-gd317 From d23f028a4ddce8b783c212bfe911d1d307ff3617 Mon Sep 17 00:00:00 2001 From: Steve Glendinning Date: Tue, 27 Jan 2009 06:51:11 +0000 Subject: smsc911x: add external phy detection overrides On LAN9115/LAN9117/LAN9215/LAN9217, external phys are supported. These are usually indicated by a hardware strap which sets an "external PHY detected" bit in the HW_CFG register. In some cases it is desirable to override this hardware strap and force use of either the internal phy or an external PHY. This patch adds SMSC911X_FORCE_INTERNAL_PHY and SMSC911X_FORCE_EXTERNAL_PHY flags so a platform can indicate this preference via its platform_data. Signed-off-by: Steve Glendinning Acked-by: Sascha Hauer Tested-by: Sascha Hauer Signed-off-by: David S. Miller --- drivers/net/smsc911x.c | 75 ++++++++++++++++++++++++------------------------ include/linux/smsc911x.h | 2 ++ 2 files changed, 40 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c index 6aa2b165de6a..27d2d7c45519 100644 --- a/drivers/net/smsc911x.c +++ b/drivers/net/smsc911x.c @@ -368,48 +368,53 @@ out: return reg; } -/* Autodetects and initialises external phy for SMSC9115 and SMSC9117 flavors. - * If something goes wrong, returns -ENODEV to revert back to internal phy. - * Performed at initialisation only, so interrupts are enabled */ -static int smsc911x_phy_initialise_external(struct smsc911x_data *pdata) +/* Switch to external phy. Assumes tx and rx are stopped. */ +static void smsc911x_phy_enable_external(struct smsc911x_data *pdata) { unsigned int hwcfg = smsc911x_reg_read(pdata, HW_CFG); - /* External phy is requested, supported, and detected */ - if (hwcfg & HW_CFG_EXT_PHY_DET_) { + /* Disable phy clocks to the MAC */ + hwcfg &= (~HW_CFG_PHY_CLK_SEL_); + hwcfg |= HW_CFG_PHY_CLK_SEL_CLK_DIS_; + smsc911x_reg_write(pdata, HW_CFG, hwcfg); + udelay(10); /* Enough time for clocks to stop */ - /* Switch to external phy. Assuming tx and rx are stopped - * because smsc911x_phy_initialise is called before - * smsc911x_rx_initialise and tx_initialise. */ + /* Switch to external phy */ + hwcfg |= HW_CFG_EXT_PHY_EN_; + smsc911x_reg_write(pdata, HW_CFG, hwcfg); - /* Disable phy clocks to the MAC */ - hwcfg &= (~HW_CFG_PHY_CLK_SEL_); - hwcfg |= HW_CFG_PHY_CLK_SEL_CLK_DIS_; - smsc911x_reg_write(pdata, HW_CFG, hwcfg); - udelay(10); /* Enough time for clocks to stop */ + /* Enable phy clocks to the MAC */ + hwcfg &= (~HW_CFG_PHY_CLK_SEL_); + hwcfg |= HW_CFG_PHY_CLK_SEL_EXT_PHY_; + smsc911x_reg_write(pdata, HW_CFG, hwcfg); + udelay(10); /* Enough time for clocks to restart */ - /* Switch to external phy */ - hwcfg |= HW_CFG_EXT_PHY_EN_; - smsc911x_reg_write(pdata, HW_CFG, hwcfg); - - /* Enable phy clocks to the MAC */ - hwcfg &= (~HW_CFG_PHY_CLK_SEL_); - hwcfg |= HW_CFG_PHY_CLK_SEL_EXT_PHY_; - smsc911x_reg_write(pdata, HW_CFG, hwcfg); - udelay(10); /* Enough time for clocks to restart */ + hwcfg |= HW_CFG_SMI_SEL_; + smsc911x_reg_write(pdata, HW_CFG, hwcfg); +} - hwcfg |= HW_CFG_SMI_SEL_; - smsc911x_reg_write(pdata, HW_CFG, hwcfg); +/* Autodetects and enables external phy if present on supported chips. + * autodetection can be overridden by specifying SMSC911X_FORCE_INTERNAL_PHY + * or SMSC911X_FORCE_EXTERNAL_PHY in the platform_data flags. 
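For example, a board file that wants to pin the driver to the internal PHY regardless of the HW_CFG strap could pass platform data along these lines (a sketch only; the memory window and IRQ number are placeholders, and other config fields are left at their defaults):

/* Board-file sketch with placeholder resources; forces the internal PHY. */
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/smsc911x.h>

static struct smsc911x_platform_config board_smsc911x_config = {
	.flags		= SMSC911X_USE_32BIT | SMSC911X_FORCE_INTERNAL_PHY,
};

static struct resource board_smsc911x_resources[] = {
	{
		.start	= 0x20000000,		/* placeholder chip select */
		.end	= 0x20000000 + 0xff,
		.flags	= IORESOURCE_MEM,
	}, {
		.start	= 42,			/* placeholder IRQ */
		.end	= 42,
		.flags	= IORESOURCE_IRQ,
	},
};

static struct platform_device board_smsc911x_device = {
	.name		= "smsc911x",
	.id		= -1,
	.resource	= board_smsc911x_resources,
	.num_resources	= ARRAY_SIZE(board_smsc911x_resources),
	.dev		= {
		.platform_data = &board_smsc911x_config,
	},
};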
*/ +static void smsc911x_phy_initialise_external(struct smsc911x_data *pdata) +{ + unsigned int hwcfg = smsc911x_reg_read(pdata, HW_CFG); - SMSC_TRACE(HW, "Successfully switched to external PHY"); + if (pdata->config.flags & SMSC911X_FORCE_INTERNAL_PHY) { + SMSC_TRACE(HW, "Forcing internal PHY"); + pdata->using_extphy = 0; + } else if (pdata->config.flags & SMSC911X_FORCE_EXTERNAL_PHY) { + SMSC_TRACE(HW, "Forcing external PHY"); + smsc911x_phy_enable_external(pdata); + pdata->using_extphy = 1; + } else if (hwcfg & HW_CFG_EXT_PHY_DET_) { + SMSC_TRACE(HW, "HW_CFG EXT_PHY_DET set, using external PHY"); + smsc911x_phy_enable_external(pdata); pdata->using_extphy = 1; } else { - SMSC_WARNING(HW, "No external PHY detected, " - "Using internal PHY instead."); - /* Use internal phy */ - return -ENODEV; + SMSC_TRACE(HW, "HW_CFG EXT_PHY_DET clear, using internal PHY"); + pdata->using_extphy = 0; } - return 0; } /* Fetches a tx status out of the status fifo */ @@ -825,22 +830,18 @@ static int __devinit smsc911x_mii_init(struct platform_device *pdev, pdata->mii_bus->parent = &pdev->dev; - pdata->using_extphy = 0; - switch (pdata->idrev & 0xFFFF0000) { case 0x01170000: case 0x01150000: case 0x117A0000: case 0x115A0000: /* External PHY supported, try to autodetect */ - if (smsc911x_phy_initialise_external(pdata) < 0) { - SMSC_TRACE(HW, "No external PHY detected, " - "using internal PHY"); - } + smsc911x_phy_initialise_external(pdata); break; default: SMSC_TRACE(HW, "External PHY is not supported, " "using internal PHY"); + pdata->using_extphy = 0; break; } diff --git a/include/linux/smsc911x.h b/include/linux/smsc911x.h index 1cbf0313adde..170c76b8f7a6 100644 --- a/include/linux/smsc911x.h +++ b/include/linux/smsc911x.h @@ -43,5 +43,7 @@ struct smsc911x_platform_config { /* Constants for flags */ #define SMSC911X_USE_16BIT (BIT(0)) #define SMSC911X_USE_32BIT (BIT(1)) +#define SMSC911X_FORCE_INTERNAL_PHY (BIT(2)) +#define SMSC911X_FORCE_EXTERNAL_PHY (BIT(3)) #endif /* __LINUX_SMSC911X_H__ */ -- cgit v1.2.3-71-gd317 From 31f4574774e98aa275aeeee94f41ce042285ed8e Mon Sep 17 00:00:00 2001 From: Steve Glendinning Date: Tue, 27 Jan 2009 06:51:12 +0000 Subject: smsc911x: allow mac address to be saved before device reset Some platforms (for example pcm037) do not have an EEPROM fitted, instead storing their mac address somewhere else. The bootloader fetches this and configures the ethernet adapter before the kernel is started. This patch allows a platform to indicate to the driver via the SMSC911X_SAVE_MAC_ADDRESS flag that the mac address has already been configured via such a mechanism, and should be saved before resetting the chip. Signed-off-by: Steve Glendinning Acked-by: Sascha Hauer Tested-by: Sascha Hauer Signed-off-by: David S. 
Miller --- drivers/net/smsc911x.c | 30 ++++++++++++++++++++++-------- include/linux/smsc911x.h | 1 + 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c index 27d2d7c45519..6e175e5555a1 100644 --- a/drivers/net/smsc911x.c +++ b/drivers/net/smsc911x.c @@ -1742,6 +1742,21 @@ static const struct net_device_ops smsc911x_netdev_ops = { #endif }; +/* copies the current mac address from hardware to dev->dev_addr */ +static void __devinit smsc911x_read_mac_address(struct net_device *dev) +{ + struct smsc911x_data *pdata = netdev_priv(dev); + u32 mac_high16 = smsc911x_mac_read(pdata, ADDRH); + u32 mac_low32 = smsc911x_mac_read(pdata, ADDRL); + + dev->dev_addr[0] = (u8)(mac_low32); + dev->dev_addr[1] = (u8)(mac_low32 >> 8); + dev->dev_addr[2] = (u8)(mac_low32 >> 16); + dev->dev_addr[3] = (u8)(mac_low32 >> 24); + dev->dev_addr[4] = (u8)(mac_high16); + dev->dev_addr[5] = (u8)(mac_high16 >> 8); +} + /* Initializing private device structures, only called from probe */ static int __devinit smsc911x_init(struct net_device *dev) { @@ -1829,6 +1844,12 @@ static int __devinit smsc911x_init(struct net_device *dev) SMSC_WARNING(PROBE, "This driver is not intended for this chip revision"); + /* workaround for platforms without an eeprom, where the mac address + * is stored elsewhere and set by the bootloader. This saves the + * mac address before resetting the device */ + if (pdata->config.flags & SMSC911X_SAVE_MAC_ADDRESS) + smsc911x_read_mac_address(dev); + /* Reset the LAN911x */ if (smsc911x_soft_reset(pdata)) return -ENODEV; @@ -2009,14 +2030,7 @@ static int __devinit smsc911x_drv_probe(struct platform_device *pdev) } else { /* Try reading mac address from device. if EEPROM is present * it will already have been set */ - u32 mac_high16 = smsc911x_mac_read(pdata, ADDRH); - u32 mac_low32 = smsc911x_mac_read(pdata, ADDRL); - dev->dev_addr[0] = (u8)(mac_low32); - dev->dev_addr[1] = (u8)(mac_low32 >> 8); - dev->dev_addr[2] = (u8)(mac_low32 >> 16); - dev->dev_addr[3] = (u8)(mac_low32 >> 24); - dev->dev_addr[4] = (u8)(mac_high16); - dev->dev_addr[5] = (u8)(mac_high16 >> 8); + smsc911x_read_mac_address(dev); if (is_valid_ether_addr(dev->dev_addr)) { /* eeprom values are valid so use them */ diff --git a/include/linux/smsc911x.h b/include/linux/smsc911x.h index 170c76b8f7a6..b32725075d71 100644 --- a/include/linux/smsc911x.h +++ b/include/linux/smsc911x.h @@ -45,5 +45,6 @@ struct smsc911x_platform_config { #define SMSC911X_USE_32BIT (BIT(1)) #define SMSC911X_FORCE_INTERNAL_PHY (BIT(2)) #define SMSC911X_FORCE_EXTERNAL_PHY (BIT(3)) +#define SMSC911X_SAVE_MAC_ADDRESS (BIT(4)) #endif /* __LINUX_SMSC911X_H__ */ -- cgit v1.2.3-71-gd317 From eefef1cf7653cd4e0aaf743c00ae8345086cdc01 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 1 Feb 2009 01:04:33 -0800 Subject: net: add ARP notify option for devices This adds another inet device option to enable gratuitous ARP when device is brought up or address change. This is handy for clusters or virtualization. Signed-off-by: Stephen Hemminger Signed-off-by: Jeremy Fitzhardinge Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 6 ++++++ include/linux/inetdevice.h | 1 + include/linux/sysctl.h | 1 + kernel/sysctl_check.c | 1 + net/ipv4/devinet.c | 9 +++++++++ 5 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index c7712787933c..ff3f219ee4d7 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -782,6 +782,12 @@ arp_ignore - INTEGER The max value from conf/{all,interface}/arp_ignore is used when ARP request is received on the {interface} +arp_notify - BOOLEAN + Define mode for notification of address and device changes. + 0 - (default): do nothing + 1 - Generate gratuitous arp replies when device is brought up + or hardware address changes. + arp_accept - BOOLEAN Define behavior when gratuitous arp replies are received: 0 - drop gratuitous arp frames diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 06fcdb45106b..acef2a770b6b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -108,6 +108,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) #define IN_DEV_ARP_IGNORE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_IGNORE) +#define IN_DEV_ARP_NOTIFY(in_dev) IN_DEV_MAXCONF((in_dev), ARP_NOTIFY) struct in_ifaddr { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 39d471d1163b..e76d3b22a466 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -490,6 +490,7 @@ enum NET_IPV4_CONF_ARP_IGNORE=19, NET_IPV4_CONF_PROMOTE_SECONDARIES=20, NET_IPV4_CONF_ARP_ACCEPT=21, + NET_IPV4_CONF_ARP_NOTIFY=22, __NET_IPV4_CONF_MAX }; diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index fafeb48f27c0..b38423ca711a 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, + { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, {} }; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 309997edc8a5..d519a6a66726 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1075,6 +1075,14 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, } } ip_mc_up(in_dev); + /* fall through */ + case NETDEV_CHANGEADDR: + if (IN_DEV_ARP_NOTIFY(in_dev)) + arp_send(ARPOP_REQUEST, ETH_P_ARP, + in_dev->ifa_list->ifa_address, + dev, + in_dev->ifa_list->ifa_address, + NULL, dev->dev_addr, NULL); break; case NETDEV_DOWN: ip_mc_down(in_dev); @@ -1439,6 +1447,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), + DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), -- cgit v1.2.3-71-gd317 From 659aaf2bb5496a425ba14036b5b5900f593e4484 Mon Sep 17 00:00:00 2001 From: Rajiv Andrade Date: Mon, 2 Feb 2009 15:23:44 -0200 Subject: TPM: integrity interface This patch adds internal kernel support for: - reading/extending a pcr value - looking up the tpm_chip for a given chip number Signed-off-by: Rajiv Andrade Signed-off-by: 
Mimi Zohar Signed-off-by: James Morris --- drivers/char/tpm/tpm.c | 129 +++++++++++++++++++++++++++++++++++++++++-------- drivers/char/tpm/tpm.h | 18 +++++++ include/linux/tpm.h | 35 ++++++++++++++ 3 files changed, 163 insertions(+), 19 deletions(-) create mode 100644 include/linux/tpm.h (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c index 9b9eb761cba9..62a5682578ca 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm.c @@ -661,28 +661,125 @@ ssize_t tpm_show_temp_deactivated(struct device * dev, } EXPORT_SYMBOL_GPL(tpm_show_temp_deactivated); -static const u8 pcrread[] = { - 0, 193, /* TPM_TAG_RQU_COMMAND */ - 0, 0, 0, 14, /* length */ - 0, 0, 0, 21, /* TPM_ORD_PcrRead */ - 0, 0, 0, 0 /* PCR index */ +/* + * tpm_chip_find_get - return tpm_chip for given chip number + */ +static struct tpm_chip *tpm_chip_find_get(int chip_num) +{ + struct tpm_chip *pos; + + rcu_read_lock(); + list_for_each_entry_rcu(pos, &tpm_chip_list, list) { + if (chip_num != TPM_ANY_NUM && chip_num != pos->dev_num) + continue; + + if (try_module_get(pos->dev->driver->owner)) + break; + } + rcu_read_unlock(); + return pos; +} + +#define TPM_ORDINAL_PCRREAD cpu_to_be32(21) +#define READ_PCR_RESULT_SIZE 30 +static struct tpm_input_header pcrread_header = { + .tag = TPM_TAG_RQU_COMMAND, + .length = cpu_to_be32(14), + .ordinal = TPM_ORDINAL_PCRREAD +}; + +int __tpm_pcr_read(struct tpm_chip *chip, int pcr_idx, u8 *res_buf) +{ + int rc; + struct tpm_cmd_t cmd; + + cmd.header.in = pcrread_header; + cmd.params.pcrread_in.pcr_idx = cpu_to_be32(pcr_idx); + BUILD_BUG_ON(cmd.header.in.length > READ_PCR_RESULT_SIZE); + rc = transmit_cmd(chip, &cmd, cmd.header.in.length, + "attempting to read a pcr value"); + + if (rc == 0) + memcpy(res_buf, cmd.params.pcrread_out.pcr_result, + TPM_DIGEST_SIZE); + return rc; +} + +/** + * tpm_pcr_read - read a pcr value + * @chip_num: tpm idx # or ANY + * @pcr_idx: pcr idx to retrieve + * @res_buf: TPM_PCR value + * size of res_buf is 20 bytes (or NULL if you don't care) + * + * The TPM driver should be built-in, but for whatever reason it + * isn't, protect against the chip disappearing, by incrementing + * the module usage count. + */ +int tpm_pcr_read(u32 chip_num, int pcr_idx, u8 *res_buf) +{ + struct tpm_chip *chip; + int rc; + + chip = tpm_chip_find_get(chip_num); + if (chip == NULL) + return -ENODEV; + rc = __tpm_pcr_read(chip, pcr_idx, res_buf); + module_put(chip->dev->driver->owner); + return rc; +} +EXPORT_SYMBOL_GPL(tpm_pcr_read); + +/** + * tpm_pcr_extend - extend pcr value with hash + * @chip_num: tpm idx # or AN& + * @pcr_idx: pcr idx to extend + * @hash: hash value used to extend pcr value + * + * The TPM driver should be built-in, but for whatever reason it + * isn't, protect against the chip disappearing, by incrementing + * the module usage count. 
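An in-kernel caller such as an integrity-measurement service would use the pair roughly as follows; this is a sketch only, and the PCR index (10) and the source of the SHA-1 digest are example choices, not something mandated by this patch:

/* Illustrative caller of the new interface. */
#include <linux/types.h>
#include <linux/tpm.h>

static int example_measure(const u8 *sha1_digest)
{
	u8 pcr_value[20];	/* TPM_DIGEST_SIZE bytes */
	int rc;

	rc = tpm_pcr_read(TPM_ANY_NUM, 10, pcr_value);
	if (rc)
		return rc;	/* e.g. -ENODEV when no TPM is present */

	return tpm_pcr_extend(TPM_ANY_NUM, 10, sha1_digest);
}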
+ */ +#define TPM_ORD_PCR_EXTEND cpu_to_be32(20) +#define EXTEND_PCR_SIZE 34 +static struct tpm_input_header pcrextend_header = { + .tag = TPM_TAG_RQU_COMMAND, + .length = cpu_to_be32(34), + .ordinal = TPM_ORD_PCR_EXTEND }; +int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash) +{ + struct tpm_cmd_t cmd; + int rc; + struct tpm_chip *chip; + + chip = tpm_chip_find_get(chip_num); + if (chip == NULL) + return -ENODEV; + + cmd.header.in = pcrextend_header; + BUILD_BUG_ON(be32_to_cpu(cmd.header.in.length) > EXTEND_PCR_SIZE); + cmd.params.pcrextend_in.pcr_idx = cpu_to_be32(pcr_idx); + memcpy(cmd.params.pcrextend_in.hash, hash, TPM_DIGEST_SIZE); + rc = transmit_cmd(chip, &cmd, cmd.header.in.length, + "attempting extend a PCR value"); + + module_put(chip->dev->driver->owner); + return rc; +} +EXPORT_SYMBOL_GPL(tpm_pcr_extend); + ssize_t tpm_show_pcrs(struct device *dev, struct device_attribute *attr, char *buf) { cap_t cap; - u8 *data; + u8 digest[TPM_DIGEST_SIZE]; ssize_t rc; int i, j, num_pcrs; - __be32 index; char *str = buf; struct tpm_chip *chip = dev_get_drvdata(dev); - data = kzalloc(TPM_INTERNAL_RESULT_SIZE, GFP_KERNEL); - if (!data) - return -ENOMEM; - rc = tpm_getcap(dev, TPM_CAP_PROP_PCR, &cap, "attempting to determine the number of PCRS"); if (rc) @@ -690,20 +787,14 @@ ssize_t tpm_show_pcrs(struct device *dev, struct device_attribute *attr, num_pcrs = be32_to_cpu(cap.num_pcrs); for (i = 0; i < num_pcrs; i++) { - memcpy(data, pcrread, sizeof(pcrread)); - index = cpu_to_be32(i); - memcpy(data + 10, &index, 4); - rc = transmit_cmd(chip, (struct tpm_cmd_t *)data, - TPM_INTERNAL_RESULT_SIZE, - "attempting to read a PCR"); + rc = __tpm_pcr_read(chip, i, digest); if (rc) break; str += sprintf(str, "PCR-%02d: ", i); for (j = 0; j < TPM_DIGEST_SIZE; j++) - str += sprintf(str, "%02X ", *(data + 10 + j)); + str += sprintf(str, "%02X ", digest[j]); str += sprintf(str, "\n"); } - kfree(data); return str - buf; } EXPORT_SYMBOL_GPL(tpm_show_pcrs); diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index d64f6b7e5b82..8e00b4ddd083 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -26,6 +26,7 @@ #include #include #include +#include enum tpm_timeout { TPM_TIMEOUT = 5, /* msecs */ @@ -234,11 +235,28 @@ typedef union { struct tpm_output_header out; } tpm_cmd_header; +#define TPM_DIGEST_SIZE 20 +struct tpm_pcrread_out { + u8 pcr_result[TPM_DIGEST_SIZE]; +}__attribute__((packed)); + +struct tpm_pcrread_in { + __be32 pcr_idx; +}__attribute__((packed)); + +struct tpm_pcrextend_in { + __be32 pcr_idx; + u8 hash[TPM_DIGEST_SIZE]; +}__attribute__((packed)); + typedef union { struct tpm_getcap_params_out getcap_out; struct tpm_readpubek_params_out readpubek_out; u8 readpubek_out_buffer[sizeof(struct tpm_readpubek_params_out)]; struct tpm_getcap_params_in getcap_in; + struct tpm_pcrread_in pcrread_in; + struct tpm_pcrread_out pcrread_out; + struct tpm_pcrextend_in pcrextend_in; } tpm_cmd_params; struct tpm_cmd_t { diff --git a/include/linux/tpm.h b/include/linux/tpm.h new file mode 100644 index 000000000000..3338b3f5c21a --- /dev/null +++ b/include/linux/tpm.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2004,2007,2008 IBM Corporation + * + * Authors: + * Leendert van Doorn + * Dave Safford + * Reiner Sailer + * Kylene Hall + * Debora Velarde + * + * Maintained by: + * + * Device driver for TCG/TCPA TPM (trusted platform module). 
+ * Specifications at www.trustedcomputinggroup.org + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + */ +#ifndef __LINUX_TPM_H__ +#define __LINUX_TPM_H__ + +/* + * Chip num is this value or a valid tpm idx + */ +#define TPM_ANY_NUM 0xFFFF + +#if defined(CONFIG_TCG_TPM) + +extern int tpm_pcr_read(u32 chip_num, int pcr_idx, u8 *res_buf); +extern int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash); +#endif +#endif -- cgit v1.2.3-71-gd317 From 1a5645bc901aea6f3f446888061b2b084bbf1ba6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 2 Feb 2009 23:22:04 -0800 Subject: connector: create connector workqueue only while needed once The netlink connector uses its own workqueue to relay the data sent from userspace to the appropriate callback. If you launch the test from Documentation/connector and change it a bit to send a high flow of data, you will see thousands of events coming to the "cqueue" workqueue by looking at the workqueue tracer. This flow of events can be sent very quickly. So, to not encumber the kevent workqueue and delay other jobs, the "cqueue" workqueue should remain. But this workqueue is pointless most of the time: it will always be created (assuming you have built it of course) although only developers with specific needs will use it. To avoid this mostly useless task, this patch creates the workqueue only once it is actually needed. The first jobs to be sent to connector callbacks will be sent to kevent while the "cqueue" thread creation will be scheduled to kevent too. The following jobs will continue to be scheduled to keventd until the cqueue workqueue is created, and then the rest of the jobs will continue to perform as usual, through this dedicated workqueue. Each time I tested this patch, only the first event was sent to keventd; the rest was sent to cqueue, which was created quickly. Also, this patch fixes some trailing whitespace in the connector files. Signed-off-by: Frederic Weisbecker Acked-by: Evgeniy Polyakov Signed-off-by: David S. Miller --- drivers/connector/cn_queue.c | 80 +++++++++++++++++++++++++++++++++++++------ drivers/connector/connector.c | 19 +++++----- include/linux/connector.h | 8 +++++ 3 files changed, 87 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/connector/cn_queue.c b/drivers/connector/cn_queue.c index b6fe7e7a2c2f..c769ef269fb5 100644 --- a/drivers/connector/cn_queue.c +++ b/drivers/connector/cn_queue.c @@ -1,9 +1,9 @@ /* * cn_queue.c - * + * * 2004-2005 Copyright (c) Evgeniy Polyakov * All rights reserved. - * + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -31,6 +31,48 @@ #include #include + +/* + * This job is sent to the kevent workqueue. + * While no event is once sent to any callback, the connector workqueue + * is not created to avoid a useless waiting kernel task. + * Once the first event is received, we create this dedicated workqueue which + * is necessary because the flow of data can be high and we don't want + * to encumber keventd with that.
+ */ +static void cn_queue_create(struct work_struct *work) +{ + struct cn_queue_dev *dev; + + dev = container_of(work, struct cn_queue_dev, wq_creation); + + dev->cn_queue = create_singlethread_workqueue(dev->name); + /* If we fail, we will use keventd for all following connector jobs */ + WARN_ON(!dev->cn_queue); +} + +/* + * Queue a data sent to a callback. + * If the connector workqueue is already created, we queue the job on it. + * Otherwise, we queue the job to kevent and queue the connector workqueue + * creation too. + */ +int queue_cn_work(struct cn_callback_entry *cbq, struct work_struct *work) +{ + struct cn_queue_dev *pdev = cbq->pdev; + + if (likely(pdev->cn_queue)) + return queue_work(pdev->cn_queue, work); + + /* Don't create the connector workqueue twice */ + if (atomic_inc_return(&pdev->wq_requested) == 1) + schedule_work(&pdev->wq_creation); + else + atomic_dec(&pdev->wq_requested); + + return schedule_work(work); +} + void cn_queue_wrapper(struct work_struct *work) { struct cn_callback_entry *cbq = @@ -58,14 +100,17 @@ static struct cn_callback_entry *cn_queue_alloc_callback_entry(char *name, struc snprintf(cbq->id.name, sizeof(cbq->id.name), "%s", name); memcpy(&cbq->id.id, id, sizeof(struct cb_id)); cbq->data.callback = callback; - + INIT_WORK(&cbq->work, &cn_queue_wrapper); return cbq; } static void cn_queue_free_callback(struct cn_callback_entry *cbq) { - flush_workqueue(cbq->pdev->cn_queue); + /* The first jobs have been sent to kevent, flush them too */ + flush_scheduled_work(); + if (cbq->pdev->cn_queue) + flush_workqueue(cbq->pdev->cn_queue); kfree(cbq); } @@ -143,14 +188,11 @@ struct cn_queue_dev *cn_queue_alloc_dev(char *name, struct sock *nls) atomic_set(&dev->refcnt, 0); INIT_LIST_HEAD(&dev->queue_list); spin_lock_init(&dev->queue_lock); + init_waitqueue_head(&dev->wq_created); dev->nls = nls; - dev->cn_queue = create_singlethread_workqueue(dev->name); - if (!dev->cn_queue) { - kfree(dev); - return NULL; - } + INIT_WORK(&dev->wq_creation, cn_queue_create); return dev; } @@ -158,9 +200,25 @@ struct cn_queue_dev *cn_queue_alloc_dev(char *name, struct sock *nls) void cn_queue_free_dev(struct cn_queue_dev *dev) { struct cn_callback_entry *cbq, *n; + long timeout; + DEFINE_WAIT(wait); + + /* Flush the first pending jobs queued on kevent */ + flush_scheduled_work(); + + /* If the connector workqueue creation is still pending, wait for it */ + prepare_to_wait(&dev->wq_created, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&dev->wq_requested) && !dev->cn_queue) { + timeout = schedule_timeout(HZ * 2); + if (!timeout && !dev->cn_queue) + WARN_ON(1); + } + finish_wait(&dev->wq_created, &wait); - flush_workqueue(dev->cn_queue); - destroy_workqueue(dev->cn_queue); + if (dev->cn_queue) { + flush_workqueue(dev->cn_queue); + destroy_workqueue(dev->cn_queue); + } spin_lock_bh(&dev->queue_lock); list_for_each_entry_safe(cbq, n, &dev->queue_list, callback_entry) diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index bf4830082a13..fd336c5a9057 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -1,9 +1,9 @@ /* * connector.c - * + * * 2004-2005 Copyright (c) Evgeniy Polyakov * All rights reserved. 
- * + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -145,14 +145,13 @@ static int cn_call_callback(struct cn_msg *msg, void (*destruct_data)(void *), v __cbq->data.ddata = data; __cbq->data.destruct_data = destruct_data; - if (queue_work(dev->cbdev->cn_queue, - &__cbq->work)) + if (queue_cn_work(__cbq, &__cbq->work)) err = 0; else err = -EINVAL; } else { struct cn_callback_data *d; - + err = -ENOMEM; __new_cbq = kzalloc(sizeof(struct cn_callback_entry), GFP_ATOMIC); if (__new_cbq) { @@ -163,10 +162,12 @@ static int cn_call_callback(struct cn_msg *msg, void (*destruct_data)(void *), v d->destruct_data = destruct_data; d->free = __new_cbq; + __new_cbq->pdev = __cbq->pdev; + INIT_WORK(&__new_cbq->work, &cn_queue_wrapper); - if (queue_work(dev->cbdev->cn_queue, + if (queue_cn_work(__new_cbq, &__new_cbq->work)) err = 0; else { @@ -237,7 +238,7 @@ static void cn_notify(struct cb_id *id, u32 notify_event) req = (struct cn_notify_req *)ctl->data; for (i = 0; i < ctl->idx_notify_num; ++i, ++req) { - if (id->idx >= req->first && + if (id->idx >= req->first && id->idx < req->first + req->range) { idx_found = 1; break; @@ -245,7 +246,7 @@ static void cn_notify(struct cb_id *id, u32 notify_event) } for (i = 0; i < ctl->val_notify_num; ++i, ++req) { - if (id->val >= req->first && + if (id->val >= req->first && id->val < req->first + req->range) { val_found = 1; break; @@ -459,7 +460,7 @@ static int __devinit cn_init(void) netlink_kernel_release(dev->nls); return -EINVAL; } - + cn_already_initialized = 1; err = cn_add_callback(&dev->id, "connector", &cn_callback); diff --git a/include/linux/connector.h b/include/linux/connector.h index 34f2789d9b9b..fc65d219d88c 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -109,6 +109,12 @@ struct cn_queue_dev { unsigned char name[CN_CBQ_NAMELEN]; struct workqueue_struct *cn_queue; + /* Sent to kevent to create cn_queue only when needed */ + struct work_struct wq_creation; + /* Tell if the wq_creation job is pending/completed */ + atomic_t wq_requested; + /* Wait for cn_queue to be created */ + wait_queue_head_t wq_created; struct list_head queue_list; spinlock_t queue_lock; @@ -164,6 +170,8 @@ int cn_netlink_send(struct cn_msg *, u32, gfp_t); int cn_queue_add_callback(struct cn_queue_dev *dev, char *name, struct cb_id *id, void (*callback)(void *)); void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id); +int queue_cn_work(struct cn_callback_entry *cbq, struct work_struct *work); + struct cn_queue_dev *cn_queue_alloc_dev(char *name, struct sock *); void cn_queue_free_dev(struct cn_queue_dev *dev); -- cgit v1.2.3-71-gd317 From 2a41f71d3bd97dde3305b4e1c43ab0eca46e7c71 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 4 Feb 2009 09:02:34 +0000 Subject: virtio_net: Add a virtqueue for outbound control commands This will be used for RX mode, MAC filter table, VLAN filtering, etc... The control transaction consists of one or more "out" sg entries and one or more "in" sg entries. The first out entry contains a header defining the class and command. Additional out entries may provide data for the command. The last in entry provides a status response back from the command. Virtqueues typically run asynchronous, running a callback function when there's data in the channel. We can't readily make use of this in the command paths where we need to use this. 
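As a reader aid, here is a condensed caller-side sketch of the transaction described above. It is illustrative only, not part of the patch: it assumes the virtnet_send_command() helper added by this patch and borrows the class/command constants that a follow-up patch in this series introduces for RX mode. The caller supplies only the payload scatterlist entries; the helper prepends the class/command header and appends the status entry itself.

static bool virtnet_example_set_promisc(struct virtnet_info *vi, u8 on)
{
	struct scatterlist sg;

	/* Single "out" payload entry: a one-byte enable/disable flag. */
	sg_set_buf(&sg, &on, sizeof(on));

	/* True only if the hypervisor answered VIRTIO_NET_OK. */
	return virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				    VIRTIO_NET_CTRL_RX_PROMISC, &sg, 1, 0);
}

Within virtnet_send_command() itself the usual virtqueue completion callback is not practical.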
Instead, we kick the virtqueue and spin. The kick causes an I/O write, triggering an immediate trap into the hypervisor. Signed-off-by: Alex Williamson Acked-by: Rusty Russell Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 68 ++++++++++++++++++++++++++++++++++++++++++++-- include/linux/virtio_net.h | 18 ++++++++++++ 2 files changed, 83 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fe576e75a538..67bb583b7fc9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -37,10 +37,12 @@ module_param(gso, bool, 0444); #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 +#define VIRTNET_SEND_COMMAND_SG_MAX 0 + struct virtnet_info { struct virtio_device *vdev; - struct virtqueue *rvq, *svq; + struct virtqueue *rvq, *svq, *cvq; struct net_device *dev; struct napi_struct napi; unsigned int status; @@ -589,6 +591,53 @@ static int virtnet_open(struct net_device *dev) return 0; } +/* + * Send command via the control virtqueue and check status. Commands + * supported by the hypervisor, as indicated by feature bits, should + * never fail unless improperly formated. + */ +static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, + struct scatterlist *data, int out, int in) +{ + struct scatterlist sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; + struct virtio_net_ctrl_hdr ctrl; + virtio_net_ctrl_ack status = ~0; + unsigned int tmp; + + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { + BUG(); /* Caller should know better */ + return false; + } + + BUG_ON(out + in > VIRTNET_SEND_COMMAND_SG_MAX); + + out++; /* Add header */ + in++; /* Add return status */ + + ctrl.class = class; + ctrl.cmd = cmd; + + sg_init_table(sg, out + in); + + sg_set_buf(&sg[0], &ctrl, sizeof(ctrl)); + memcpy(&sg[1], data, sizeof(struct scatterlist) * (out + in - 2)); + sg_set_buf(&sg[out + in - 1], &status, sizeof(status)); + + if (vi->cvq->vq_ops->add_buf(vi->cvq, sg, out, in, vi) != 0) + BUG(); + + vi->cvq->vq_ops->kick(vi->cvq); + + /* + * Spin for a response, the kick causes an ioport write, trapping + * into the hypervisor, so the request should be handled immediately. + */ + while (!vi->cvq->vq_ops->get_buf(vi->cvq, &tmp)) + cpu_relax(); + + return status == VIRTIO_NET_OK; +} + static int virtnet_close(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); @@ -752,6 +801,14 @@ static int virtnet_probe(struct virtio_device *vdev) goto free_recv; } + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { + vi->cvq = vdev->config->find_vq(vdev, 2, NULL); + if (IS_ERR(vi->cvq)) { + err = PTR_ERR(vi->svq); + goto free_send; + } + } + /* Initialize our empty receive and send queues. */ skb_queue_head_init(&vi->recv); skb_queue_head_init(&vi->send); @@ -764,7 +821,7 @@ static int virtnet_probe(struct virtio_device *vdev) err = register_netdev(dev); if (err) { pr_debug("virtio_net: registering device failed\n"); - goto free_send; + goto free_ctrl; } /* Last of all, set up some receive buffers. 
*/ @@ -784,6 +841,9 @@ static int virtnet_probe(struct virtio_device *vdev) unregister: unregister_netdev(dev); +free_ctrl: + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) + vdev->config->del_vq(vi->cvq); free_send: vdev->config->del_vq(vi->svq); free_recv: @@ -815,6 +875,8 @@ static void virtnet_remove(struct virtio_device *vdev) vdev->config->del_vq(vi->svq); vdev->config->del_vq(vi->rvq); + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) + vdev->config->del_vq(vi->cvq); unregister_netdev(vi->dev); while (vi->pages) @@ -834,7 +896,7 @@ static unsigned int features[] = { VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_ECN, /* We don't yet handle UFO input. */ - VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, + VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, VIRTIO_F_NOTIFY_ON_EMPTY, }; diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index d8e362d52fd8..245eda829aa8 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -23,6 +23,7 @@ #define VIRTIO_NET_F_HOST_UFO 14 /* Host can handle UFO in. */ #define VIRTIO_NET_F_MRG_RXBUF 15 /* Host can merge receive buffers. */ #define VIRTIO_NET_F_STATUS 16 /* virtio_net_config.status available */ +#define VIRTIO_NET_F_CTRL_VQ 17 /* Control channel available */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ @@ -59,4 +60,21 @@ struct virtio_net_hdr_mrg_rxbuf { __u16 num_buffers; /* Number of merged rx buffers */ }; +/* + * Control virtqueue data structures + * + * The control virtqueue expects a header in the first sg entry + * and an ack/status response in the last entry. Data for the + * command goes in between. + */ +struct virtio_net_ctrl_hdr { + __u8 class; + __u8 cmd; +} __attribute__((packed)); + +typedef __u8 virtio_net_ctrl_ack; + +#define VIRTIO_NET_OK 0 +#define VIRTIO_NET_ERR 1 + #endif /* _LINUX_VIRTIO_NET_H */ -- cgit v1.2.3-71-gd317 From 2af7698e2dd698d452ab9d63a9ca5956bbe8fc3b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 4 Feb 2009 09:02:40 +0000 Subject: virtio_net: Add a set_rx_mode interface Make use of the RX_MODE control virtqueue class to enable the set_rx_mode netdev interface. This allows us to selectively enable/disable promiscuous and allmulti mode so we don't see packets we don't want. For now, we automatically enable these as needed if additional unicast or multicast addresses are requested. Signed-off-by: Alex Williamson Acked-by: Rusty Russell Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 34 +++++++++++++++++++++++++++++++++- include/linux/virtio_net.h | 11 +++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 67bb583b7fc9..1abea9dc6f0f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -37,7 +37,7 @@ module_param(gso, bool, 0444); #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 -#define VIRTNET_SEND_COMMAND_SG_MAX 0 +#define VIRTNET_SEND_COMMAND_SG_MAX 1 struct virtnet_info { @@ -658,6 +658,36 @@ static int virtnet_set_tx_csum(struct net_device *dev, u32 data) return ethtool_op_set_tx_hw_csum(dev, data); } +static void virtnet_set_rx_mode(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct scatterlist sg; + u8 promisc, allmulti; + + /* We can't dynamicaly set ndo_set_rx_mode, so return gracefully */ + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) + return; + + promisc = ((dev->flags & IFF_PROMISC) != 0 || dev->uc_count > 0); + allmulti = ((dev->flags & IFF_ALLMULTI) != 0 || dev->mc_count > 0); + + sg_set_buf(&sg, &promisc, sizeof(promisc)); + + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, + VIRTIO_NET_CTRL_RX_PROMISC, + &sg, 1, 0)) + dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", + promisc ? "en" : "dis"); + + sg_set_buf(&sg, &allmulti, sizeof(allmulti)); + + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, + VIRTIO_NET_CTRL_RX_ALLMULTI, + &sg, 1, 0)) + dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", + allmulti ? "en" : "dis"); +} + static struct ethtool_ops virtnet_ethtool_ops = { .set_tx_csum = virtnet_set_tx_csum, .set_sg = ethtool_op_set_sg, @@ -682,6 +712,7 @@ static const struct net_device_ops virtnet_netdev = { .ndo_start_xmit = start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, + .ndo_set_rx_mode = virtnet_set_rx_mode, .ndo_change_mtu = virtnet_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = virtnet_netpoll, @@ -897,6 +928,7 @@ static unsigned int features[] = { VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_ECN, /* We don't yet handle UFO input. */ VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, + VIRTIO_NET_F_CTRL_RX, VIRTIO_F_NOTIFY_ON_EMPTY, }; diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 245eda829aa8..63b2461a40f1 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -24,6 +24,7 @@ #define VIRTIO_NET_F_MRG_RXBUF 15 /* Host can merge receive buffers. */ #define VIRTIO_NET_F_STATUS 16 /* virtio_net_config.status available */ #define VIRTIO_NET_F_CTRL_VQ 17 /* Control channel available */ +#define VIRTIO_NET_F_CTRL_RX 18 /* Control channel RX mode support */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ @@ -77,4 +78,14 @@ typedef __u8 virtio_net_ctrl_ack; #define VIRTIO_NET_OK 0 #define VIRTIO_NET_ERR 1 +/* + * Control the RX mode, ie. promisucous and allmulti. PROMISC and + * ALLMULTI commands require an "out" sg entry containing a 1 byte + * state value, zero = disable, non-zero = enable. These commands + * are supported with the VIRTIO_NET_F_CTRL_RX feature. 
+ */ +#define VIRTIO_NET_CTRL_RX 0 + #define VIRTIO_NET_CTRL_RX_PROMISC 0 + #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 + #endif /* _LINUX_VIRTIO_NET_H */ -- cgit v1.2.3-71-gd317 From f565a7c259d71cc186753653d978c646d2354b36 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 4 Feb 2009 09:02:45 +0000 Subject: virtio_net: Add a MAC filter table Make use of the MAC control virtqueue class to support a MAC filter table. The filter table is managed by the hypervisor. We consider the table to be available if the CTRL_RX feature bit is set. We leave it to the hypervisor to manage the table and enable promiscuous or all-multi mode as necessary depending on the resources available to it. Signed-off-by: Alex Williamson Acked-by: Rusty Russell Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 55 +++++++++++++++++++++++++++++++++++++++------- include/linux/virtio_net.h | 23 +++++++++++++++++++ 2 files changed, 70 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1abea9dc6f0f..daab9c9b0a40 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -37,7 +37,7 @@ module_param(gso, bool, 0444); #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 -#define VIRTNET_SEND_COMMAND_SG_MAX 1 +#define VIRTNET_SEND_COMMAND_SG_MAX 2 struct virtnet_info { @@ -661,31 +661,70 @@ static int virtnet_set_tx_csum(struct net_device *dev, u32 data) static void virtnet_set_rx_mode(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); - struct scatterlist sg; + struct scatterlist sg[2]; u8 promisc, allmulti; + struct virtio_net_ctrl_mac *mac_data; + struct dev_addr_list *addr; + void *buf; + int i; /* We can't dynamicaly set ndo_set_rx_mode, so return gracefully */ if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) return; - promisc = ((dev->flags & IFF_PROMISC) != 0 || dev->uc_count > 0); - allmulti = ((dev->flags & IFF_ALLMULTI) != 0 || dev->mc_count > 0); + promisc = ((dev->flags & IFF_PROMISC) != 0); + allmulti = ((dev->flags & IFF_ALLMULTI) != 0); - sg_set_buf(&sg, &promisc, sizeof(promisc)); + sg_set_buf(sg, &promisc, sizeof(promisc)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC, - &sg, 1, 0)) + sg, 1, 0)) dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", promisc ? "en" : "dis"); - sg_set_buf(&sg, &allmulti, sizeof(allmulti)); + sg_set_buf(sg, &allmulti, sizeof(allmulti)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_ALLMULTI, - &sg, 1, 0)) + sg, 1, 0)) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", allmulti ? 
"en" : "dis"); + + /* MAC filter - use one buffer for both lists */ + mac_data = buf = kzalloc(((dev->uc_count + dev->mc_count) * ETH_ALEN) + + (2 * sizeof(mac_data->entries)), GFP_ATOMIC); + if (!buf) { + dev_warn(&dev->dev, "No memory for MAC address buffer\n"); + return; + } + + /* Store the unicast list and count in the front of the buffer */ + mac_data->entries = dev->uc_count; + addr = dev->uc_list; + for (i = 0; i < dev->uc_count; i++, addr = addr->next) + memcpy(&mac_data->macs[i][0], addr->da_addr, ETH_ALEN); + + sg_set_buf(&sg[0], mac_data, + sizeof(mac_data->entries) + (dev->uc_count * ETH_ALEN)); + + /* multicast list and count fill the end */ + mac_data = (void *)&mac_data->macs[dev->uc_count][0]; + + mac_data->entries = dev->mc_count; + addr = dev->mc_list; + for (i = 0; i < dev->mc_count; i++, addr = addr->next) + memcpy(&mac_data->macs[i][0], addr->da_addr, ETH_ALEN); + + sg_set_buf(&sg[1], mac_data, + sizeof(mac_data->entries) + (dev->mc_count * ETH_ALEN)); + + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, + VIRTIO_NET_CTRL_MAC_TABLE_SET, + sg, 2, 0)) + dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); + + kfree(buf); } static struct ethtool_ops virtnet_ethtool_ops = { diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 63b2461a40f1..ba82b653cace 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -88,4 +88,27 @@ typedef __u8 virtio_net_ctrl_ack; #define VIRTIO_NET_CTRL_RX_PROMISC 0 #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 +/* + * Control the MAC filter table. + * + * The MAC filter table is managed by the hypervisor, the guest should + * assume the size is infinite. Filtering should be considered + * non-perfect, ie. based on hypervisor resources, the guest may + * received packets from sources not specified in the filter list. + * + * In addition to the class/cmd header, the TABLE_SET command requires + * two out scatterlists. Each contains a 4 byte count of entries followed + * by a concatenated byte stream of the ETH_ALEN MAC addresses. The + * first sg list contains unicast addresses, the second is for multicast. + * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature + * is available. + */ +struct virtio_net_ctrl_mac { + __u32 entries; + __u8 macs[][ETH_ALEN]; +} __attribute__((packed)); + +#define VIRTIO_NET_CTRL_MAC 1 + #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 + #endif /* _LINUX_VIRTIO_NET_H */ -- cgit v1.2.3-71-gd317 From 0bde95690d65653e420d04856c5d5783155c747c Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 4 Feb 2009 09:02:50 +0000 Subject: virtio_net: Add support for VLAN filtering in the hypervisor VLAN filtering allows the hypervisor to drop packets from VLANs that we're not a part of, further reducing the number of extraneous packets recieved. This makes use of the VLAN virtqueue command class. The CTRL_VLAN feature bit tells us whether the backend supports VLAN filtering. Signed-off-by: Alex Williamson Acked-by: Rusty Russell Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 31 ++++++++++++++++++++++++++++++- include/linux/virtio_net.h | 14 ++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index daab9c9b0a40..e68813a246db 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -727,6 +727,30 @@ static void virtnet_set_rx_mode(struct net_device *dev) kfree(buf); } +static void virnet_vlan_rx_add_vid(struct net_device *dev, u16 vid) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct scatterlist sg; + + sg_set_buf(&sg, &vid, sizeof(vid)); + + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, + VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0)) + dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); +} + +static void virnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct scatterlist sg; + + sg_set_buf(&sg, &vid, sizeof(vid)); + + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, + VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0)) + dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); +} + static struct ethtool_ops virtnet_ethtool_ops = { .set_tx_csum = virtnet_set_tx_csum, .set_sg = ethtool_op_set_sg, @@ -753,6 +777,8 @@ static const struct net_device_ops virtnet_netdev = { .ndo_set_mac_address = eth_mac_addr, .ndo_set_rx_mode = virtnet_set_rx_mode, .ndo_change_mtu = virtnet_change_mtu, + .ndo_vlan_rx_add_vid = virnet_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = virnet_vlan_rx_kill_vid, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = virtnet_netpoll, #endif @@ -877,6 +903,9 @@ static int virtnet_probe(struct virtio_device *vdev) err = PTR_ERR(vi->svq); goto free_send; } + + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) + dev->features |= NETIF_F_HW_VLAN_FILTER; } /* Initialize our empty receive and send queues. */ @@ -967,7 +996,7 @@ static unsigned int features[] = { VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_ECN, /* We don't yet handle UFO input. */ VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, - VIRTIO_NET_F_CTRL_RX, + VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_F_NOTIFY_ON_EMPTY, }; diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index ba82b653cace..242348bb3766 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -25,6 +25,7 @@ #define VIRTIO_NET_F_STATUS 16 /* virtio_net_config.status available */ #define VIRTIO_NET_F_CTRL_VQ 17 /* Control channel available */ #define VIRTIO_NET_F_CTRL_RX 18 /* Control channel RX mode support */ +#define VIRTIO_NET_F_CTRL_VLAN 19 /* Control channel VLAN filtering */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ @@ -111,4 +112,17 @@ struct virtio_net_ctrl_mac { #define VIRTIO_NET_CTRL_MAC 1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 +/* + * Control VLAN filtering + * + * The VLAN filter table is controlled via a simple ADD/DEL interface. + * VLAN IDs not added may be filterd by the hypervisor. Del is the + * opposite of add. Both commands expect an out entry containing a 2 + * byte VLAN ID. VLAN filterting is available with the + * VIRTIO_NET_F_CTRL_VLAN feature bit. 
+ */ +#define VIRTIO_NET_CTRL_VLAN 2 + #define VIRTIO_NET_CTRL_VLAN_ADD 0 + #define VIRTIO_NET_CTRL_VLAN_DEL 1 + #endif /* _LINUX_VIRTIO_NET_H */ -- cgit v1.2.3-71-gd317 From f036be96dd9ce442ffb9ab33e3c165f5178815c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Feb 2009 13:45:43 +0100 Subject: printk: introduce printk_once() This pattern shows up frequently in the kernel: static int once = 1; ... if (once) { once = 0; printk(KERN_ERR "message\n"); } ... So add a printk_once() helper macro that reduces this to a single line of: printk_once(KERN_ERR "message\n"); It works analogously to WARN_ONCE() & friends. (We use a macro not an inline because vararg expansion in inlines looks awkward and the macro is simple enough.) Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 343df9ef2412..3c183d9864ae 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -242,6 +242,19 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); + +/* + * Print a one-time message (analogous to WARN_ONCE() et al): + */ +#define printk_once(x...) ({ \ + static int __print_once = 1; \ + \ + if (__print_once) { \ + __print_once = 0; \ + printk(x); \ + } \ +}) + #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -253,6 +266,10 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } + +/* No effect, but we still get type checking even in the !PRINTK case: */ +#define printk_once(x...) printk(x) + #endif extern int printk_needs_cpu(int cpu); -- cgit v1.2.3-71-gd317 From 6146f0d5e47ca4047ffded0fb79b6c25359b386c Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:06:57 -0500 Subject: integrity: IMA hooks This patch replaces the generic integrity hooks, for which IMA registered itself, with IMA integrity hooks in the appropriate places directly in the fs directory. Signed-off-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: James Morris --- Documentation/kernel-parameters.txt | 1 + fs/exec.c | 10 +++++++++ fs/file_table.c | 2 ++ fs/inode.c | 24 ++++++++++++++------ fs/namei.c | 8 +++++++ include/linux/ima.h | 44 +++++++++++++++++++++++++++++++++++++ mm/mmap.c | 4 ++++ 7 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 include/linux/ima.h (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a2d8805c03d5..7c67b94d1823 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -44,6 +44,7 @@ parameter is applicable: FB The frame buffer device is enabled. HW Appropriate hardware is enabled. IA-64 IA-64 architecture is enabled. + IMA Integrity measurement architecture is enabled. IOSCHED More than one I/O scheduler is enabled. IP_PNP IP DHCP, BOOTP, or RARP is enabled. ISAPNP ISA PnP code is enabled. 
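Before the fs/ hunks below, it may help to see the call-site pattern this patch applies, distilled from those hunks rather than quoted verbatim: each existing permission check is simply followed by the matching IMA hook, with the same error handling.

	error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
	if (error)
		goto exit;
	/* Measure the file before it is used; the CONFIG_IMA=n stubs in
	 * include/linux/ima.h always return 0, so call sites need no #ifdef. */
	error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
	if (error)
		goto exit;

The same pairing appears below for open_exec(), may_open(), __link_path_walk(), the bprm check, mmap, and the inode/file allocation and free paths.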
diff --git a/fs/exec.c b/fs/exec.c index 02d2e120542d..9c789a525cc4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -128,6 +129,9 @@ asmlinkage long sys_uselib(const char __user * library) goto exit; error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); + if (error) + goto exit; + error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; @@ -681,6 +685,9 @@ struct file *open_exec(const char *name) goto out_path_put; err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); + if (err) + goto out_path_put; + err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; @@ -1207,6 +1214,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } #endif retval = security_bprm_check(bprm); + if (retval) + return retval; + retval = ima_bprm_check(bprm); if (retval) return retval; diff --git a/fs/file_table.c b/fs/file_table.c index 0fbcacc3ea75..55895ccc08c6 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -276,6 +277,7 @@ void __fput(struct file *file) if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); + ima_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); diff --git a/fs/inode.c b/fs/inode.c index 098a2443196f..ed22b14f2202 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -144,13 +145,13 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_cdev = NULL; inode->i_rdev = 0; inode->dirtied_when = 0; - if (security_inode_alloc(inode)) { - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; - } + + if (security_inode_alloc(inode)) + goto out_free_inode; + + /* allocate and initialize an i_integrity */ + if (ima_inode_alloc(inode)) + goto out_free_security; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); @@ -186,6 +187,15 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_mapping = mapping; return inode; + +out_free_security: + security_inode_free(inode); +out_free_inode: + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; } EXPORT_SYMBOL(inode_init_always); diff --git a/fs/namei.c b/fs/namei.c index af3783fff1de..734f2b5591bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -860,6 +861,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) err = exec_permission_lite(inode); if (err == -EAGAIN) err = vfs_permission(nd, MAY_EXEC); + if (!err) + err = ima_path_check(&nd->path, MAY_EXEC); if (err) break; @@ -1525,6 +1528,11 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) error = vfs_permission(nd, acc_mode); if (error) return error; + + error = ima_path_check(&nd->path, + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + if (error) + return error; /* * An append-only file must be opened in append mode for writing. 
*/ diff --git a/include/linux/ima.h b/include/linux/ima.h new file mode 100644 index 000000000000..4ed1e4d962e2 --- /dev/null +++ b/include/linux/ima.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2008 IBM Corporation + * Author: Mimi Zohar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2 of the License. + */ + +#include + +#ifndef _LINUX_IMA_H +#define _LINUX_IMA_H + +static inline int ima_bprm_check(struct linux_binprm *bprm) +{ + return 0; +} + +static inline int ima_inode_alloc(struct inode *inode) +{ + return 0; +} + +static inline void ima_inode_free(struct inode *inode) +{ + return; +} + +static inline int ima_path_check(struct path *path, int mask) +{ + return 0; +} + +static inline void ima_file_free(struct file *file) +{ + return; +} + +static inline int ima_file_mmap(struct file *file, unsigned long prot) +{ + return 0; +} +#endif /* _LINUX_IMA_H */ diff --git a/mm/mmap.c b/mm/mmap.c index d4855a682ab6..c3647f3b0621 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1048,6 +1049,9 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } error = security_file_mmap(file, reqprot, prot, flags, addr, 0); + if (error) + return error; + error = ima_file_mmap(file, prot); if (error) return error; -- cgit v1.2.3-71-gd317 From 3323eec921efd815178a23107ab63588c605c0b2 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:06:58 -0500 Subject: integrity: IMA as an integrity service provider IMA provides hardware (TPM) based measurement and attestation for file measurements. As the Trusted Computing (TPM) model requires, IMA measures all files before they are accessed in any way (on the integrity_bprm_check, integrity_path_check and integrity_file_mmap hooks), and commits the measurements to the TPM. Once added to the TPM, measurements can not be removed. In addition, IMA maintains a list of these file measurements, which can be used to validate the aggregate value stored in the TPM. The TPM can sign these measurements, and thus the system can prove, to itself and to a third party, the system's integrity in a way that cannot be circumvented by malicious or compromised software. 
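To make the validation step concrete: a verifier fetches the binary measurement list from securityfs together with a TPM quote of the aggregate PCR, replays the list by simulating the TPM extend operation over each entry's template hash, and compares the result with the quoted value. The sketch below is an illustrative user-space fragment, not part of this patch; sha1() is an assumed helper standing in for whatever hash library the verifier uses.

#include <stddef.h>
#include <string.h>

#define SHA1_LEN 20

/* Assumed helper: writes the SHA-1 of data[0..len) into out. */
extern void sha1(const void *data, size_t len, unsigned char out[SHA1_LEN]);

static int replay_matches_quote(const unsigned char (*template_hash)[SHA1_LEN],
				size_t count,
				const unsigned char quote[SHA1_LEN])
{
	unsigned char acc[SHA1_LEN] = { 0 };	/* PCRs start out all zeroes */
	unsigned char buf[2 * SHA1_LEN];
	size_t i;

	for (i = 0; i < count; i++) {
		/* TPM extend semantics: new = SHA1(old || measurement) */
		memcpy(buf, acc, SHA1_LEN);
		memcpy(buf + SHA1_LEN, template_hash[i], SHA1_LEN);
		sha1(buf, sizeof(buf), acc);
	}
	return memcmp(acc, quote, SHA1_LEN) == 0;
}

Violation entries are deliberately recorded with zeroed digests while the PCR is extended with 0xff bytes, so a replay over a list that contains a violation, or that has been tampered with, will not match the quote.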
- alloc ima_template_entry before calling ima_store_template() - log ima_add_boot_aggregate() failure - removed unused IMA_TEMPLATE_NAME_LEN - replaced hard coded string length with #define name Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- Documentation/kernel-parameters.txt | 9 ++ include/linux/audit.h | 5 + include/linux/ima.h | 10 ++ security/Kconfig | 5 +- security/Makefile | 4 + security/integrity/ima/Kconfig | 49 +++++++ security/integrity/ima/Makefile | 9 ++ security/integrity/ima/ima.h | 135 +++++++++++++++++ security/integrity/ima/ima_api.c | 190 ++++++++++++++++++++++++ security/integrity/ima/ima_audit.c | 78 ++++++++++ security/integrity/ima/ima_crypto.c | 140 ++++++++++++++++++ security/integrity/ima/ima_iint.c | 185 ++++++++++++++++++++++++ security/integrity/ima/ima_init.c | 90 ++++++++++++ security/integrity/ima/ima_main.c | 280 ++++++++++++++++++++++++++++++++++++ security/integrity/ima/ima_policy.c | 126 ++++++++++++++++ security/integrity/ima/ima_queue.c | 140 ++++++++++++++++++ 16 files changed, 1454 insertions(+), 1 deletion(-) create mode 100644 security/integrity/ima/Kconfig create mode 100644 security/integrity/ima/Makefile create mode 100644 security/integrity/ima/ima.h create mode 100644 security/integrity/ima/ima_api.c create mode 100644 security/integrity/ima/ima_audit.c create mode 100644 security/integrity/ima/ima_crypto.c create mode 100644 security/integrity/ima/ima_iint.c create mode 100644 security/integrity/ima/ima_init.c create mode 100644 security/integrity/ima/ima_main.c create mode 100644 security/integrity/ima/ima_policy.c create mode 100644 security/integrity/ima/ima_queue.c (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7c67b94d1823..31e0c2c3c6e3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -895,6 +895,15 @@ and is between 256 and 4096 characters. It is defined in the file ihash_entries= [KNL] Set number of hash buckets for inode cache. + ima_audit= [IMA] + Format: { "0" | "1" } + 0 -- integrity auditing messages. (Default) + 1 -- enable informational integrity auditing messages. + + ima_hash= [IMA] + Formt: { "sha1" | "md5" } + default: "sha1" + in2000= [HW,SCSI] See header of drivers/scsi/in2000.c. diff --git a/include/linux/audit.h b/include/linux/audit.h index 26c4f6f65a46..8d1f67789b53 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -125,6 +125,11 @@ #define AUDIT_LAST_KERN_ANOM_MSG 1799 #define AUDIT_ANOM_PROMISCUOUS 1700 /* Device changed promiscuous mode */ #define AUDIT_ANOM_ABEND 1701 /* Process ended abnormally */ +#define AUDIT_INTEGRITY_DATA 1800 /* Data integrity verification */ +#define AUDIT_INTEGRITY_METADATA 1801 /* Metadata integrity verification */ +#define AUDIT_INTEGRITY_STATUS 1802 /* Integrity enable status */ +#define AUDIT_INTEGRITY_HASH 1803 /* Integrity HASH type */ +#define AUDIT_INTEGRITY_PCR 1804 /* PCR invalidation msgs */ #define AUDIT_KERNEL 2000 /* Asynchronous audit record. NOT A REQUEST. 
*/ diff --git a/include/linux/ima.h b/include/linux/ima.h index 4ed1e4d962e2..dcc3664feee8 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -12,6 +12,15 @@ #ifndef _LINUX_IMA_H #define _LINUX_IMA_H +#ifdef CONFIG_IMA +extern int ima_bprm_check(struct linux_binprm *bprm); +extern int ima_inode_alloc(struct inode *inode); +extern void ima_inode_free(struct inode *inode); +extern int ima_path_check(struct path *path, int mask); +extern void ima_file_free(struct file *file); +extern int ima_file_mmap(struct file *file, unsigned long prot); + +#else static inline int ima_bprm_check(struct linux_binprm *bprm) { return 0; @@ -41,4 +50,5 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) { return 0; } +#endif /* CONFIG_IMA_H */ #endif /* _LINUX_IMA_H */ diff --git a/security/Kconfig b/security/Kconfig index d9f47ce7e207..a79b23f73d03 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -55,7 +55,8 @@ config SECURITYFS bool "Enable the securityfs filesystem" help This will build the securityfs filesystem. It is currently used by - the TPM bios character driver. It is not used by SELinux or SMACK. + the TPM bios character driver and IMA, an integrity provider. It is + not used by SELinux or SMACK. If you are unsure how to answer this question, answer N. @@ -126,5 +127,7 @@ config SECURITY_DEFAULT_MMAP_MIN_ADDR source security/selinux/Kconfig source security/smack/Kconfig +source security/integrity/ima/Kconfig + endmenu diff --git a/security/Makefile b/security/Makefile index c05c127fff9a..595536cbffb2 100644 --- a/security/Makefile +++ b/security/Makefile @@ -17,3 +17,7 @@ obj-$(CONFIG_SECURITY_SELINUX) += selinux/built-in.o obj-$(CONFIG_SECURITY_SMACK) += smack/built-in.o obj-$(CONFIG_SECURITY_ROOTPLUG) += root_plug.o obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o + +# Object integrity file lists +subdir-$(CONFIG_IMA) += integrity/ima +obj-$(CONFIG_IMA) += integrity/ima/built-in.o diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig new file mode 100644 index 000000000000..2a761c8ac996 --- /dev/null +++ b/security/integrity/ima/Kconfig @@ -0,0 +1,49 @@ +# IBM Integrity Measurement Architecture +# +config IMA + bool "Integrity Measurement Architecture(IMA)" + depends on ACPI + select SECURITYFS + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + select TCG_TPM + select TCG_TIS + help + The Trusted Computing Group(TCG) runtime Integrity + Measurement Architecture(IMA) maintains a list of hash + values of executables and other sensitive system files, + as they are read or executed. If an attacker manages + to change the contents of an important system file + being measured, we can tell. + + If your system has a TPM chip, then IMA also maintains + an aggregate integrity value over this list inside the + TPM hardware, so that the TPM can prove to a third party + whether or not critical system files have been modified. + Read + to learn more about IMA. + If unsure, say N. + +config IMA_MEASURE_PCR_IDX + int + depends on IMA + range 8 14 + default 10 + help + IMA_MEASURE_PCR_IDX determines the TPM PCR register index + that IMA uses to maintain the integrity aggregate of the + measurement list. If unsure, use the default 10. + +config IMA_AUDIT + bool + depends on IMA + default y + help + This option adds a kernel parameter 'ima_audit', which + allows informational auditing messages to be enabled + at boot. 
If this option is selected, informational integrity + auditing messages can be enabled with 'ima_audit=1' on + the kernel command line. + diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile new file mode 100644 index 000000000000..9d6bf973b9be --- /dev/null +++ b/security/integrity/ima/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for building Trusted Computing Group's(TCG) runtime Integrity +# Measurement Architecture(IMA). +# + +obj-$(CONFIG_IMA) += ima.o + +ima-y := ima_queue.o ima_init.o ima_main.o ima_crypto.o ima_api.o \ + ima_policy.o ima_iint.o ima_audit.o diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h new file mode 100644 index 000000000000..bfa72ed41b9b --- /dev/null +++ b/security/integrity/ima/ima.h @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2005,2006,2007,2008 IBM Corporation + * + * Authors: + * Reiner Sailer + * Mimi Zohar + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * File: ima.h + * internal Integrity Measurement Architecture (IMA) definitions + */ + +#ifndef __LINUX_IMA_H +#define __LINUX_IMA_H + +#include +#include +#include +#include +#include +#include + +enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_ASCII }; +enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8 }; + +/* digest size for IMA, fits SHA1 or MD5 */ +#define IMA_DIGEST_SIZE 20 +#define IMA_EVENT_NAME_LEN_MAX 255 + +#define IMA_HASH_BITS 9 +#define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS) + +/* set during initialization */ +extern int ima_initialized; +extern int ima_used_chip; +extern char *ima_hash; + +/* IMA inode template definition */ +struct ima_template_data { + u8 digest[IMA_DIGEST_SIZE]; /* sha1/md5 measurement hash */ + char file_name[IMA_EVENT_NAME_LEN_MAX + 1]; /* name + \0 */ +}; + +struct ima_template_entry { + u8 digest[IMA_DIGEST_SIZE]; /* sha1 or md5 measurement hash */ + char *template_name; + int template_len; + struct ima_template_data template; +}; + +struct ima_queue_entry { + struct hlist_node hnext; /* place in hash collision list */ + struct list_head later; /* place in ima_measurements list */ + struct ima_template_entry *entry; +}; +extern struct list_head ima_measurements; /* list of all measurements */ + +/* declarations */ +void integrity_audit_msg(int audit_msgno, struct inode *inode, + const unsigned char *fname, const char *op, + const char *cause, int result, int info); + +/* Internal IMA function definitions */ +void ima_iintcache_init(void); +int ima_init(void); +int ima_add_template_entry(struct ima_template_entry *entry, int violation, + const char *op, struct inode *inode); +int ima_calc_hash(struct file *file, char *digest); +int ima_calc_template_hash(int template_len, void *template, char *digest); +int ima_calc_boot_aggregate(char *digest); +void ima_add_violation(struct inode *inode, const unsigned char *filename, + const char *op, const char *cause); + +/* + * used to protect h_table and sha_table + */ +extern spinlock_t ima_queue_lock; + +struct ima_h_table { + atomic_long_t len; /* number of stored measurements in the list */ + atomic_long_t violations; + struct hlist_head queue[IMA_MEASURE_HTABLE_SIZE]; +}; +extern struct ima_h_table ima_htable; + +static inline unsigned long ima_hash_key(u8 *digest) +{ + return hash_long(*digest, IMA_HASH_BITS); +} + +/* iint cache flags */ +#define IMA_MEASURED 1 + +/* integrity data associated with an inode */ 
+struct ima_iint_cache { + u64 version; /* track inode changes */ + unsigned long flags; + u8 digest[IMA_DIGEST_SIZE]; + struct mutex mutex; /* protects: version, flags, digest */ + long readcount; /* measured files readcount */ + long writecount; /* measured files writecount */ + struct kref refcount; /* ima_iint_cache reference count */ + struct rcu_head rcu; +}; + +/* LIM API function definitions */ +int ima_must_measure(struct ima_iint_cache *iint, struct inode *inode, + int mask, int function); +int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file); +void ima_store_measurement(struct ima_iint_cache *iint, struct file *file, + const unsigned char *filename); +int ima_store_template(struct ima_template_entry *entry, int violation, + struct inode *inode); + +/* radix tree calls to lookup, insert, delete + * integrity data associated with an inode. + */ +struct ima_iint_cache *ima_iint_insert(struct inode *inode); +struct ima_iint_cache *ima_iint_find_get(struct inode *inode); +struct ima_iint_cache *ima_iint_find_insert_get(struct inode *inode); +void ima_iint_delete(struct inode *inode); +void iint_free(struct kref *kref); +void iint_rcu_free(struct rcu_head *rcu); + +/* IMA policy related functions */ +enum ima_hooks { PATH_CHECK = 1, FILE_MMAP, BPRM_CHECK }; + +int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask); +void ima_init_policy(void); +void ima_update_policy(void); +#endif diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c new file mode 100644 index 000000000000..a148a25804f6 --- /dev/null +++ b/security/integrity/ima/ima_api.c @@ -0,0 +1,190 @@ +/* + * Copyright (C) 2008 IBM Corporation + * + * Author: Mimi Zohar + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * File: ima_api.c + * Implements must_measure, collect_measurement, store_measurement, + * and store_template. + */ +#include + +#include "ima.h" +static char *IMA_TEMPLATE_NAME = "ima"; + +/* + * ima_store_template - store ima template measurements + * + * Calculate the hash of a template entry, add the template entry + * to an ordered list of measurement entries maintained inside the kernel, + * and also update the aggregate integrity value (maintained inside the + * configured TPM PCR) over the hashes of the current list of measurement + * entries. + * + * Applications retrieve the current kernel-held measurement list through + * the securityfs entries in /sys/kernel/security/ima. The signed aggregate + * TPM PCR (called quote) can be retrieved using a TPM user space library + * and is used to validate the measurement list. 
+ * + * Returns 0 on success, error code otherwise + */ +int ima_store_template(struct ima_template_entry *entry, + int violation, struct inode *inode) +{ + const char *op = "add_template_measure"; + const char *audit_cause = "hashing_error"; + int result; + + memset(entry->digest, 0, sizeof(entry->digest)); + entry->template_name = IMA_TEMPLATE_NAME; + entry->template_len = sizeof(entry->template); + + if (!violation) { + result = ima_calc_template_hash(entry->template_len, + &entry->template, + entry->digest); + if (result < 0) { + integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, + entry->template_name, op, + audit_cause, result, 0); + return result; + } + } + result = ima_add_template_entry(entry, violation, op, inode); + return result; +} + +/* + * ima_add_violation - add violation to measurement list. + * + * Violations are flagged in the measurement list with zero hash values. + * By extending the PCR with 0xFF's instead of with zeroes, the PCR + * value is invalidated. + */ +void ima_add_violation(struct inode *inode, const unsigned char *filename, + const char *op, const char *cause) +{ + struct ima_template_entry *entry; + int violation = 1; + int result; + + /* can overflow, only indicator */ + atomic_long_inc(&ima_htable.violations); + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + result = -ENOMEM; + goto err_out; + } + memset(&entry->template, 0, sizeof(entry->template)); + strncpy(entry->template.file_name, filename, IMA_EVENT_NAME_LEN_MAX); + result = ima_store_template(entry, violation, inode); + if (result < 0) + kfree(entry); +err_out: + integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, + op, cause, result, 0); +} + +/** + * ima_must_measure - measure decision based on policy. + * @inode: pointer to inode to measure + * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXECUTE) + * @function: calling function (PATH_CHECK, BPRM_CHECK, FILE_MMAP) + * + * The policy is defined in terms of keypairs: + * subj=, obj=, type=, func=, mask=, fsmagic= + * subj,obj, and type: are LSM specific. + * func: PATH_CHECK | BPRM_CHECK | FILE_MMAP + * mask: contains the permission mask + * fsmagic: hex value + * + * Must be called with iint->mutex held. + * + * Return 0 to measure. Return 1 if already measured. + * For matching a DONT_MEASURE policy, no policy, or other + * error, return an error code. +*/ +int ima_must_measure(struct ima_iint_cache *iint, struct inode *inode, + int mask, int function) +{ + int must_measure; + + if (iint->flags & IMA_MEASURED) + return 1; + + must_measure = ima_match_policy(inode, function, mask); + return must_measure ? 0 : -EACCES; +} + +/* + * ima_collect_measurement - collect file measurement + * + * Calculate the file hash, if it doesn't already exist, + * storing the measurement and i_version in the iint. + * + * Must be called with iint->mutex held. + * + * Return 0 on success, error code otherwise + */ +int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file) +{ + int result = -EEXIST; + + if (!(iint->flags & IMA_MEASURED)) { + u64 i_version = file->f_dentry->d_inode->i_version; + + memset(iint->digest, 0, IMA_DIGEST_SIZE); + result = ima_calc_hash(file, iint->digest); + if (!result) + iint->version = i_version; + } + return result; +} + +/* + * ima_store_measurement - store file measurement + * + * Create an "ima" template and then store the template by calling + * ima_store_template. 
+ * + * We only get here if the inode has not already been measured, + * but the measurement could already exist: + * - multiple copies of the same file on either the same or + * different filesystems. + * - the inode was previously flushed as well as the iint info, + * containing the hashing info. + * + * Must be called with iint->mutex held. + */ +void ima_store_measurement(struct ima_iint_cache *iint, struct file *file, + const unsigned char *filename) +{ + const char *op = "add_template_measure"; + const char *audit_cause = "ENOMEM"; + int result = -ENOMEM; + struct inode *inode = file->f_dentry->d_inode; + struct ima_template_entry *entry; + int violation = 0; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, + op, audit_cause, result, 0); + return; + } + memset(&entry->template, 0, sizeof(entry->template)); + memcpy(entry->template.digest, iint->digest, IMA_DIGEST_SIZE); + strncpy(entry->template.file_name, filename, IMA_EVENT_NAME_LEN_MAX); + + result = ima_store_template(entry, violation, inode); + if (!result) + iint->flags |= IMA_MEASURED; + else + kfree(entry); +} diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c new file mode 100644 index 000000000000..8a0f1e23ccf1 --- /dev/null +++ b/security/integrity/ima/ima_audit.c @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2008 IBM Corporation + * Author: Mimi Zohar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2 of the License. + * + * File: integrity_audit.c + * Audit calls for the integrity subsystem + */ + +#include +#include +#include "ima.h" + +static int ima_audit; + +#ifdef CONFIG_IMA_AUDIT + +/* ima_audit_setup - enable informational auditing messages */ +static int __init ima_audit_setup(char *str) +{ + unsigned long audit; + int rc; + char *op; + + rc = strict_strtoul(str, 0, &audit); + if (rc || audit > 1) + printk(KERN_INFO "ima: invalid ima_audit value\n"); + else + ima_audit = audit; + op = ima_audit ? 
"ima_audit_enabled" : "ima_audit_not_enabled"; + integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, NULL, op, 0, 0); + return 1; +} +__setup("ima_audit=", ima_audit_setup); +#endif + +void integrity_audit_msg(int audit_msgno, struct inode *inode, + const unsigned char *fname, const char *op, + const char *cause, int result, int audit_info) +{ + struct audit_buffer *ab; + + if (!ima_audit && audit_info == 1) /* Skip informational messages */ + return; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, audit_msgno); + audit_log_format(ab, "integrity: pid=%d uid=%u auid=%u", + current->pid, current->cred->uid, + audit_get_loginuid(current)); + audit_log_task_context(ab); + switch (audit_msgno) { + case AUDIT_INTEGRITY_DATA: + case AUDIT_INTEGRITY_METADATA: + case AUDIT_INTEGRITY_PCR: + audit_log_format(ab, " op=%s cause=%s", op, cause); + break; + case AUDIT_INTEGRITY_HASH: + audit_log_format(ab, " op=%s hash=%s", op, cause); + break; + case AUDIT_INTEGRITY_STATUS: + default: + audit_log_format(ab, " op=%s", op); + } + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, current->comm); + if (fname) { + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, fname); + } + if (inode) + audit_log_format(ab, " dev=%s ino=%lu", + inode->i_sb->s_id, inode->i_ino); + audit_log_format(ab, " res=%d", result); + audit_log_end(ab); +} diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c new file mode 100644 index 000000000000..c2a46e40999d --- /dev/null +++ b/security/integrity/ima/ima_crypto.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2005,2006,2007,2008 IBM Corporation + * + * Authors: + * Mimi Zohar + * Kylene Hall + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2 of the License. 
+ * + * File: ima_crypto.c + * Calculates md5/sha1 file hash, template hash, boot-aggreate hash + */ + +#include +#include +#include +#include +#include +#include "ima.h" + +static int init_desc(struct hash_desc *desc) +{ + int rc; + + desc->tfm = crypto_alloc_hash(ima_hash, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(desc->tfm)) { + pr_info("failed to load %s transform: %ld\n", + ima_hash, PTR_ERR(desc->tfm)); + rc = PTR_ERR(desc->tfm); + return rc; + } + desc->flags = 0; + rc = crypto_hash_init(desc); + if (rc) + crypto_free_hash(desc->tfm); + return rc; +} + +/* + * Calculate the MD5/SHA1 file digest + */ +int ima_calc_hash(struct file *file, char *digest) +{ + struct hash_desc desc; + struct scatterlist sg[1]; + loff_t i_size; + char *rbuf; + int rc, offset = 0; + + rc = init_desc(&desc); + if (rc != 0) + return rc; + + rbuf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!rbuf) { + rc = -ENOMEM; + goto out; + } + i_size = i_size_read(file->f_dentry->d_inode); + while (offset < i_size) { + int rbuf_len; + + rbuf_len = kernel_read(file, offset, rbuf, PAGE_SIZE); + if (rbuf_len < 0) { + rc = rbuf_len; + break; + } + offset += rbuf_len; + sg_set_buf(sg, rbuf, rbuf_len); + + rc = crypto_hash_update(&desc, sg, rbuf_len); + if (rc) + break; + } + kfree(rbuf); + if (!rc) + rc = crypto_hash_final(&desc, digest); +out: + crypto_free_hash(desc.tfm); + return rc; +} + +/* + * Calculate the hash of a given template + */ +int ima_calc_template_hash(int template_len, void *template, char *digest) +{ + struct hash_desc desc; + struct scatterlist sg[1]; + int rc; + + rc = init_desc(&desc); + if (rc != 0) + return rc; + + sg_set_buf(sg, template, template_len); + rc = crypto_hash_update(&desc, sg, template_len); + if (!rc) + rc = crypto_hash_final(&desc, digest); + crypto_free_hash(desc.tfm); + return rc; +} + +static void ima_pcrread(int idx, u8 *pcr) +{ + if (!ima_used_chip) + return; + + if (tpm_pcr_read(TPM_ANY_NUM, idx, pcr) != 0) + pr_err("Error Communicating to TPM chip\n"); +} + +/* + * Calculate the boot aggregate hash + */ +int ima_calc_boot_aggregate(char *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + u8 pcr_i[IMA_DIGEST_SIZE]; + int rc, i; + + rc = init_desc(&desc); + if (rc != 0) + return rc; + + /* cumulative sha1 over tpm registers 0-7 */ + for (i = TPM_PCR0; i < TPM_PCR8; i++) { + ima_pcrread(i, pcr_i); + /* now accumulate with current aggregate */ + sg_init_one(&sg, pcr_i, IMA_DIGEST_SIZE); + rc = crypto_hash_update(&desc, &sg, IMA_DIGEST_SIZE); + } + if (!rc) + crypto_hash_final(&desc, digest); + crypto_free_hash(desc.tfm); + return rc; +} diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c new file mode 100644 index 000000000000..750db3c993a7 --- /dev/null +++ b/security/integrity/ima/ima_iint.c @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2008 IBM Corporation + * + * Authors: + * Mimi Zohar + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * File: ima_iint.c + * - implements the IMA hooks: ima_inode_alloc, ima_inode_free + * - cache integrity information associated with an inode + * using a radix tree. 
+ */ +#include +#include +#include +#include "ima.h" + +#define ima_iint_delete ima_inode_free + +RADIX_TREE(ima_iint_store, GFP_ATOMIC); +DEFINE_SPINLOCK(ima_iint_lock); + +static struct kmem_cache *iint_cache __read_mostly; + +/* ima_iint_find_get - return the iint associated with an inode + * + * ima_iint_find_get gets a reference to the iint. Caller must + * remember to put the iint reference. + */ +struct ima_iint_cache *ima_iint_find_get(struct inode *inode) +{ + struct ima_iint_cache *iint; + + rcu_read_lock(); + iint = radix_tree_lookup(&ima_iint_store, (unsigned long)inode); + if (!iint) + goto out; + kref_get(&iint->refcount); +out: + rcu_read_unlock(); + return iint; +} + +/* Allocate memory for the iint associated with the inode + * from the iint_cache slab, initialize the iint, and + * insert it into the radix tree. + * + * On success return a pointer to the iint; on failure return NULL. + */ +struct ima_iint_cache *ima_iint_insert(struct inode *inode) +{ + struct ima_iint_cache *iint = NULL; + int rc = 0; + + if (!ima_initialized) + return iint; + iint = kmem_cache_alloc(iint_cache, GFP_KERNEL); + if (!iint) + return iint; + + rc = radix_tree_preload(GFP_KERNEL); + if (rc < 0) + goto out; + + spin_lock(&ima_iint_lock); + rc = radix_tree_insert(&ima_iint_store, (unsigned long)inode, iint); + spin_unlock(&ima_iint_lock); +out: + if (rc < 0) { + kmem_cache_free(iint_cache, iint); + if (rc == -EEXIST) { + iint = radix_tree_lookup(&ima_iint_store, + (unsigned long)inode); + } else + iint = NULL; + } + radix_tree_preload_end(); + return iint; +} + +/** + * ima_inode_alloc - allocate an iint associated with an inode + * @inode: pointer to the inode + * + * Return 0 on success, 1 on failure. + */ +int ima_inode_alloc(struct inode *inode) +{ + struct ima_iint_cache *iint; + + if (!ima_initialized) + return 0; + + iint = ima_iint_insert(inode); + if (!iint) + return 1; + return 0; +} + +/* ima_iint_find_insert_get - get the iint associated with an inode + * + * Most insertions are done at inode_alloc, except those allocated + * before late_initcall. When the iint does not exist, allocate it, + * initialize and insert it, and increment the iint refcount. + * + * (Can't initialize at security_initcall before any inodes are + * allocated, got to wait at least until proc_init.) + * + * Return the iint. + */ +struct ima_iint_cache *ima_iint_find_insert_get(struct inode *inode) +{ + struct ima_iint_cache *iint = NULL; + + iint = ima_iint_find_get(inode); + if (iint) + return iint; + + iint = ima_iint_insert(inode); + if (iint) + kref_get(&iint->refcount); + + return iint; +} + +/* iint_free - called when the iint refcount goes to zero */ +void iint_free(struct kref *kref) +{ + struct ima_iint_cache *iint = container_of(kref, struct ima_iint_cache, + refcount); + iint->version = 0; + iint->flags = 0UL; + kref_set(&iint->refcount, 1); + kmem_cache_free(iint_cache, iint); +} + +void iint_rcu_free(struct rcu_head *rcu_head) +{ + struct ima_iint_cache *iint = container_of(rcu_head, + struct ima_iint_cache, rcu); + kref_put(&iint->refcount, iint_free); +} + +/** + * ima_iint_delete - called on integrity_inode_free + * @inode: pointer to the inode + * + * Free the integrity information(iint) associated with an inode. 
+ */ +void ima_iint_delete(struct inode *inode) +{ + struct ima_iint_cache *iint; + + if (!ima_initialized) + return; + spin_lock(&ima_iint_lock); + iint = radix_tree_delete(&ima_iint_store, (unsigned long)inode); + spin_unlock(&ima_iint_lock); + if (iint) + call_rcu(&iint->rcu, iint_rcu_free); +} + +static void init_once(void *foo) +{ + struct ima_iint_cache *iint = foo; + + memset(iint, 0, sizeof *iint); + iint->version = 0; + iint->flags = 0UL; + mutex_init(&iint->mutex); + iint->readcount = 0; + iint->writecount = 0; + kref_set(&iint->refcount, 1); +} + +void ima_iintcache_init(void) +{ + iint_cache = + kmem_cache_create("iint_cache", sizeof(struct ima_iint_cache), 0, + SLAB_PANIC, init_once); +} diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c new file mode 100644 index 000000000000..e0f02e328d77 --- /dev/null +++ b/security/integrity/ima/ima_init.c @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2005,2006,2007,2008 IBM Corporation + * + * Authors: + * Reiner Sailer + * Leendert van Doorn + * Mimi Zohar + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * File: ima_init.c + * initialization and cleanup functions + */ +#include +#include +#include +#include "ima.h" + +/* name for boot aggregate entry */ +static char *boot_aggregate_name = "boot_aggregate"; +int ima_used_chip; + +/* Add the boot aggregate to the IMA measurement list and extend + * the PCR register. + * + * Calculate the boot aggregate, a SHA1 over tpm registers 0-7, + * assuming a TPM chip exists, and zeroes if the TPM chip does not + * exist. Add the boot aggregate measurement to the measurement + * list and extend the PCR register. + * + * If a tpm chip does not exist, indicate the core root of trust is + * not hardware based by invalidating the aggregate PCR value. + * (The aggregate PCR value is invalidated by adding one value to + * the measurement list and extending the aggregate PCR value with + * a different value.) Violations add a zero entry to the measurement + * list and extend the aggregate PCR value with ff...ff's. 
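The aggregate described above can be recomputed outside the kernel to check the boot_aggregate entry. A minimal user-space sketch, assuming the eight 20-byte PCR values have already been read from the TPM and that OpenSSL's one-shot SHA1() helper is available; boot_aggregate_check() is a made-up name:

#include <string.h>
#include <openssl/sha.h>

#define NR_AGGREGATE_PCRS	8
#define PCR_SIZE		20

/* boot aggregate = SHA1(PCR0 || PCR1 || ... || PCR7), mirroring the
 * update loop in ima_calc_boot_aggregate(). Returns 0 when it matches
 * the digest recorded for the "boot_aggregate" entry.
 */
int boot_aggregate_check(const unsigned char pcr[NR_AGGREGATE_PCRS][PCR_SIZE],
			 const unsigned char *recorded)
{
	unsigned char concat[NR_AGGREGATE_PCRS * PCR_SIZE];
	unsigned char digest[SHA_DIGEST_LENGTH];
	int i;

	for (i = 0; i < NR_AGGREGATE_PCRS; i++)
		memcpy(concat + i * PCR_SIZE, pcr[i], PCR_SIZE);
	SHA1(concat, sizeof(concat), digest);
	return memcmp(digest, recorded, SHA_DIGEST_LENGTH);
}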
+ */ +static void ima_add_boot_aggregate(void) +{ + struct ima_template_entry *entry; + const char *op = "add_boot_aggregate"; + const char *audit_cause = "ENOMEM"; + int result = -ENOMEM; + int violation = 1; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto err_out; + + memset(&entry->template, 0, sizeof(entry->template)); + strncpy(entry->template.file_name, boot_aggregate_name, + IMA_EVENT_NAME_LEN_MAX); + if (ima_used_chip) { + violation = 0; + result = ima_calc_boot_aggregate(entry->template.digest); + if (result < 0) { + audit_cause = "hashing_error"; + kfree(entry); + goto err_out; + } + } + result = ima_store_template(entry, violation, NULL); + if (result < 0) + kfree(entry); + return; +err_out: + integrity_audit_msg(AUDIT_INTEGRITY_PCR, NULL, boot_aggregate_name, op, + audit_cause, result, 0); +} + +int ima_init(void) +{ + u8 pcr_i[IMA_DIGEST_SIZE]; + int rc; + + ima_used_chip = 0; + rc = tpm_pcr_read(TPM_ANY_NUM, 0, pcr_i); + if (rc == 0) + ima_used_chip = 1; + + if (!ima_used_chip) + pr_info("No TPM chip found, activating TPM-bypass!\n"); + + ima_add_boot_aggregate(); /* boot aggregate must be first entry */ + ima_init_policy(); + return 0; +} diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c new file mode 100644 index 000000000000..53cee4c512ce --- /dev/null +++ b/security/integrity/ima/ima_main.c @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2005,2006,2007,2008 IBM Corporation + * + * Authors: + * Reiner Sailer + * Serge Hallyn + * Kylene Hall + * Mimi Zohar + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * File: ima_main.c + * implements the IMA hooks: ima_bprm_check, ima_file_mmap, + * and ima_path_check. + */ +#include +#include +#include +#include +#include + +#include "ima.h" + +int ima_initialized; + +char *ima_hash = "sha1"; +static int __init hash_setup(char *str) +{ + const char *op = "hash_setup"; + const char *hash = "sha1"; + int result = 0; + int audit_info = 0; + + if (strncmp(str, "md5", 3) == 0) { + hash = "md5"; + ima_hash = str; + } else if (strncmp(str, "sha1", 4) != 0) { + hash = "invalid_hash_type"; + result = 1; + } + integrity_audit_msg(AUDIT_INTEGRITY_HASH, NULL, NULL, op, hash, + result, audit_info); + return 1; +} +__setup("ima_hash=", hash_setup); + +/** + * ima_file_free - called on __fput() + * @file: pointer to file structure being freed + * + * Flag files that changed, based on i_version; + * and decrement the iint readcount/writecount. + */ +void ima_file_free(struct file *file) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ima_iint_cache *iint; + + if (!ima_initialized || !S_ISREG(inode->i_mode)) + return; + iint = ima_iint_find_get(inode); + if (!iint) + return; + + mutex_lock(&iint->mutex); + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + iint->readcount--; + + if (file->f_mode & FMODE_WRITE) { + iint->writecount--; + if (iint->writecount == 0) { + if (iint->version != inode->i_version) + iint->flags &= ~IMA_MEASURED; + } + } + mutex_unlock(&iint->mutex); + kref_put(&iint->refcount, iint_free); +} + +/* ima_read_write_check - reflect possible reading/writing errors in the PCR. + * + * When opening a file for read, if the file is already open for write, + * the file could change, resulting in a file measurement error. 
+ * + * Opening a file for write, if the file is already open for read, results + * in a time of measure, time of use (ToMToU) error. + * + * In either case invalidate the PCR. + */ +enum iint_pcr_error { TOMTOU, OPEN_WRITERS }; +static void ima_read_write_check(enum iint_pcr_error error, + struct ima_iint_cache *iint, + struct inode *inode, + const unsigned char *filename) +{ + switch (error) { + case TOMTOU: + if (iint->readcount > 0) + ima_add_violation(inode, filename, "invalid_pcr", + "ToMToU"); + break; + case OPEN_WRITERS: + if (iint->writecount > 0) + ima_add_violation(inode, filename, "invalid_pcr", + "open_writers"); + break; + } +} + +static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, + const unsigned char *filename) +{ + int rc = 0; + + if (IS_ERR(file)) { + pr_info("%s dentry_open failed\n", filename); + return rc; + } + iint->readcount++; + + rc = ima_collect_measurement(iint, file); + if (!rc) + ima_store_measurement(iint, file, filename); + return rc; +} + +/** + * ima_path_check - based on policy, collect/store measurement. + * @path: contains a pointer to the path to be measured + * @mask: contains MAY_READ, MAY_WRITE or MAY_EXECUTE + * + * Measure the file being open for readonly, based on the + * ima_must_measure() policy decision. + * + * Keep read/write counters for all files, but only + * invalidate the PCR for measured files: + * - Opening a file for write when already open for read, + * results in a time of measure, time of use (ToMToU) error. + * - Opening a file for read when already open for write, + * could result in a file measurement error. + * + * Return 0 on success, an error code on failure. + * (Based on the results of appraise_measurement().) + */ +int ima_path_check(struct path *path, int mask) +{ + struct inode *inode = path->dentry->d_inode; + struct ima_iint_cache *iint; + struct file *file = NULL; + int rc; + + if (!ima_initialized || !S_ISREG(inode->i_mode)) + return 0; + iint = ima_iint_find_insert_get(inode); + if (!iint) + return 0; + + mutex_lock(&iint->mutex); + if ((mask & MAY_WRITE) || (mask == 0)) + iint->writecount++; + else if (mask & (MAY_READ | MAY_EXEC)) + iint->readcount++; + + rc = ima_must_measure(iint, inode, MAY_READ, PATH_CHECK); + if (rc < 0) + goto out; + + if ((mask & MAY_WRITE) || (mask == 0)) + ima_read_write_check(TOMTOU, iint, inode, + path->dentry->d_name.name); + + if ((mask & (MAY_WRITE | MAY_READ | MAY_EXEC)) != MAY_READ) + goto out; + + ima_read_write_check(OPEN_WRITERS, iint, inode, + path->dentry->d_name.name); + if (!(iint->flags & IMA_MEASURED)) { + struct dentry *dentry = dget(path->dentry); + struct vfsmount *mnt = mntget(path->mnt); + + file = dentry_open(dentry, mnt, O_RDONLY, current->cred); + rc = get_path_measurement(iint, file, dentry->d_name.name); + } +out: + mutex_unlock(&iint->mutex); + if (file) + fput(file); + kref_put(&iint->refcount, iint_free); + return 0; +} + +static int process_measurement(struct file *file, const unsigned char *filename, + int mask, int function) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ima_iint_cache *iint; + int rc; + + if (!ima_initialized || !S_ISREG(inode->i_mode)) + return 0; + iint = ima_iint_find_insert_get(inode); + if (!iint) + return -ENOMEM; + + mutex_lock(&iint->mutex); + rc = ima_must_measure(iint, inode, mask, function); + if (rc != 0) + goto out; + + rc = ima_collect_measurement(iint, file); + if (!rc) + ima_store_measurement(iint, file, filename); +out: + mutex_unlock(&iint->mutex); + kref_put(&iint->refcount, 
iint_free); + return rc; +} + +/** + * ima_file_mmap - based on policy, collect/store measurement. + * @file: pointer to the file to be measured (May be NULL) + * @prot: contains the protection that will be applied by the kernel. + * + * Measure files being mmapped executable based on the ima_must_measure() + * policy decision. + * + * Return 0 on success, an error code on failure. + * (Based on the results of appraise_measurement().) + */ +int ima_file_mmap(struct file *file, unsigned long prot) +{ + int rc; + + if (!file) + return 0; + if (prot & PROT_EXEC) + rc = process_measurement(file, file->f_dentry->d_name.name, + MAY_EXEC, FILE_MMAP); + return 0; +} + +/** + * ima_bprm_check - based on policy, collect/store measurement. + * @bprm: contains the linux_binprm structure + * + * The OS protects against an executable file, already open for write, + * from being executed in deny_write_access() and an executable file, + * already open for execute, from being modified in get_write_access(). + * So we can be certain that what we verify and measure here is actually + * what is being executed. + * + * Return 0 on success, an error code on failure. + * (Based on the results of appraise_measurement().) + */ +int ima_bprm_check(struct linux_binprm *bprm) +{ + int rc; + + rc = process_measurement(bprm->file, bprm->filename, + MAY_EXEC, BPRM_CHECK); + return 0; +} + +static int __init init_ima(void) +{ + int error; + + ima_iintcache_init(); + error = ima_init(); + ima_initialized = 1; + return error; +} + +late_initcall(init_ima); /* Start IMA after the TPM is available */ + +MODULE_DESCRIPTION("Integrity Measurement Architecture"); +MODULE_LICENSE("GPL"); diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c new file mode 100644 index 000000000000..7c3d1ffb1472 --- /dev/null +++ b/security/integrity/ima/ima_policy.c @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2008 IBM Corporation + * Author: Mimi Zohar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2 of the License. 
+ * + * ima_policy.c + * - initialize default measure policy rules + * + */ +#include +#include +#include +#include +#include + +#include "ima.h" + +/* flags definitions */ +#define IMA_FUNC 0x0001 +#define IMA_MASK 0x0002 +#define IMA_FSMAGIC 0x0004 +#define IMA_UID 0x0008 + +enum ima_action { DONT_MEASURE, MEASURE }; + +struct ima_measure_rule_entry { + struct list_head list; + enum ima_action action; + unsigned int flags; + enum ima_hooks func; + int mask; + unsigned long fsmagic; + uid_t uid; +}; + +static struct ima_measure_rule_entry default_rules[] = { + {.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC, + .flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = DEBUGFS_MAGIC,.flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = TMPFS_MAGIC,.flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC, + .flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = 0xF97CFF8C,.flags = IMA_FSMAGIC}, + {.action = MEASURE,.func = FILE_MMAP,.mask = MAY_EXEC, + .flags = IMA_FUNC | IMA_MASK}, + {.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC, + .flags = IMA_FUNC | IMA_MASK}, + {.action = MEASURE,.func = PATH_CHECK,.mask = MAY_READ,.uid = 0, + .flags = IMA_FUNC | IMA_MASK | IMA_UID} +}; + +static LIST_HEAD(measure_default_rules); +static struct list_head *ima_measure; + +/** + * ima_match_rules - determine whether an inode matches the measure rule. + * @rule: a pointer to a rule + * @inode: a pointer to an inode + * @func: LIM hook identifier + * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) + * + * Returns true on rule match, false on failure. + */ +static bool ima_match_rules(struct ima_measure_rule_entry *rule, + struct inode *inode, enum ima_hooks func, int mask) +{ + struct task_struct *tsk = current; + + if ((rule->flags & IMA_FUNC) && rule->func != func) + return false; + if ((rule->flags & IMA_MASK) && rule->mask != mask) + return false; + if ((rule->flags & IMA_FSMAGIC) + && rule->fsmagic != inode->i_sb->s_magic) + return false; + if ((rule->flags & IMA_UID) && rule->uid != tsk->cred->uid) + return false; + return true; +} + +/** + * ima_match_policy - decision based on LSM and other conditions + * @inode: pointer to an inode for which the policy decision is being made + * @func: IMA hook identifier + * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) + * + * Measure decision based on func/mask/fsmagic and LSM(subj/obj/type) + * conditions. + * + * (There is no need for locking when walking the policy list, + * as elements in the list are never deleted, nor does the list + * change.) + */ +int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask) +{ + struct ima_measure_rule_entry *entry; + + list_for_each_entry(entry, ima_measure, list) { + bool rc; + + rc = ima_match_rules(entry, inode, func, mask); + if (rc) + return entry->action; + } + return 0; +} + +/** + * ima_init_policy - initialize the default measure rules. + * + * (Could use the default_rules directly, but in policy patch + * ima_measure points to either the measure_default_rules or the + * the new measure_policy_rules.) 
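For illustration, a caller such as ima_must_measure() (not shown in this patch) would map the action returned by ima_match_policy() onto a measure/skip decision roughly as in the sketch below, which as written would have to live in this file since enum ima_action is local to it; must_measure_sketch() is a hypothetical name and the real helper may differ:

/* Turn the policy walk into a yes/no decision: MEASURE means collect
 * and store a measurement, DONT_MEASURE or "no rule matched" (0)
 * means skip the file.
 */
static int must_measure_sketch(struct inode *inode, enum ima_hooks func,
			       int mask)
{
	int action = ima_match_policy(inode, func, mask);

	return action == MEASURE;
}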
+ */ +void ima_init_policy(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(default_rules); i++) + list_add_tail(&default_rules[i].list, &measure_default_rules); + ima_measure = &measure_default_rules; +} diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c new file mode 100644 index 000000000000..7ec94314ac0c --- /dev/null +++ b/security/integrity/ima/ima_queue.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2005,2006,2007,2008 IBM Corporation + * + * Authors: + * Serge Hallyn + * Reiner Sailer + * Mimi Zohar + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * File: ima_queue.c + * Implements queues that store template measurements and + * maintains aggregate over the stored measurements + * in the pre-configured TPM PCR (if available). + * The measurement list is append-only. No entry is + * ever removed or changed during the boot-cycle. + */ +#include +#include +#include "ima.h" + +LIST_HEAD(ima_measurements); /* list of all measurements */ + +/* key: inode (before secure-hashing a file) */ +struct ima_h_table ima_htable = { + .len = ATOMIC_LONG_INIT(0), + .violations = ATOMIC_LONG_INIT(0), + .queue[0 ... IMA_MEASURE_HTABLE_SIZE - 1] = HLIST_HEAD_INIT +}; + +/* mutex protects atomicity of extending measurement list + * and extending the TPM PCR aggregate. Since tpm_extend can take + * long (and the tpm driver uses a mutex), we can't use the spinlock. + */ +static DEFINE_MUTEX(ima_extend_list_mutex); + +/* lookup up the digest value in the hash table, and return the entry */ +static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value) +{ + struct ima_queue_entry *qe, *ret = NULL; + unsigned int key; + struct hlist_node *pos; + int rc; + + key = ima_hash_key(digest_value); + rcu_read_lock(); + hlist_for_each_entry_rcu(qe, pos, &ima_htable.queue[key], hnext) { + rc = memcmp(qe->entry->digest, digest_value, IMA_DIGEST_SIZE); + if (rc == 0) { + ret = qe; + break; + } + } + rcu_read_unlock(); + return ret; +} + +/* ima_add_template_entry helper function: + * - Add template entry to measurement list and hash table. + * + * (Called with ima_extend_list_mutex held.) + */ +static int ima_add_digest_entry(struct ima_template_entry *entry) +{ + struct ima_queue_entry *qe; + unsigned int key; + + qe = kmalloc(sizeof(*qe), GFP_KERNEL); + if (qe == NULL) { + pr_err("OUT OF MEMORY ERROR creating queue entry.\n"); + return -ENOMEM; + } + qe->entry = entry; + + INIT_LIST_HEAD(&qe->later); + list_add_tail_rcu(&qe->later, &ima_measurements); + + atomic_long_inc(&ima_htable.len); + key = ima_hash_key(entry->digest); + hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); + return 0; +} + +static int ima_pcr_extend(const u8 *hash) +{ + int result = 0; + + if (!ima_used_chip) + return result; + + result = tpm_pcr_extend(TPM_ANY_NUM, CONFIG_IMA_MEASURE_PCR_IDX, hash); + if (result != 0) + pr_err("Error Communicating to TPM chip\n"); + return result; +} + +/* Add template entry to the measurement list and hash table, + * and extend the pcr. 
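Because the list is append-only and entries are published with list_add_tail_rcu(), readers can walk it without taking ima_extend_list_mutex. A minimal sketch of such a read-side walk, as a securityfs interface might use it; print_measurements_sketch() is a made-up helper, not part of the patch:

/* Walk the append-only measurement list under RCU and report each
 * measured file name. Entries are never removed, so the read side
 * needs no additional locking.
 */
static void print_measurements_sketch(void)
{
	struct ima_queue_entry *qe;

	rcu_read_lock();
	list_for_each_entry_rcu(qe, &ima_measurements, later)
		pr_info("measured: %s\n", qe->entry->template.file_name);
	rcu_read_unlock();
}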
+ */ +int ima_add_template_entry(struct ima_template_entry *entry, int violation, + const char *op, struct inode *inode) +{ + u8 digest[IMA_DIGEST_SIZE]; + const char *audit_cause = "hash_added"; + int audit_info = 1; + int result = 0; + + mutex_lock(&ima_extend_list_mutex); + if (!violation) { + memcpy(digest, entry->digest, sizeof digest); + if (ima_lookup_digest_entry(digest)) { + audit_cause = "hash_exists"; + goto out; + } + } + + result = ima_add_digest_entry(entry); + if (result < 0) { + audit_cause = "ENOMEM"; + audit_info = 0; + goto out; + } + + if (violation) /* invalidate pcr */ + memset(digest, 0xff, sizeof digest); + + result = ima_pcr_extend(digest); + if (result != 0) { + audit_cause = "TPM error"; + audit_info = 0; + } +out: + mutex_unlock(&ima_extend_list_mutex); + integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, entry->template_name, + op, audit_cause, result, audit_info); + return result; +} -- cgit v1.2.3-71-gd317 From 1df9f0a73178718969ae47d813b8e7aab2cf073c Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:07:02 -0500 Subject: Integrity: IMA file free imbalance The number of calls to ima_path_check()/ima_file_free() should be balanced. An extra call to fput(), indicates the file could have been accessed without first being measured. Although f_count is incremented/decremented in places other than fget/fput, like fget_light/fput_light and get_file, the current task must already hold a file refcnt. The call to __fput() is delayed until the refcnt becomes 0, resulting in ima_file_free() flagging any changes. - add hook to increment opencount for IPC shared memory(SYSV), shmat files, and /dev/zero - moved NULL iint test in opencount_get() Signed-off-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/ima.h | 6 ++++++ ipc/shm.c | 3 +++ mm/shmem.c | 2 ++ security/integrity/ima/ima.h | 2 ++ security/integrity/ima/ima_iint.c | 17 ++++++++++++++++ security/integrity/ima/ima_main.c | 42 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 72 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index dcc3664feee8..6db30a328d98 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -19,6 +19,7 @@ extern void ima_inode_free(struct inode *inode); extern int ima_path_check(struct path *path, int mask); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); +extern void ima_shm_check(struct file *file); #else static inline int ima_bprm_check(struct linux_binprm *bprm) @@ -50,5 +51,10 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) { return 0; } + +static inline void ima_shm_check(struct file *file) +{ + return; +} #endif /* CONFIG_IMA_H */ #endif /* _LINUX_IMA_H */ diff --git a/ipc/shm.c b/ipc/shm.c index 38a055758a9b..d39bd7637b1c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -381,6 +382,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; + ima_shm_check(file); id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (id < 0) { @@ -888,6 +890,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations); if (!file) goto out_free; + ima_shm_check(file); file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; diff --git a/mm/shmem.c b/mm/shmem.c index 
f1b0d4871f3a..dd5588f5d939 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -2600,6 +2601,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); + ima_shm_check(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 42706b554921..e3c16a21a38e 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -97,6 +97,7 @@ static inline unsigned long ima_hash_key(u8 *digest) /* iint cache flags */ #define IMA_MEASURED 1 +#define IMA_IINT_DUMP_STACK 512 /* integrity data associated with an inode */ struct ima_iint_cache { @@ -106,6 +107,7 @@ struct ima_iint_cache { struct mutex mutex; /* protects: version, flags, digest */ long readcount; /* measured files readcount */ long writecount; /* measured files writecount */ + long opencount; /* opens reference count */ struct kref refcount; /* ima_iint_cache reference count */ struct rcu_head rcu; }; diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c index 750db3c993a7..1f035e8d29c7 100644 --- a/security/integrity/ima/ima_iint.c +++ b/security/integrity/ima/ima_iint.c @@ -126,6 +126,7 @@ struct ima_iint_cache *ima_iint_find_insert_get(struct inode *inode) return iint; } +EXPORT_SYMBOL_GPL(ima_iint_find_insert_get); /* iint_free - called when the iint refcount goes to zero */ void iint_free(struct kref *kref) @@ -134,6 +135,21 @@ void iint_free(struct kref *kref) refcount); iint->version = 0; iint->flags = 0UL; + if (iint->readcount != 0) { + printk(KERN_INFO "%s: readcount: %ld\n", __FUNCTION__, + iint->readcount); + iint->readcount = 0; + } + if (iint->writecount != 0) { + printk(KERN_INFO "%s: writecount: %ld\n", __FUNCTION__, + iint->writecount); + iint->writecount = 0; + } + if (iint->opencount != 0) { + printk(KERN_INFO "%s: opencount: %ld\n", __FUNCTION__, + iint->opencount); + iint->opencount = 0; + } kref_set(&iint->refcount, 1); kmem_cache_free(iint_cache, iint); } @@ -174,6 +190,7 @@ static void init_once(void *foo) mutex_init(&iint->mutex); iint->readcount = 0; iint->writecount = 0; + iint->opencount = 0; kref_set(&iint->refcount, 1); } diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 871e356e8d6c..f4e7266f5aee 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -66,6 +66,19 @@ void ima_file_free(struct file *file) return; mutex_lock(&iint->mutex); + if (iint->opencount <= 0) { + printk(KERN_INFO + "%s: %s open/free imbalance (r:%ld w:%ld o:%ld f:%ld)\n", + __FUNCTION__, file->f_dentry->d_name.name, + iint->readcount, iint->writecount, + iint->opencount, atomic_long_read(&file->f_count)); + if (!(iint->flags & IMA_IINT_DUMP_STACK)) { + dump_stack(); + iint->flags |= IMA_IINT_DUMP_STACK; + } + } + iint->opencount--; + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) iint->readcount--; @@ -119,6 +132,7 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, pr_info("%s dentry_open failed\n", filename); return rc; } + iint->opencount++; iint->readcount++; rc = ima_collect_measurement(iint, file); @@ -159,6 +173,7 @@ int ima_path_check(struct path *path, int mask) return 0; mutex_lock(&iint->mutex); + iint->opencount++; if ((mask & MAY_WRITE) || (mask == 0)) iint->writecount++; else if (mask & (MAY_READ | MAY_EXEC)) @@ -219,6 +234,21 @@ out: return rc; } +static void 
opencount_get(struct file *file) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ima_iint_cache *iint; + + if (!ima_initialized || !S_ISREG(inode->i_mode)) + return; + iint = ima_iint_find_insert_get(inode); + if (!iint) + return; + mutex_lock(&iint->mutex); + iint->opencount++; + mutex_unlock(&iint->mutex); +} + /** * ima_file_mmap - based on policy, collect/store measurement. * @file: pointer to the file to be measured (May be NULL) @@ -242,6 +272,18 @@ int ima_file_mmap(struct file *file, unsigned long prot) return 0; } +/* + * ima_shm_check - IPC shm and shmat create/fput a file + * + * Maintain the opencount for these files to prevent unnecessary + * imbalance messages. + */ +void ima_shm_check(struct file *file) +{ + opencount_get(file); + return; +} + /** * ima_bprm_check - based on policy, collect/store measurement. * @bprm: contains the linux_binprm structure -- cgit v1.2.3-71-gd317 From 0a9877514c4fed10a70720293b37213dd172ee3e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 16:12:56 -0200 Subject: ring_buffer: remove unused flags parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: API change, cleanup >From ring_buffer_{lock_reserve,unlock_commit}. $ codiff /tmp/vmlinux.before /tmp/vmlinux.after linux-2.6-tip/kernel/trace/trace.c: trace_vprintk | -14 trace_graph_return | -14 trace_graph_entry | -10 trace_function | -8 __ftrace_trace_stack | -8 ftrace_trace_userstack | -8 tracing_sched_switch_trace | -8 ftrace_trace_special | -12 tracing_sched_wakeup_trace | -8 9 functions changed, 90 bytes removed, diff: -90 linux-2.6-tip/block/blktrace.c: __blk_add_trace | -1 1 function changed, 1 bytes removed, diff: -1 /tmp/vmlinux.after: 10 functions changed, 91 bytes removed, diff: -91 Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 8 +++--- include/linux/ring_buffer.h | 9 +++---- kernel/trace/kmemtrace.c | 12 +++------ kernel/trace/ring_buffer.c | 9 ++----- kernel/trace/trace.c | 56 ++++++++++++++-------------------------- kernel/trace/trace_boot.c | 12 +++------ kernel/trace/trace_branch.c | 7 +++-- kernel/trace/trace_hw_branches.c | 6 ++--- kernel/trace/trace_mmiotrace.c | 12 +++------ kernel/trace/trace_power.c | 12 +++------ 10 files changed, 51 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index d9d7146ee023..8e52f24cc8f9 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -165,7 +165,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct blk_io_trace *t; - unsigned long flags; + unsigned long flags = 0; unsigned long *sequence; pid_t pid; int cpu, pc = 0; @@ -191,7 +191,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, tracing_record_cmdline(current); event = ring_buffer_lock_reserve(blk_tr->buffer, - sizeof(*t) + pdu_len, &flags); + sizeof(*t) + pdu_len); if (!event) return; @@ -241,11 +241,11 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tr) { - ring_buffer_unlock_commit(blk_tr->buffer, event, flags); + ring_buffer_unlock_commit(blk_tr->buffer, event); if (pid != 0 && !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && (trace_flags & TRACE_ITER_STACKTRACE) != 0) - __trace_stack(blk_tr, flags, 5, pc); + __trace_stack(blk_tr, 0, 5, pc); trace_wake_up(); return; } diff --git 
a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b3b359660082..3110d92e7d81 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -74,13 +74,10 @@ void ring_buffer_free(struct ring_buffer *buffer); int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); -struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, - unsigned long length, - unsigned long *flags); +struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer, + unsigned long length); int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags); + struct ring_buffer_event *event); int ring_buffer_write(struct ring_buffer *buffer, unsigned long length, void *data); diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index f04c0625f1cd..256749d1032a 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -272,13 +272,11 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, struct ring_buffer_event *event; struct kmemtrace_alloc_entry *entry; struct trace_array *tr = kmemtrace_array; - unsigned long irq_flags; if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -292,7 +290,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -305,13 +303,11 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, struct ring_buffer_event *event; struct kmemtrace_free_entry *entry; struct trace_array *tr = kmemtrace_array; - unsigned long irq_flags; if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -322,7 +318,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374ceef..aee76b3eeed2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1257,7 +1257,6 @@ static DEFINE_PER_CPU(int, rb_need_resched); * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) - * @flags: a pointer to save the interrupt flags * * Returns a reseverd event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into @@ -1270,9 +1269,7 @@ static DEFINE_PER_CPU(int, rb_need_resched); * If NULL is returned, then nothing has been allocated or locked. 
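A minimal usage sketch of the reworked reserve/commit pair, mirroring what the kmemtrace and blktrace call sites above now do; struct sample_entry and sample_write() are made-up names, not part of the patch, and assume <linux/ring_buffer.h>:

/* Reserve room for one event, fill the payload in place, then commit.
 * If the reserve fails the event is simply dropped; with the flags
 * argument gone there is nothing left to restore.
 */
struct sample_entry {
	unsigned long	ip;
	unsigned long	value;
};

static void sample_write(struct ring_buffer *buffer, unsigned long ip,
			 unsigned long value)
{
	struct ring_buffer_event *event;
	struct sample_entry *entry;

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return;
	entry = ring_buffer_event_data(event);
	entry->ip = ip;
	entry->value = value;
	ring_buffer_unlock_commit(buffer, event);
}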
*/ struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, - unsigned long length, - unsigned long *flags) +ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -1339,15 +1336,13 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to * @event: The event pointer to commit. - * @flags: the interrupt flags received from ring_buffer_lock_reserve. * * This commits the data to the ring buffer, and releases any locks held. * * Must be paired with ring_buffer_lock_reserve. */ int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags) + struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3536ef41575d..eb453a238a6f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -783,14 +783,12 @@ trace_function(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_entry *entry; - unsigned long irq_flags; /* If we are reading the ring buffer, don't trace */ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -798,7 +796,7 @@ trace_function(struct trace_array *tr, entry->ent.type = TRACE_FN; entry->ip = ip; entry->parent_ip = parent_ip; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -809,20 +807,18 @@ static void __trace_graph_entry(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; - unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_GRAPH_ENT; entry->graph_ent = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); + ring_buffer_unlock_commit(global_trace.buffer, event); } static void __trace_graph_return(struct trace_array *tr, @@ -832,20 +828,18 @@ static void __trace_graph_return(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; - unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_GRAPH_RET; entry->ret = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -866,10 +860,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, 
sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -884,7 +876,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -912,13 +904,11 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; - unsigned long irq_flags; if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -933,7 +923,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -950,10 +940,8 @@ ftrace_trace_special(void *__tr, struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -962,9 +950,9 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, irq_flags, 4, pc); - ftrace_trace_userstack(tr, irq_flags, pc); + ring_buffer_unlock_commit(tr->buffer, event); + ftrace_trace_stack(tr, 0, 4, pc); + ftrace_trace_userstack(tr, 0, pc); trace_wake_up(); } @@ -984,10 +972,8 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct ring_buffer_event *event; struct ctx_switch_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -1000,7 +986,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 5, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1013,10 +999,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, { struct ring_buffer_event *event; struct ctx_switch_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -1029,7 +1013,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_prio = wakee->prio; entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); @@ -2841,7 +2825,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) trace_buf[len] = 0; size = sizeof(*entry) + len + 1; - event = ring_buffer_lock_reserve(tr->buffer, size, 
&irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, size); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -2852,7 +2836,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: spin_unlock_irqrestore(&trace_buf_lock, irq_flags); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1f07895977a0..4e08debf662d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -132,7 +132,6 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { struct ring_buffer_event *event; struct trace_boot_call *entry; - unsigned long irq_flags; struct trace_array *tr = boot_trace; if (!tr || !pre_initcalls_finished) @@ -144,15 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_BOOT_CALL; entry->boot_call = *bt; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -164,7 +162,6 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { struct ring_buffer_event *event; struct trace_boot_ret *entry; - unsigned long irq_flags; struct trace_array *tr = boot_trace; if (!tr || !pre_initcalls_finished) @@ -173,15 +170,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_BOOT_RET; entry->boot_ret = *bt; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 027e83690615..770e52acfc10 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -33,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; - unsigned long flags, irq_flags; + unsigned long flags; int cpu, pc; const char *p; @@ -52,8 +52,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; @@ -75,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index fff3545fc866..e720c001db2b 100644 --- a/kernel/trace/trace_hw_branches.c +++ 
b/kernel/trace/trace_hw_branches.c @@ -175,7 +175,7 @@ void trace_hw_branch(u64 from, u64 to) struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; - unsigned long irq1, irq2; + unsigned long irq1; int cpu; if (unlikely(!tr)) @@ -189,7 +189,7 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -198,7 +198,7 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.cpu = cpu; entry->from = from; entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq2); + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ec78e244242e..104ddebc11d1 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,10 +307,8 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) { atomic_inc(&dropped_count); return; @@ -319,7 +317,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_RW; entry->rw = *rw; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -337,10 +335,8 @@ static void __trace_mmiotrace_map(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) { atomic_inc(&dropped_count); return; @@ -349,7 +345,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_MAP; entry->map = *map; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index faa6ab7a1f5c..3b1a292d12d2 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -115,7 +115,6 @@ void trace_power_end(struct power_trace *it) struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; - unsigned long irq_flags; struct trace_array *tr = power_trace; if (!trace_power_enabled) @@ -125,15 +124,14 @@ void trace_power_end(struct power_trace *it) it->end = ktime_get(); data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -148,7 +146,6 @@ void trace_power_mark(struct power_trace *it, unsigned int type, struct ring_buffer_event *event; struct trace_power *entry; struct 
trace_array_cpu *data; - unsigned long irq_flags; struct trace_array *tr = power_trace; if (!trace_power_enabled) @@ -162,15 +159,14 @@ void trace_power_mark(struct power_trace *it, unsigned int type, it->end = it->stamp; data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); -- cgit v1.2.3-71-gd317 From 33dccbb050bbe35b88ca8cf1228dcf3e4d4b3554 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 5 Feb 2009 21:25:32 -0800 Subject: tun: Limit amount of queued packets per device Unlike a normal socket path, the tuntap device send path does not have any accounting. This means that the user-space sender may be able to pin down arbitrary amounts of kernel memory by continuing to send data to an end-point that is congested. Even when this isn't an issue because of limited queueing at most end points, this can also be a problem because its only response to congestion is packet loss. That is, when those local queues at the end-point fills up, the tuntap device will start wasting system time because it will continue to send data there which simply gets dropped straight away. Of course one could argue that everybody should do congestion control end-to-end, unfortunately there are people in this world still hooked on UDP, and they don't appear to be going away anywhere fast. In fact, we've always helped them by performing accounting in our UDP code, the sole purpose of which is to provide congestion feedback other than through packet loss. This patch attempts to apply the same bandaid to the tuntap device. It creates a pseudo-socket object which is used to account our packets just as a normal socket does for UDP. Of course things are a little complex because we're actually reinjecting traffic back into the stack rather than out of the stack. The stack complexities however should have been resolved by preceding patches. So this one can simply start using skb_set_owner_w. For now the accounting is essentially disabled by default for backwards compatibility. In particular, we set the cap to INT_MAX. This is so that existing applications don't get confused by the sudden arrival EAGAIN errors. In future we may wish (or be forced to) do this by default. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 167 +++++++++++++++++++++++++++++++++---------------- fs/compat_ioctl.c | 2 + include/linux/if_tun.h | 2 + 3 files changed, 118 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 15d67635bb10..0476549841ac 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -95,6 +96,8 @@ struct tun_file { wait_queue_head_t read_wait; }; +struct tun_sock; + struct tun_struct { struct tun_file *tfile; unsigned int flags; @@ -107,12 +110,24 @@ struct tun_struct { struct fasync_struct *fasync; struct tap_filter txflt; + struct sock *sk; + struct socket socket; #ifdef TUN_DEBUG int debug; #endif }; +struct tun_sock { + struct sock sk; + struct tun_struct *tun; +}; + +static inline struct tun_sock *tun_sk(struct sock *sk) +{ + return container_of(sk, struct tun_sock, sk); +} + static int tun_attach(struct tun_struct *tun, struct file *file) { struct tun_file *tfile = file->private_data; @@ -461,7 +476,8 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = __tun_get(tfile); - unsigned int mask = POLLOUT | POLLWRNORM; + struct sock *sk = tun->sk; + unsigned int mask = 0; if (!tun) return POLLERR; @@ -473,6 +489,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait) if (!skb_queue_empty(&tun->readq)) mask |= POLLIN | POLLRDNORM; + if (sock_writeable(sk) || + (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) && + sock_writeable(sk))) + mask |= POLLOUT | POLLWRNORM; + if (tun->dev->reg_state != NETREG_REGISTERED) mask = POLLERR; @@ -482,66 +503,35 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait) /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ -static struct sk_buff *tun_alloc_skb(size_t prepad, size_t len, size_t linear, - gfp_t gfp) +static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun, + size_t prepad, size_t len, + size_t linear, int noblock) { + struct sock *sk = tun->sk; struct sk_buff *skb; - unsigned int i; - - skb = alloc_skb(prepad + len, gfp|__GFP_NOWARN); - if (skb) { - skb_reserve(skb, prepad); - skb_put(skb, len); - return skb; - } + int err; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE) - return NULL; + linear = len; - /* Start with a normal skb, and add pages. */ - skb = alloc_skb(prepad + linear, gfp); + skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, + &err); if (!skb) - return NULL; + return ERR_PTR(err); skb_reserve(skb, prepad); skb_put(skb, linear); - - len -= linear; - - for (i = 0; i < MAX_SKB_FRAGS; i++) { - skb_frag_t *f = &skb_shinfo(skb)->frags[i]; - - f->page = alloc_page(gfp|__GFP_ZERO); - if (!f->page) - break; - - f->page_offset = 0; - f->size = PAGE_SIZE; - - skb->data_len += PAGE_SIZE; - skb->len += PAGE_SIZE; - skb->truesize += PAGE_SIZE; - skb_shinfo(skb)->nr_frags++; - - if (len < PAGE_SIZE) { - len = 0; - break; - } - len -= PAGE_SIZE; - } - - /* Too large, or alloc fail? 
*/ - if (unlikely(len)) { - kfree_skb(skb); - skb = NULL; - } + skb->data_len = len - linear; + skb->len += len - linear; return skb; } /* Get packet from user space buffer */ -static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count) +static __inline__ ssize_t tun_get_user(struct tun_struct *tun, + struct iovec *iv, size_t count, + int noblock) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; @@ -573,9 +563,11 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, return -EINVAL; } - if (!(skb = tun_alloc_skb(align, len, gso.hdr_len, GFP_KERNEL))) { - tun->dev->stats.rx_dropped++; - return -ENOMEM; + skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) != -EAGAIN) + tun->dev->stats.rx_dropped++; + return PTR_ERR(skb); } if (skb_copy_datagram_from_iovec(skb, 0, iv, len)) { @@ -661,7 +653,8 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, unsigned long count, loff_t pos) { - struct tun_struct *tun = tun_get(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct tun_struct *tun = file->private_data; ssize_t result; if (!tun) @@ -669,7 +662,8 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count); - result = tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count)); + result = tun_get_user(tun, (struct iovec *)iv, iov_length(iv, count), + file->f_flags & O_NONBLOCK); tun_put(tun); return result; @@ -828,11 +822,40 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = { .validate = tun_validate, }; +static void tun_sock_write_space(struct sock *sk) +{ + struct tun_struct *tun; + + if (!sock_writeable(sk)) + return; + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_sync(sk->sk_sleep); + + if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + return; + + tun = container_of(sk, struct tun_sock, sk)->tun; + kill_fasync(&tun->fasync, SIGIO, POLL_OUT); +} + +static void tun_sock_destruct(struct sock *sk) +{ + dev_put(container_of(sk, struct tun_sock, sk)->tun->dev); +} + +static struct proto tun_proto = { + .name = "tun", + .owner = THIS_MODULE, + .obj_size = sizeof(struct tun_sock), +}; static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { + struct sock *sk; struct tun_struct *tun; struct net_device *dev; + struct tun_file *tfile = file->private_data; int err; dev = __dev_get_by_name(net, ifr->ifr_name); @@ -885,14 +908,31 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->flags = flags; tun->txflt.count = 0; + err = -ENOMEM; + sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto); + if (!sk) + goto err_free_dev; + + /* This ref count is for tun->sk. 
*/ + dev_hold(dev); + sock_init_data(&tun->socket, sk); + sk->sk_write_space = tun_sock_write_space; + sk->sk_destruct = tun_sock_destruct; + sk->sk_sndbuf = INT_MAX; + sk->sk_sleep = &tfile->read_wait; + + tun->sk = sk; + container_of(sk, struct tun_sock, sk)->tun = tun; + tun_net_init(dev); if (strchr(dev->name, '%')) { err = dev_alloc_name(dev, dev->name); if (err < 0) - goto err_free_dev; + goto err_free_sk; } + err = -EINVAL; err = register_netdevice(tun->dev); if (err < 0) goto err_free_dev; @@ -928,6 +968,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) strcpy(ifr->ifr_name, tun->dev->name); return 0; + err_free_sk: + sock_put(sk); err_free_dev: free_netdev(dev); failed: @@ -1012,6 +1054,7 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file, struct tun_struct *tun; void __user* argp = (void __user*)arg; struct ifreq ifr; + int sndbuf; int ret; if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) @@ -1151,6 +1194,22 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file, ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr); rtnl_unlock(); break; + + case TUNGETSNDBUF: + sndbuf = tun->sk->sk_sndbuf; + if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) + ret = -EFAULT; + break; + + case TUNSETSNDBUF: + if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) { + ret = -EFAULT; + break; + } + + tun->sk->sk_sndbuf = sndbuf; + break; + default: ret = -EINVAL; break; @@ -1218,8 +1277,10 @@ static int tun_chr_close(struct inode *inode, struct file *file) __tun_detach(tun); /* If desireable, unregister the netdevice. */ - if (!(tun->flags & TUN_PERSIST)) + if (!(tun->flags & TUN_PERSIST)) { + sock_put(tun->sk); unregister_netdevice(tun->dev); + } rtnl_unlock(); } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c8f8d5904f5e..c03c10d7fb6b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1988,6 +1988,8 @@ COMPATIBLE_IOCTL(TUNSETGROUP) COMPATIBLE_IOCTL(TUNGETFEATURES) COMPATIBLE_IOCTL(TUNSETOFFLOAD) COMPATIBLE_IOCTL(TUNSETTXFILTER) +COMPATIBLE_IOCTL(TUNGETSNDBUF) +COMPATIBLE_IOCTL(TUNSETSNDBUF) /* Big V */ COMPATIBLE_IOCTL(VT_SETMODE) COMPATIBLE_IOCTL(VT_GETMODE) diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 8529f57ba263..049d6c9428db 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -46,6 +46,8 @@ #define TUNSETOFFLOAD _IOW('T', 208, unsigned int) #define TUNSETTXFILTER _IOW('T', 209, unsigned int) #define TUNGETIFF _IOR('T', 210, unsigned int) +#define TUNGETSNDBUF _IOR('T', 211, int) +#define TUNSETSNDBUF _IOW('T', 212, int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3-71-gd317 From fe2918b098cdbf55b69ba8762bd3de0ae64f33ff Mon Sep 17 00:00:00 2001 From: Graf Yang Date: Thu, 5 Feb 2009 21:26:19 -0800 Subject: net: fix some trailing whitespaces Signed-off-by: Graf Yang Signed-off-by: Bryan Wu Signed-off-by: David S. Miller --- include/linux/if_ether.h | 8 ++++---- include/linux/netdevice.h | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index 7f3c735f422b..0216e1bdbc56 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -17,7 +17,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ - + #ifndef _LINUX_IF_ETHER_H #define _LINUX_IF_ETHER_H @@ -25,7 +25,7 @@ /* * IEEE 802.3 Ethernet magic constants. The frame sizes omit the preamble - * and FCS/CRC (frame check sequence). 
+ * and FCS/CRC (frame check sequence). */ #define ETH_ALEN 6 /* Octets in one ethernet addr */ @@ -83,7 +83,7 @@ /* * Non DIX types. Won't clash for 1500 types. */ - + #define ETH_P_802_3 0x0001 /* Dummy type for 802.3 frames */ #define ETH_P_AX25 0x0002 /* Dummy protocol id for AX.25 */ #define ETH_P_ALL 0x0003 /* Every packet (be careful!!!) */ @@ -109,7 +109,7 @@ /* * This is an Ethernet frame header. */ - + struct ethhdr { unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ unsigned char h_source[ETH_ALEN]; /* source ether addr */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7a5057fbb7cd..864519e585fc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -96,7 +96,7 @@ struct wireless_dev; * Compute the worst case header length according to the protocols * used. */ - + #if defined(CONFIG_WLAN_80211) || defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 @@ -124,7 +124,7 @@ struct wireless_dev; * Network device statistics. Akin to the 2.0 ether stats but * with byte counters. */ - + struct net_device_stats { unsigned long rx_packets; /* total packets received */ @@ -285,7 +285,7 @@ enum netdev_state_t /* * This structure holds at boot time configured netdevice settings. They - * are then used in the device probing. + * are then used in the device probing. */ struct netdev_boot_setup { char name[IFNAMSIZ]; @@ -740,7 +740,7 @@ struct net_device void *dsa_ptr; /* dsa specific data */ #endif void *atalk_ptr; /* AppleTalk link */ - void *ip_ptr; /* IPv4 specific data */ + void *ip_ptr; /* IPv4 specific data */ void *dn_ptr; /* DECnet specific data */ void *ip6_ptr; /* IPv6 specific data */ void *ec_ptr; /* Econet specific data */ @@ -753,7 +753,7 @@ struct net_device */ unsigned long last_rx; /* Time of last Rx */ /* Interface address info used in eth_type_trans() */ - unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast + unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast because most packets are unicast) */ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ -- cgit v1.2.3-71-gd317 From 78d904b46a72fcf15ea6a39672bbef92953876b5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 18:43:07 -0500 Subject: ring-buffer: add NMI protection for spinlocks Impact: prevent deadlock in NMI The ring buffers are not yet totally lockless with writing to the buffer. When a writer crosses a page, it grabs a per cpu spinlock to protect against a reader. The spinlocks taken by a writer are not to protect against other writers, since a writer can only write to its own per cpu buffer. The spinlocks protect against readers that can touch any cpu buffer. The writers are made to be reentrant with the spinlocks disabling interrupts. The problem arises when an NMI writes to the buffer, and that write crosses a page boundary. If it grabs a spinlock, it can be racing with another writer (since disabling interrupts does not protect against NMIs) or with a reader on the same CPU. Luckily, most of the users are not reentrant, which protects against this issue. But if a user of the ring buffer becomes reentrant (which is what the ring buffers do allow) and the NMI also writes to the ring buffer, then we risk a deadlock. This patch moves the ftrace_nmi_enter called by nmi_enter() to the ring buffer code. It replaces the current ftrace_nmi_enter that is used by arch specific code to arch_ftrace_nmi_enter and updates the Kconfig to handle it.
When an NMI is called, it will set a per cpu variable in the ring buffer code and will clear it when the NMI exits. If a write to the ring buffer crosses page boundaries inside an NMI, a trylock is used on the spin lock instead. If the spinlock fails to be acquired, then the entry is discarded. This bug appeared in the ftrace work in the RT tree, where event tracing is reentrant. This workaround solved the deadlocks that appeared there. Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 8 ++++---- include/linux/ftrace_irq.h | 10 +++++++++- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 68 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73f7fe8fd4d1..a6be725cb049 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST + select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4d33224c055f..4c683587055b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); /* Must have in_nmi seen before reading write flag */ @@ -124,7 +124,7 @@ void ftrace_nmi_enter(void) } } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { /* Finish all executions before clearing in_nmi */ smp_wmb(); @@ -376,12 +376,12 @@ int ftrace_disable_ftrace_graph_caller(void) */ static atomic_t in_nmi; -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { atomic_dec(&in_nmi); } diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 366a054d0b05..29de6779a963 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,15 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) +#ifdef CONFIG_FTRACE_NMI_ENTER +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + +#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 28f2644484d9..25131a5d5e4f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_FTRACE_NMI_ENTER + bool + config HAVE_FUNCTION_TRACER bool @@ -37,6 +40,11 @@ config TRACER_MAX_TRACE config RING_BUFFER bool +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + config TRACING bool select DEBUG_FS diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374ceef..a60a6a852f42 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -18,6 +19,35 @@ #include "trace.h" +/* + * Since the write to the buffer is 
still not fully lockless, + * we must be careful with NMIs. The locks in the writers + * are taken when a write crosses to a new page. The locks + * protect against races with the readers (this will soon + * be fixed with a lockless solution). + * + * Because we can not protect against NMIs, and we want to + * keep traces reentrant, we need to manage what happens + * when we are in an NMI. + */ +static DEFINE_PER_CPU(int, rb_in_nmi); + +void ftrace_nmi_enter(void) +{ + __get_cpu_var(rb_in_nmi)++; + /* call arch specific handler too */ + arch_ftrace_nmi_enter(); +} + +void ftrace_nmi_exit(void) +{ + arch_ftrace_nmi_exit(); + __get_cpu_var(rb_in_nmi)--; + /* NMIs are not recursive */ + WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); +} + + /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + bool lock_taken = false; commit_page = cpu_buffer->commit_page; /* we just need to protect against interrupts */ @@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page = tail_page; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + /* + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) + goto out_unlock; + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; rb_inc_page(cpu_buffer, &next_page); @@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); - __raw_spin_unlock(&cpu_buffer->lock); + if (likely(lock_taken)) + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; } -- cgit v1.2.3-71-gd317 From d8b891a2db13c8ed296158d6f8c4e335896d0cef Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 19:54:51 -0500 Subject: ring-buffer: allow tracing_off to be used in core kernel code tracing_off() is the fastest way to stop recording to the ring buffers. This may be used in places like panic and die, just before the ftrace_dump is called. This patch adds the appropriate CPP conditionals to make it a stub function when the ring buffer is not configured it. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b3b359660082..ac94c066f6e9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -124,9 +124,18 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +/* + * The below functions are fine to use outside the tracing facility. 
+ */ +#ifdef CONFIG_RING_BUFFER void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +#else +static inline void tracing_on(void) { } +static inline void tracing_off(void) { } +static inline void tracing_off_permanent(void) { } +#endif void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); -- cgit v1.2.3-71-gd317 From 375b38b4214f29109a393ab762d468054bf52354 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 00:51:37 -0500 Subject: nmi: add generic nmi tracking state This code adds an in_nmi() macro that uses the current tasks preempt count to track when it is in NMI context. Other parts of the kernel can use this to determine if the context is in NMI context or not. This code was inspired by the -rt patch in_nmi version that was written by Peter Zijlstra, who borrowed that code from Mathieu Desnoyers. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- include/linux/hardirq.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f83288347dda..f3cf86e1465b 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -61,6 +61,12 @@ #error PREEMPT_ACTIVE is too low! #endif +#define NMI_OFFSET (PREEMPT_ACTIVE << 1) + +#if NMI_OFFSET >= 0x80000000 +#error PREEMPT_ACTIVE too high! +#endif + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) @@ -73,6 +79,11 @@ #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) +/* + * Are we in NMI context? + */ +#define in_nmi() (preempt_count() & NMI_OFFSET) + #if defined(CONFIG_PREEMPT) # define PREEMPT_INATOMIC_BASE kernel_locked() # define PREEMPT_CHECK_OFFSET 1 @@ -167,6 +178,8 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ ftrace_nmi_enter(); \ + BUG_ON(in_nmi()); \ + add_preempt_count(NMI_OFFSET); \ lockdep_off(); \ rcu_nmi_enter(); \ __irq_enter(); \ @@ -177,6 +190,8 @@ extern void irq_exit(void); __irq_exit(); \ rcu_nmi_exit(); \ lockdep_on(); \ + BUG_ON(!in_nmi()); \ + sub_preempt_count(NMI_OFFSET); \ ftrace_nmi_exit(); \ } while (0) -- cgit v1.2.3-71-gd317 From a81bd80a0b0a405dc0483e2c428332d69da2c79f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 01:45:16 -0500 Subject: ring-buffer: use generic version of in_nmi Impact: clean up Now that a generic in_nmi is available, this patch removes the special code in the ring_buffer and implements the in_nmi generic version instead. With this change, I was also able to rename the "arch_ftrace_nmi_enter" back to "ftrace_nmi_enter" and remove the code from the ring buffer. 
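To see how the generic helper gets used, here is a minimal sketch of the writer-side locking pattern that the hardirq.h change above enables and that the ring-buffer diff below adopts. It is illustrative only, not code from these patches; the helper name try_lock_buffer_page() is invented, while in_nmi(), raw_spinlock_t and the __raw_spin_lock()/__raw_spin_trylock() primitives are the ones referenced in the surrounding patches.

#include <linux/hardirq.h>	/* in_nmi(), built on the NMI_OFFSET added above */
#include <linux/spinlock.h>

/*
 * Sketch: a per-cpu lock may already be held by the code this NMI
 * interrupted on the same CPU, so spinning inside the NMI would
 * deadlock. In NMI context we therefore only trylock and let the
 * caller discard the event on contention.
 */
static int try_lock_buffer_page(raw_spinlock_t *lock)
{
	if (unlikely(in_nmi()))
		return __raw_spin_trylock(lock);	/* may fail: caller bails out */

	__raw_spin_lock(lock);
	return 1;					/* always succeeds here */
}

A writer calls this where it previously took the lock unconditionally; a zero return means the reservation is abandoned instead of risking a deadlock.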
Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ++-- include/linux/ftrace_irq.h | 8 -------- kernel/trace/ring_buffer.c | 43 +++++++++++++------------------------------ 3 files changed, 15 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 918073c6681b..d74d75e0952d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void arch_ftrace_nmi_enter(void) +void ftrace_nmi_enter(void) { atomic_inc(&nmi_running); /* Must have nmi_running seen before reading write flag */ @@ -124,7 +124,7 @@ void arch_ftrace_nmi_enter(void) } } -void arch_ftrace_nmi_exit(void) +void ftrace_nmi_exit(void) { /* Finish all executions before clearing nmi_running */ smp_wmb(); diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 29de6779a963..dca7bf8cffe2 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -3,14 +3,6 @@ #ifdef CONFIG_FTRACE_NMI_ENTER -extern void arch_ftrace_nmi_enter(void); -extern void arch_ftrace_nmi_exit(void); -#else -static inline void arch_ftrace_nmi_enter(void) { } -static inline void arch_ftrace_nmi_exit(void) { } -#endif - -#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a60a6a852f42..5ee344417cd5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -19,35 +20,6 @@ #include "trace.h" -/* - * Since the write to the buffer is still not fully lockless, - * we must be careful with NMIs. The locks in the writers - * are taken when a write crosses to a new page. The locks - * protect against races with the readers (this will soon - * be fixed with a lockless solution). - * - * Because we can not protect against NMIs, and we want to - * keep traces reentrant, we need to manage what happens - * when we are in an NMI. - */ -static DEFINE_PER_CPU(int, rb_in_nmi); - -void ftrace_nmi_enter(void) -{ - __get_cpu_var(rb_in_nmi)++; - /* call arch specific handler too */ - arch_ftrace_nmi_enter(); -} - -void ftrace_nmi_exit(void) -{ - arch_ftrace_nmi_exit(); - __get_cpu_var(rb_in_nmi)--; - /* NMIs are not recursive */ - WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); -} - - /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -1027,12 +999,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, local_irq_save(flags); /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * * NMIs can happen after we take the lock. * If we are in an NMI, only take the lock * if it is not already taken. Otherwise * simply fail. 
*/ - if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (unlikely(in_nmi())) { if (!__raw_spin_trylock(&cpu_buffer->lock)) goto out_unlock; } else -- cgit v1.2.3-71-gd317 From 57794a9d48b63e34acbe63282628c9f029603308 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Fri, 6 Feb 2009 17:33:27 +0800 Subject: trace: trivial fixes in comment typos. Impact: clean up Fixed several typos in the comments. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7840e718c6c7..5e302d636fc2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -140,7 +140,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } #endif /** - * ftrace_make_nop - convert code into top + * ftrace_make_nop - convert code into nop * @mod: module structure if called by module load initialization * @rec: the mcount call site record * @addr: the address that the call site should be calling diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 68610031780b..1796e018fbff 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,7 +465,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * it is not enabled then do nothing. * * If this record is not to be traced and - * it is enabled then disabled it. + * it is enabled then disable it. * */ if (rec->flags & FTRACE_FL_NOTRACE) { @@ -485,7 +485,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) return 0; - /* Record is not filtered and is not enabled do nothing */ + /* Record is not filtered or enabled, do nothing */ if (!fl) return 0; @@ -507,7 +507,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) } else { - /* if record is not enabled do nothing */ + /* if record is not enabled, do nothing */ if (!(rec->flags & FTRACE_FL_ENABLED)) return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5efc4c707f7e..f92aba52a894 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -616,12 +616,12 @@ extern struct tracer nop_trace; * preempt_enable (after a disable), a schedule might take place * causing an infinite recursion. * - * To prevent this, we read the need_recshed flag before + * To prevent this, we read the need_resched flag before * disabling preemption. When we want to enable preemption we * check the flag, if it is set, then we call preempt_enable_no_resched. * Otherwise, we call preempt_enable. * - * The rational for doing the above is that if need resched is set + * The rational for doing the above is that if need_resched is set * and we have yet to reschedule, we are either in an atomic location * (where we do not need to check for scheduling) or we are inside * the scheduler and do not want to resched. @@ -642,7 +642,7 @@ static inline int ftrace_preempt_disable(void) * * This is a scheduler safe way to enable preemption and not miss * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we were either inside an atomic or + * If resched is set, then we are either inside an atomic or * are inside the scheduler (we would have already scheduled * otherwise). In this case, we do not want to call normal * preempt_enable, but preempt_enable_no_resched instead. 
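The trace.h comment above describes ftrace's scheduler-safe way of toggling preemption; a simplified sketch of that pattern follows. It is modelled on the ftrace_preempt_disable()/ftrace_preempt_enable() helpers the comment documents, but the names trace_safe_preempt_disable()/trace_safe_preempt_enable() are placeholders and the bodies are a reduction, not a verbatim copy.

#include <linux/preempt.h>
#include <linux/sched.h>	/* need_resched() */

/* Sample need_resched() *before* disabling preemption and hand the
 * result back to the caller. */
static inline int trace_safe_preempt_disable(void)
{
	int resched = need_resched();

	preempt_disable_notrace();
	return resched;
}

/* If a reschedule was already pending, we may be in an atomic section
 * or inside the scheduler, so re-enable without another resched check;
 * otherwise use the normal preempt_enable path. */
static inline void trace_safe_preempt_enable(int resched)
{
	if (resched)
		preempt_enable_no_resched_notrace();
	else
		preempt_enable_notrace();
}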
-- cgit v1.2.3-71-gd317 From d6301d3dd1c287b32132dda15272a50c11e92a14 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 8 Feb 2009 19:24:13 -0800 Subject: net: Increase default NET_SKB_PAD to 32. Several devices need to insert some "pre headers" in front of the main packet data when they transmit a packet. Currently we allocate only 16 bytes of pad room and this ends up not being enough for some types of hardware (NIU, usb-net, s390 qeth, etc.) So increase this to 32. Note that drivers still need to check in their transmit routine whether enough headroom exists, and if not use skb_realloc_headroom(). Tunneling, IPSEC, and other encapsulation methods can cause the padding area to be used up. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 08670d017479..5eba4007e07f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1287,7 +1287,7 @@ static inline int skb_network_offset(const struct sk_buff *skb) * The networking layer reserves some headroom in skb data (via * dev_alloc_skb). This is used to avoid having to reallocate skb data when * the header has to grow. In the default case, if the header has to grow - * 16 bytes or less we avoid the reallocation. + * 32 bytes or less we avoid the reallocation. * * Unfortunately this headroom changes the DMA alignment of the resulting * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive @@ -1295,11 +1295,11 @@ static inline int skb_network_offset(const struct sk_buff *skb) * perhaps setting it to a cacheline in size (since that will maintain * cacheline alignment of the DMA). It must be a power of 2. * - * Various parts of the networking layer expect at least 16 bytes of + * Various parts of the networking layer expect at least 32 bytes of * headroom, you should not reduce this. */ #ifndef NET_SKB_PAD -#define NET_SKB_PAD 16 +#define NET_SKB_PAD 32 #endif extern int ___pskb_trim(struct sk_buff *skb, unsigned int len); -- cgit v1.2.3-71-gd317 From 4ae5544f9a33e4ae306e337f96951eb3ff2df6d9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 8 Feb 2009 18:00:36 +0000 Subject: gro: Remember number of held packets instead of counting every time This patch prepares for the move of the same_flow checks out of dev_gro_receive. As such we need to remember the number of held packets since doing a loop just to count them every time is silly. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 12 +++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 864519e585fc..9ee344bc6c13 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -314,6 +314,9 @@ struct napi_struct { spinlock_t poll_lock; int poll_owner; #endif + + unsigned int gro_count; + struct net_device *dev; struct list_head dev_list; struct sk_buff *gro_list; diff --git a/net/core/dev.c b/net/core/dev.c index 709a9a922258..ae0b66936abe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2372,6 +2372,7 @@ void napi_gro_flush(struct napi_struct *napi) napi_gro_complete(skb); } + napi->gro_count = 0; napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); @@ -2402,7 +2403,6 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) struct packet_type *ptype; __be16 type = skb->protocol; struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; - int count = 0; int same_flow; int mac_len; int ret; @@ -2430,8 +2430,6 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) NAPI_GRO_CB(skb)->free = 0; for (p = napi->gro_list; p; p = p->next) { - count++; - if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -2457,15 +2455,16 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) *pp = nskb->next; nskb->next = NULL; napi_gro_complete(nskb); - count--; + napi->gro_count--; } if (same_flow) goto ok; - if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) + if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) goto normal; + napi->gro_count++; NAPI_GRO_CB(skb)->count = 1; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; @@ -2713,6 +2712,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); + napi->gro_count = 0; napi->gro_list = NULL; napi->skb = NULL; napi->poll = poll; @@ -2741,6 +2741,7 @@ void netif_napi_del(struct napi_struct *napi) } napi->gro_list = NULL; + napi->gro_count = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -5246,6 +5247,7 @@ static int __init net_dev_init(void) queue->backlog.poll = process_backlog; queue->backlog.weight = weight_p; queue->backlog.gro_list = NULL; + queue->backlog.gro_count = 0; } dev_boot_phase = 0; -- cgit v1.2.3-71-gd317 From aa4b9f533ed5a22952e038b9fac2447ccc682124 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 8 Feb 2009 18:00:37 +0000 Subject: gro: Optimise Ethernet header comparison This patch optimises the Ethernet header comparison to use 2-byte and 4-byte xors instead of memcmp. In order to facilitate this, the actual comparison is now carried out by the callers of the shared dev_gro_receive function. This has a significant impact when receiving 1500B packets through 10GbE. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/etherdevice.h | 21 +++++++++++++++++++++ include/linux/netdevice.h | 7 +++++++ net/8021q/vlan_core.c | 4 +++- net/core/dev.c | 23 ++--------------------- 4 files changed, 33 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 1cb0f0b90926..a1f17abba7dc 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -184,4 +184,25 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2], } #endif /* __KERNEL__ */ +/** + * compare_ether_header - Compare two Ethernet headers + * @a: Pointer to Ethernet header + * @b: Pointer to Ethernet header + * + * Compare two ethernet headers, returns 0 if equal. + * This assumes that the network header (i.e., IP header) is 4-byte + * aligned OR the platform can handle unaligned access. This is the + * case for all packets coming into netif_receive_skb or similar + * entry points. + */ + +static inline int compare_ether_header(const void *a, const void *b) +{ + u32 *a32 = (u32 *)((u8 *)a + 2); + u32 *b32 = (u32 *)((u8 *)b + 2); + + return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | + (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); +} + #endif /* _LINUX_ETHERDEVICE_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ee344bc6c13..355662aac940 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1117,6 +1117,13 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->data_offset = 0; } +static inline void *skb_gro_mac_header(struct sk_buff *skb) +{ + return skb_mac_header(skb) < skb->data ? skb_mac_header(skb) : + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; +} + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 378fa69d625a..70435af153f2 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -85,7 +85,9 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, goto drop; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = p->dev == skb->dev; + NAPI_GRO_CB(p)->same_flow = + p->dev == skb->dev && !compare_ether_header( + skb_mac_header(p), skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } diff --git a/net/core/dev.c b/net/core/dev.c index ae0b66936abe..1e27a67df242 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -215,13 +215,6 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; } -static inline void *skb_gro_mac_header(struct sk_buff *skb) -{ - return skb_mac_header(skb) < skb->data ? 
skb_mac_header(skb) : - page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset; -} - /* Device list insertion */ static int list_netdevice(struct net_device *dev) { @@ -2415,29 +2408,16 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { - struct sk_buff *p; - void *mac; - if (ptype->type != type || ptype->dev || !ptype->gro_receive) continue; skb_set_network_header(skb, skb_gro_offset(skb)); - mac = skb_gro_mac_header(skb); mac_len = skb->network_header - skb->mac_header; skb->mac_len = mac_len; NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; - for (p = napi->gro_list; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - if (p->mac_len != mac_len || - memcmp(skb_mac_header(p), mac, mac_len)) - NAPI_GRO_CB(p)->same_flow = 0; - } - pp = ptype->gro_receive(&napi->gro_list, skb); break; } @@ -2492,7 +2472,8 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) struct sk_buff *p; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = 1; + NAPI_GRO_CB(p)->same_flow = !compare_ether_header( + skb_mac_header(p), skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } -- cgit v1.2.3-71-gd317 From 1292211058aaf872eeb2a0e2677d237916b4501f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 22:16:12 +0100 Subject: tracing/power: move the power trace headers to a dedicated file Impact: cleanup Move the power tracer headers to trace/power.h to keep ftrace.h and power bits more easy to maintain as separated topics. Signed-off-by: Frederic Weisbecker Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- arch/x86/kernel/process.c | 2 +- include/linux/ftrace.h | 30 ------------------------- include/trace/power.h | 35 ++++++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_power.c | 2 +- 6 files changed, 39 insertions(+), 33 deletions(-) create mode 100644 include/trace/power.h (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..7ed925edf4d2 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e68bb9e30864..026819ffcb0c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5e302d636fc2..106b7909d500 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -339,36 +339,6 @@ ftrace_init_module(struct module *mod, unsigned long *start, unsigned long *end) { } #endif -enum { - POWER_NONE = 0, - POWER_CSTATE = 1, - POWER_PSTATE = 2, -}; - -struct power_trace { -#ifdef CONFIG_POWER_TRACER - ktime_t stamp; - ktime_t end; - int type; - int state; -#endif -}; - -#ifdef CONFIG_POWER_TRACER -extern void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_end(struct power_trace *it); -#else -static inline void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state) { } 
-static inline void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_end(struct power_trace *it) { } -#endif - - /* * Structure that defines an entry function trace. */ diff --git a/include/trace/power.h b/include/trace/power.h new file mode 100644 index 000000000000..c7cefbcdaea4 --- /dev/null +++ b/include/trace/power.h @@ -0,0 +1,35 @@ +#ifndef _TRACE_POWER_H +#define _TRACE_POWER_H + +#include + +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; + +struct power_trace { +#ifdef CONFIG_POWER_TRACER + ktime_t stamp; + ktime_t end; + int type; + int state; +#endif +}; + +#ifdef CONFIG_POWER_TRACER +extern void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_end(struct power_trace *it); +#else +static inline void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_end(struct power_trace *it) { } +#endif + +#endif /* _TRACE_POWER_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a011ec062225..1ecfb9d2b365 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -10,6 +10,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index bfc21f8079ab..b1d0d087d3a6 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include -- cgit v1.2.3-71-gd317 From f130347c2dd8e7ce0757cd3cf80bedbc6ed63c4c Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 30 Jan 2009 09:26:42 -0800 Subject: cfg80211: add get reg command This lets userspace request to get the currently set regulatory domain. Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- include/linux/nl80211.h | 4 +++ net/wireless/nl80211.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++ net/wireless/reg.c | 2 +- net/wireless/reg.h | 2 ++ 4 files changed, 88 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 76aae3d8e97e..4bc27049f4e5 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -113,6 +113,8 @@ * @NL80211_CMD_SET_BSS: Set BSS attributes for BSS identified by * %NL80211_ATTR_IFINDEX. * + * @NL80211_CMD_GET_REG: ask the wireless core to send us its currently set + * regulatory domain. * @NL80211_CMD_SET_REG: Set current regulatory domain. CRDA sends this command * after being queried by the kernel. 
CRDA replies by sending a regulatory + domain structure which consists of %NL80211_ATTR_REG_ALPHA set to our @@ -188,6 +190,8 @@ enum nl80211_commands { NL80211_CMD_SET_MGMT_EXTRA_IE, + NL80211_CMD_GET_REG, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e69da8d20474..d452396006ee 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2093,6 +2093,81 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info) #undef FILL_IN_MESH_PARAM_IF_SET +static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr = NULL; + struct nlattr *nl_reg_rules; + unsigned int i; + int err = -EINVAL; + + mutex_lock(&cfg80211_drv_mutex); + + if (!cfg80211_regdomain) + goto out; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) { + err = -ENOBUFS; + goto out; + } + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_GET_REG); + if (!hdr) + goto nla_put_failure; + + NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, + cfg80211_regdomain->alpha2); + + nl_reg_rules = nla_nest_start(msg, NL80211_ATTR_REG_RULES); + if (!nl_reg_rules) + goto nla_put_failure; + + for (i = 0; i < cfg80211_regdomain->n_reg_rules; i++) { + struct nlattr *nl_reg_rule; + const struct ieee80211_reg_rule *reg_rule; + const struct ieee80211_freq_range *freq_range; + const struct ieee80211_power_rule *power_rule; + + reg_rule = &cfg80211_regdomain->reg_rules[i]; + freq_range = &reg_rule->freq_range; + power_rule = &reg_rule->power_rule; + + nl_reg_rule = nla_nest_start(msg, i); + if (!nl_reg_rule) + goto nla_put_failure; + + NLA_PUT_U32(msg, NL80211_ATTR_REG_RULE_FLAGS, + reg_rule->flags); + NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_START, + freq_range->start_freq_khz); + NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_END, + freq_range->end_freq_khz); + NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_MAX_BW, + freq_range->max_bandwidth_khz); + NLA_PUT_U32(msg, NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN, + power_rule->max_antenna_gain); + NLA_PUT_U32(msg, NL80211_ATTR_POWER_RULE_MAX_EIRP, + power_rule->max_eirp); + + nla_nest_end(msg, nl_reg_rule); + } + + nla_nest_end(msg, nl_reg_rules); + + genlmsg_end(msg, hdr); + err = genlmsg_unicast(msg, info->snd_pid); + goto out; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + err = -EMSGSIZE; +out: + mutex_unlock(&cfg80211_drv_mutex); + return err; +} + static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1]; @@ -2332,6 +2407,12 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_GET_REG, + .doit = nl80211_get_reg, + .policy = nl80211_policy, + /* can be retrieved by unprivileged users */ + }, { .cmd = NL80211_CMD_SET_REG, .doit = nl80211_set_reg, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index f643d3981102..2323644330cd 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -57,7 +57,7 @@ static u32 supported_bandwidths[] = { /* Central wireless core regulatory domains, we only need two, * the current one and a world regulatory domain in case we have no * information to give us an alpha2 */ -static const struct ieee80211_regdomain *cfg80211_regdomain; +const struct ieee80211_regdomain *cfg80211_regdomain; /* We use this as a place for the rd structure built from the * last parsed country IE to rest until CRDA gets back to us with diff --git a/net/wireless/reg.h
b/net/wireless/reg.h index eb1dd5bc9b27..fe8c83f34fb7 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -1,6 +1,8 @@ #ifndef __NET_WIRELESS_REG_H #define __NET_WIRELESS_REG_H +extern const struct ieee80211_regdomain *cfg80211_regdomain; + bool is_world_regdom(const char *alpha2); bool reg_is_valid_request(const char *alpha2); -- cgit v1.2.3-71-gd317 From 0c2bec96945ccfc4a58a88d73531e392972ba6c5 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 3 Feb 2009 09:04:20 +0200 Subject: libertas: if_spi: add ability to call board specific setup/teardown methods In certain cases it is required to perform board specific actions before activating libertas G-SPI interface. These actions may include power up of the chip, GPIOs setup, proper pin-strapping and SPI controller config. This patch adds ability to call board specific setup/teardown methods Signed-off-by: Mike Rapoport Acked-by: Andrey Yurovsky Acked-by: Dan Williams Signed-off-by: John W. Linville --- drivers/net/wireless/libertas/if_spi.c | 15 +++++++++++++++ include/linux/spi/libertas_spi.h | 7 +++++++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/wireless/libertas/if_spi.c b/drivers/net/wireless/libertas/if_spi.c index 7c02ea314fd1..07311e71af92 100644 --- a/drivers/net/wireless/libertas/if_spi.c +++ b/drivers/net/wireless/libertas/if_spi.c @@ -42,6 +42,7 @@ struct if_spi_packet { struct if_spi_card { struct spi_device *spi; struct lbs_private *priv; + struct libertas_spi_platform_data *pdata; char helper_fw_name[FIRMWARE_NAME_MAX]; char main_fw_name[FIRMWARE_NAME_MAX]; @@ -1022,6 +1023,17 @@ static int __devinit if_spi_probe(struct spi_device *spi) lbs_deb_enter(LBS_DEB_SPI); + if (!pdata) { + err = -EINVAL; + goto out; + } + + if (pdata->setup) { + err = pdata->setup(spi); + if (err) + goto out; + } + /* Allocate card structure to represent this specific device */ card = kzalloc(sizeof(struct if_spi_card), GFP_KERNEL); if (!card) { @@ -1029,6 +1041,7 @@ static int __devinit if_spi_probe(struct spi_device *spi) goto out; } spi_set_drvdata(spi, card); + card->pdata = pdata; card->spi = spi; card->gpio_cs = pdata->gpio_cs; card->prev_xfer_time = jiffies; @@ -1158,6 +1171,8 @@ static int __devexit libertas_spi_remove(struct spi_device *spi) if_spi_terminate_spi_thread(card); lbs_remove_card(priv); /* will call free_netdev */ gpio_free(card->gpio_cs); + if (card->pdata->teardown) + card->pdata->teardown(spi); free_if_spi_card(card); lbs_deb_leave(LBS_DEB_SPI); return 0; diff --git a/include/linux/spi/libertas_spi.h b/include/linux/spi/libertas_spi.h index ada71b4f3788..79506f5f9e67 100644 --- a/include/linux/spi/libertas_spi.h +++ b/include/linux/spi/libertas_spi.h @@ -10,6 +10,9 @@ */ #ifndef _LIBERTAS_SPI_H_ #define _LIBERTAS_SPI_H_ + +struct spi_device; + struct libertas_spi_platform_data { /* There are two ways to read data from the WLAN module's SPI * interface. Setting 0 or 1 here controls which one is used. @@ -21,5 +24,9 @@ struct libertas_spi_platform_data { /* GPIO number to use as chip select */ u16 gpio_cs; + + /* Board specific setup/teardown */ + int (*setup)(struct spi_device *spi); + int (*teardown)(struct spi_device *spi); }; #endif -- cgit v1.2.3-71-gd317 From c9703146158c0415a60799570397e488bc982af5 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Tue, 3 Feb 2009 19:23:18 +0100 Subject: ssb: Add PMU support This adds support for the SSB PMU. A PMU is found on Low-Power devices. Signed-off-by: Michael Buesch Signed-off-by: John W. 
Linville --- drivers/ssb/Makefile | 1 + drivers/ssb/driver_chipcommon.c | 14 +- drivers/ssb/driver_chipcommon_pmu.c | 508 ++++++++++++++++++++++++++++++ include/linux/ssb/ssb_driver_chipcommon.h | 224 +++++++++++++ 4 files changed, 734 insertions(+), 13 deletions(-) create mode 100644 drivers/ssb/driver_chipcommon_pmu.c (limited to 'include/linux') diff --git a/drivers/ssb/Makefile b/drivers/ssb/Makefile index 6f255e9c5af9..cfbb74f2982e 100644 --- a/drivers/ssb/Makefile +++ b/drivers/ssb/Makefile @@ -9,6 +9,7 @@ ssb-$(CONFIG_SSB_PCMCIAHOST) += pcmcia.o # built-in drivers ssb-y += driver_chipcommon.o +ssb-y += driver_chipcommon_pmu.o ssb-$(CONFIG_SSB_DRIVER_MIPS) += driver_mipscore.o ssb-$(CONFIG_SSB_DRIVER_EXTIF) += driver_extif.o ssb-$(CONFIG_SSB_DRIVER_PCICORE) += driver_pcicore.o diff --git a/drivers/ssb/driver_chipcommon.c b/drivers/ssb/driver_chipcommon.c index 571f4fd55236..9681536163ca 100644 --- a/drivers/ssb/driver_chipcommon.c +++ b/drivers/ssb/driver_chipcommon.c @@ -26,19 +26,6 @@ enum ssb_clksrc { }; -static inline u32 chipco_read32(struct ssb_chipcommon *cc, - u16 offset) -{ - return ssb_read32(cc->dev, offset); -} - -static inline void chipco_write32(struct ssb_chipcommon *cc, - u16 offset, - u32 value) -{ - ssb_write32(cc->dev, offset, value); -} - static inline u32 chipco_write32_masked(struct ssb_chipcommon *cc, u16 offset, u32 mask, u32 value) { @@ -246,6 +233,7 @@ void ssb_chipcommon_init(struct ssb_chipcommon *cc) { if (!cc->dev) return; /* We don't have a ChipCommon */ + ssb_pmu_init(cc); chipco_powercontrol_init(cc); ssb_chipco_set_clockmode(cc, SSB_CLKMODE_FAST); calc_fast_powerup_delay(cc); diff --git a/drivers/ssb/driver_chipcommon_pmu.c b/drivers/ssb/driver_chipcommon_pmu.c new file mode 100644 index 000000000000..4aaddeec55a2 --- /dev/null +++ b/drivers/ssb/driver_chipcommon_pmu.c @@ -0,0 +1,508 @@ +/* + * Sonics Silicon Backplane + * Broadcom ChipCommon Power Management Unit driver + * + * Copyright 2009, Michael Buesch + * Copyright 2007, Broadcom Corporation + * + * Licensed under the GNU/GPL. See COPYING for details. 
+ */ + +#include +#include +#include +#include + +#include "ssb_private.h" + +static u32 ssb_chipco_pll_read(struct ssb_chipcommon *cc, u32 offset) +{ + chipco_write32(cc, SSB_CHIPCO_PLLCTL_ADDR, offset); + return chipco_read32(cc, SSB_CHIPCO_PLLCTL_DATA); +} + +static void ssb_chipco_pll_write(struct ssb_chipcommon *cc, + u32 offset, u32 value) +{ + chipco_write32(cc, SSB_CHIPCO_PLLCTL_ADDR, offset); + chipco_write32(cc, SSB_CHIPCO_PLLCTL_DATA, value); +} + +struct pmu0_plltab_entry { + u16 freq; /* Crystal frequency in kHz.*/ + u8 xf; /* Crystal frequency value for PMU control */ + u8 wb_int; + u32 wb_frac; +}; + +static const struct pmu0_plltab_entry pmu0_plltab[] = { + { .freq = 12000, .xf = 1, .wb_int = 73, .wb_frac = 349525, }, + { .freq = 13000, .xf = 2, .wb_int = 67, .wb_frac = 725937, }, + { .freq = 14400, .xf = 3, .wb_int = 61, .wb_frac = 116508, }, + { .freq = 15360, .xf = 4, .wb_int = 57, .wb_frac = 305834, }, + { .freq = 16200, .xf = 5, .wb_int = 54, .wb_frac = 336579, }, + { .freq = 16800, .xf = 6, .wb_int = 52, .wb_frac = 399457, }, + { .freq = 19200, .xf = 7, .wb_int = 45, .wb_frac = 873813, }, + { .freq = 19800, .xf = 8, .wb_int = 44, .wb_frac = 466033, }, + { .freq = 20000, .xf = 9, .wb_int = 44, .wb_frac = 0, }, + { .freq = 25000, .xf = 10, .wb_int = 70, .wb_frac = 419430, }, + { .freq = 26000, .xf = 11, .wb_int = 67, .wb_frac = 725937, }, + { .freq = 30000, .xf = 12, .wb_int = 58, .wb_frac = 699050, }, + { .freq = 38400, .xf = 13, .wb_int = 45, .wb_frac = 873813, }, + { .freq = 40000, .xf = 14, .wb_int = 45, .wb_frac = 0, }, +}; +#define SSB_PMU0_DEFAULT_XTALFREQ 20000 + +static const struct pmu0_plltab_entry * pmu0_plltab_find_entry(u32 crystalfreq) +{ + const struct pmu0_plltab_entry *e; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(pmu0_plltab); i++) { + e = &pmu0_plltab[i]; + if (e->freq == crystalfreq) + return e; + } + + return NULL; +} + +/* Tune the PLL to the crystal speed. crystalfreq is in kHz. */ +static void ssb_pmu0_pllinit_r0(struct ssb_chipcommon *cc, + u32 crystalfreq) +{ + struct ssb_bus *bus = cc->dev->bus; + const struct pmu0_plltab_entry *e = NULL; + u32 pmuctl, tmp, pllctl; + unsigned int i; + + if ((bus->chip_id == 0x5354) && !crystalfreq) { + /* The 5354 crystal freq is 25MHz */ + crystalfreq = 25000; + } + if (crystalfreq) + e = pmu0_plltab_find_entry(crystalfreq); + if (!e) + e = pmu0_plltab_find_entry(SSB_PMU0_DEFAULT_XTALFREQ); + BUG_ON(!e); + crystalfreq = e->freq; + cc->pmu.crystalfreq = e->freq; + + /* Check if the PLL already is programmed to this frequency. */ + pmuctl = chipco_read32(cc, SSB_CHIPCO_PMU_CTL); + if (((pmuctl & SSB_CHIPCO_PMU_CTL_XTALFREQ) >> SSB_CHIPCO_PMU_CTL_XTALFREQ_SHIFT) == e->xf) { + /* We're already there... */ + return; + } + + ssb_printk(KERN_INFO PFX "Programming PLL to %u.%03u MHz\n", + (crystalfreq / 1000), (crystalfreq % 1000)); + + /* First turn the PLL off. 
*/ + switch (bus->chip_id) { + case 0x4328: + chipco_mask32(cc, SSB_CHIPCO_PMU_MINRES_MSK, + ~(1 << SSB_PMURES_4328_BB_PLL_PU)); + chipco_mask32(cc, SSB_CHIPCO_PMU_MAXRES_MSK, + ~(1 << SSB_PMURES_4328_BB_PLL_PU)); + break; + case 0x5354: + chipco_mask32(cc, SSB_CHIPCO_PMU_MINRES_MSK, + ~(1 << SSB_PMURES_5354_BB_PLL_PU)); + chipco_mask32(cc, SSB_CHIPCO_PMU_MAXRES_MSK, + ~(1 << SSB_PMURES_5354_BB_PLL_PU)); + break; + default: + SSB_WARN_ON(1); + } + for (i = 1500; i; i--) { + tmp = chipco_read32(cc, SSB_CHIPCO_CLKCTLST); + if (!(tmp & SSB_CHIPCO_CLKCTLST_HAVEHT)) + break; + udelay(10); + } + tmp = chipco_read32(cc, SSB_CHIPCO_CLKCTLST); + if (tmp & SSB_CHIPCO_CLKCTLST_HAVEHT) + ssb_printk(KERN_EMERG PFX "Failed to turn the PLL off!\n"); + + /* Set PDIV in PLL control 0. */ + pllctl = ssb_chipco_pll_read(cc, SSB_PMU0_PLLCTL0); + if (crystalfreq >= SSB_PMU0_PLLCTL0_PDIV_FREQ) + pllctl |= SSB_PMU0_PLLCTL0_PDIV_MSK; + else + pllctl &= ~SSB_PMU0_PLLCTL0_PDIV_MSK; + ssb_chipco_pll_write(cc, SSB_PMU0_PLLCTL0, pllctl); + + /* Set WILD in PLL control 1. */ + pllctl = ssb_chipco_pll_read(cc, SSB_PMU0_PLLCTL1); + pllctl &= ~SSB_PMU0_PLLCTL1_STOPMOD; + pllctl &= ~(SSB_PMU0_PLLCTL1_WILD_IMSK | SSB_PMU0_PLLCTL1_WILD_FMSK); + pllctl |= ((u32)e->wb_int << SSB_PMU0_PLLCTL1_WILD_IMSK_SHIFT) & SSB_PMU0_PLLCTL1_WILD_IMSK; + pllctl |= ((u32)e->wb_frac << SSB_PMU0_PLLCTL1_WILD_FMSK_SHIFT) & SSB_PMU0_PLLCTL1_WILD_FMSK; + if (e->wb_frac == 0) + pllctl |= SSB_PMU0_PLLCTL1_STOPMOD; + ssb_chipco_pll_write(cc, SSB_PMU0_PLLCTL1, pllctl); + + /* Set WILD in PLL control 2. */ + pllctl = ssb_chipco_pll_read(cc, SSB_PMU0_PLLCTL2); + pllctl &= ~SSB_PMU0_PLLCTL2_WILD_IMSKHI; + pllctl |= (((u32)e->wb_int >> 4) << SSB_PMU0_PLLCTL2_WILD_IMSKHI_SHIFT) & SSB_PMU0_PLLCTL2_WILD_IMSKHI; + ssb_chipco_pll_write(cc, SSB_PMU0_PLLCTL2, pllctl); + + /* Set the crystalfrequency and the divisor. 
*/ + pmuctl = chipco_read32(cc, SSB_CHIPCO_PMU_CTL); + pmuctl &= ~SSB_CHIPCO_PMU_CTL_ILP_DIV; + pmuctl |= (((crystalfreq + 127) / 128 - 1) << SSB_CHIPCO_PMU_CTL_ILP_DIV_SHIFT) + & SSB_CHIPCO_PMU_CTL_ILP_DIV; + pmuctl &= ~SSB_CHIPCO_PMU_CTL_XTALFREQ; + pmuctl |= ((u32)e->xf << SSB_CHIPCO_PMU_CTL_XTALFREQ_SHIFT) & SSB_CHIPCO_PMU_CTL_XTALFREQ; + chipco_write32(cc, SSB_CHIPCO_PMU_CTL, pmuctl); +} + +struct pmu1_plltab_entry { + u16 freq; /* Crystal frequency in kHz.*/ + u8 xf; /* Crystal frequency value for PMU control */ + u8 ndiv_int; + u32 ndiv_frac; + u8 p1div; + u8 p2div; +}; + +static const struct pmu1_plltab_entry pmu1_plltab[] = { + { .freq = 12000, .xf = 1, .p1div = 3, .p2div = 22, .ndiv_int = 0x9, .ndiv_frac = 0xFFFFEF, }, + { .freq = 13000, .xf = 2, .p1div = 1, .p2div = 6, .ndiv_int = 0xb, .ndiv_frac = 0x483483, }, + { .freq = 14400, .xf = 3, .p1div = 1, .p2div = 10, .ndiv_int = 0xa, .ndiv_frac = 0x1C71C7, }, + { .freq = 15360, .xf = 4, .p1div = 1, .p2div = 5, .ndiv_int = 0xb, .ndiv_frac = 0x755555, }, + { .freq = 16200, .xf = 5, .p1div = 1, .p2div = 10, .ndiv_int = 0x5, .ndiv_frac = 0x6E9E06, }, + { .freq = 16800, .xf = 6, .p1div = 1, .p2div = 10, .ndiv_int = 0x5, .ndiv_frac = 0x3CF3CF, }, + { .freq = 19200, .xf = 7, .p1div = 1, .p2div = 9, .ndiv_int = 0x5, .ndiv_frac = 0x17B425, }, + { .freq = 19800, .xf = 8, .p1div = 1, .p2div = 11, .ndiv_int = 0x4, .ndiv_frac = 0xA57EB, }, + { .freq = 20000, .xf = 9, .p1div = 1, .p2div = 11, .ndiv_int = 0x4, .ndiv_frac = 0, }, + { .freq = 24000, .xf = 10, .p1div = 3, .p2div = 11, .ndiv_int = 0xa, .ndiv_frac = 0, }, + { .freq = 25000, .xf = 11, .p1div = 5, .p2div = 16, .ndiv_int = 0xb, .ndiv_frac = 0, }, + { .freq = 26000, .xf = 12, .p1div = 1, .p2div = 2, .ndiv_int = 0x10, .ndiv_frac = 0xEC4EC4, }, + { .freq = 30000, .xf = 13, .p1div = 3, .p2div = 8, .ndiv_int = 0xb, .ndiv_frac = 0, }, + { .freq = 38400, .xf = 14, .p1div = 1, .p2div = 5, .ndiv_int = 0x4, .ndiv_frac = 0x955555, }, + { .freq = 40000, .xf = 15, .p1div = 1, .p2div = 2, .ndiv_int = 0xb, .ndiv_frac = 0, }, +}; + +#define SSB_PMU1_DEFAULT_XTALFREQ 15360 + +static const struct pmu1_plltab_entry * pmu1_plltab_find_entry(u32 crystalfreq) +{ + const struct pmu1_plltab_entry *e; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(pmu1_plltab); i++) { + e = &pmu1_plltab[i]; + if (e->freq == crystalfreq) + return e; + } + + return NULL; +} + +/* Tune the PLL to the crystal speed. crystalfreq is in kHz. */ +static void ssb_pmu1_pllinit_r0(struct ssb_chipcommon *cc, + u32 crystalfreq) +{ + struct ssb_bus *bus = cc->dev->bus; + const struct pmu1_plltab_entry *e = NULL; + u32 buffer_strength = 0; + u32 tmp, pllctl, pmuctl; + unsigned int i; + + if (bus->chip_id == 0x4312) { + /* We do not touch the BCM4312 PLL and assume + * the default crystal settings work out-of-the-box. */ + cc->pmu.crystalfreq = 20000; + return; + } + + if (crystalfreq) + e = pmu1_plltab_find_entry(crystalfreq); + if (!e) + e = pmu1_plltab_find_entry(SSB_PMU1_DEFAULT_XTALFREQ); + BUG_ON(!e); + crystalfreq = e->freq; + cc->pmu.crystalfreq = e->freq; + + /* Check if the PLL already is programmed to this frequency. */ + pmuctl = chipco_read32(cc, SSB_CHIPCO_PMU_CTL); + if (((pmuctl & SSB_CHIPCO_PMU_CTL_XTALFREQ) >> SSB_CHIPCO_PMU_CTL_XTALFREQ_SHIFT) == e->xf) { + /* We're already there... */ + return; + } + + ssb_printk(KERN_INFO PFX "Programming PLL to %u.%03u MHz\n", + (crystalfreq / 1000), (crystalfreq % 1000)); + + /* First turn the PLL off. 
*/ + switch (bus->chip_id) { + case 0x4325: + chipco_mask32(cc, SSB_CHIPCO_PMU_MINRES_MSK, + ~((1 << SSB_PMURES_4325_BBPLL_PWRSW_PU) | + (1 << SSB_PMURES_4325_HT_AVAIL))); + chipco_mask32(cc, SSB_CHIPCO_PMU_MAXRES_MSK, + ~((1 << SSB_PMURES_4325_BBPLL_PWRSW_PU) | + (1 << SSB_PMURES_4325_HT_AVAIL))); + /* Adjust the BBPLL to 2 on all channels later. */ + buffer_strength = 0x222222; + break; + default: + SSB_WARN_ON(1); + } + for (i = 1500; i; i--) { + tmp = chipco_read32(cc, SSB_CHIPCO_CLKCTLST); + if (!(tmp & SSB_CHIPCO_CLKCTLST_HAVEHT)) + break; + udelay(10); + } + tmp = chipco_read32(cc, SSB_CHIPCO_CLKCTLST); + if (tmp & SSB_CHIPCO_CLKCTLST_HAVEHT) + ssb_printk(KERN_EMERG PFX "Failed to turn the PLL off!\n"); + + /* Set p1div and p2div. */ + pllctl = ssb_chipco_pll_read(cc, SSB_PMU1_PLLCTL0); + pllctl &= ~(SSB_PMU1_PLLCTL0_P1DIV | SSB_PMU1_PLLCTL0_P2DIV); + pllctl |= ((u32)e->p1div << SSB_PMU1_PLLCTL0_P1DIV_SHIFT) & SSB_PMU1_PLLCTL0_P1DIV; + pllctl |= ((u32)e->p2div << SSB_PMU1_PLLCTL0_P2DIV_SHIFT) & SSB_PMU1_PLLCTL0_P2DIV; + ssb_chipco_pll_write(cc, SSB_PMU1_PLLCTL0, pllctl); + + /* Set ndiv int and ndiv mode */ + pllctl = ssb_chipco_pll_read(cc, SSB_PMU1_PLLCTL2); + pllctl &= ~(SSB_PMU1_PLLCTL2_NDIVINT | SSB_PMU1_PLLCTL2_NDIVMODE); + pllctl |= ((u32)e->ndiv_int << SSB_PMU1_PLLCTL2_NDIVINT_SHIFT) & SSB_PMU1_PLLCTL2_NDIVINT; + pllctl |= (1 << SSB_PMU1_PLLCTL2_NDIVMODE_SHIFT) & SSB_PMU1_PLLCTL2_NDIVMODE; + ssb_chipco_pll_write(cc, SSB_PMU1_PLLCTL2, pllctl); + + /* Set ndiv frac */ + pllctl = ssb_chipco_pll_read(cc, SSB_PMU1_PLLCTL3); + pllctl &= ~SSB_PMU1_PLLCTL3_NDIVFRAC; + pllctl |= ((u32)e->ndiv_frac << SSB_PMU1_PLLCTL3_NDIVFRAC_SHIFT) & SSB_PMU1_PLLCTL3_NDIVFRAC; + ssb_chipco_pll_write(cc, SSB_PMU1_PLLCTL3, pllctl); + + /* Change the drive strength, if required. */ + if (buffer_strength) { + pllctl = ssb_chipco_pll_read(cc, SSB_PMU1_PLLCTL5); + pllctl &= ~SSB_PMU1_PLLCTL5_CLKDRV; + pllctl |= (buffer_strength << SSB_PMU1_PLLCTL5_CLKDRV_SHIFT) & SSB_PMU1_PLLCTL5_CLKDRV; + ssb_chipco_pll_write(cc, SSB_PMU1_PLLCTL5, pllctl); + } + + /* Tune the crystalfreq and the divisor. */ + pmuctl = chipco_read32(cc, SSB_CHIPCO_PMU_CTL); + pmuctl &= ~(SSB_CHIPCO_PMU_CTL_ILP_DIV | SSB_CHIPCO_PMU_CTL_XTALFREQ); + pmuctl |= ((((u32)e->freq + 127) / 128 - 1) << SSB_CHIPCO_PMU_CTL_ILP_DIV_SHIFT) + & SSB_CHIPCO_PMU_CTL_ILP_DIV; + pmuctl |= ((u32)e->xf << SSB_CHIPCO_PMU_CTL_XTALFREQ_SHIFT) & SSB_CHIPCO_PMU_CTL_XTALFREQ; + chipco_write32(cc, SSB_CHIPCO_PMU_CTL, pmuctl); +} + +static void ssb_pmu_pll_init(struct ssb_chipcommon *cc) +{ + struct ssb_bus *bus = cc->dev->bus; + u32 crystalfreq = 0; /* in kHz. 0 = keep default freq. */ + + if (bus->bustype == SSB_BUSTYPE_SSB) { + /* TODO: The user may override the crystal frequency. 
*/ + } + + switch (bus->chip_id) { + case 0x4312: + case 0x4325: + ssb_pmu1_pllinit_r0(cc, crystalfreq); + break; + case 0x4328: + case 0x5354: + ssb_pmu0_pllinit_r0(cc, crystalfreq); + break; + default: + ssb_printk(KERN_ERR PFX + "ERROR: PLL init unknown for device %04X\n", + bus->chip_id); + } +} + +struct pmu_res_updown_tab_entry { + u8 resource; /* The resource number */ + u16 updown; /* The updown value */ +}; + +enum pmu_res_depend_tab_task { + PMU_RES_DEP_SET = 1, + PMU_RES_DEP_ADD, + PMU_RES_DEP_REMOVE, +}; + +struct pmu_res_depend_tab_entry { + u8 resource; /* The resource number */ + u8 task; /* SET | ADD | REMOVE */ + u32 depend; /* The depend mask */ +}; + +static const struct pmu_res_updown_tab_entry pmu_res_updown_tab_4328a0[] = { + { .resource = SSB_PMURES_4328_EXT_SWITCHER_PWM, .updown = 0x0101, }, + { .resource = SSB_PMURES_4328_BB_SWITCHER_PWM, .updown = 0x1F01, }, + { .resource = SSB_PMURES_4328_BB_SWITCHER_BURST, .updown = 0x010F, }, + { .resource = SSB_PMURES_4328_BB_EXT_SWITCHER_BURST, .updown = 0x0101, }, + { .resource = SSB_PMURES_4328_ILP_REQUEST, .updown = 0x0202, }, + { .resource = SSB_PMURES_4328_RADIO_SWITCHER_PWM, .updown = 0x0F01, }, + { .resource = SSB_PMURES_4328_RADIO_SWITCHER_BURST, .updown = 0x0F01, }, + { .resource = SSB_PMURES_4328_ROM_SWITCH, .updown = 0x0101, }, + { .resource = SSB_PMURES_4328_PA_REF_LDO, .updown = 0x0F01, }, + { .resource = SSB_PMURES_4328_RADIO_LDO, .updown = 0x0F01, }, + { .resource = SSB_PMURES_4328_AFE_LDO, .updown = 0x0F01, }, + { .resource = SSB_PMURES_4328_PLL_LDO, .updown = 0x0F01, }, + { .resource = SSB_PMURES_4328_BG_FILTBYP, .updown = 0x0101, }, + { .resource = SSB_PMURES_4328_TX_FILTBYP, .updown = 0x0101, }, + { .resource = SSB_PMURES_4328_RX_FILTBYP, .updown = 0x0101, }, + { .resource = SSB_PMURES_4328_XTAL_PU, .updown = 0x0101, }, + { .resource = SSB_PMURES_4328_XTAL_EN, .updown = 0xA001, }, + { .resource = SSB_PMURES_4328_BB_PLL_FILTBYP, .updown = 0x0101, }, + { .resource = SSB_PMURES_4328_RF_PLL_FILTBYP, .updown = 0x0101, }, + { .resource = SSB_PMURES_4328_BB_PLL_PU, .updown = 0x0701, }, +}; + +static const struct pmu_res_depend_tab_entry pmu_res_depend_tab_4328a0[] = { + { + /* Adjust ILP Request to avoid forcing EXT/BB into burst mode. */ + .resource = SSB_PMURES_4328_ILP_REQUEST, + .task = PMU_RES_DEP_SET, + .depend = ((1 << SSB_PMURES_4328_EXT_SWITCHER_PWM) | + (1 << SSB_PMURES_4328_BB_SWITCHER_PWM)), + }, +}; + +static const struct pmu_res_updown_tab_entry pmu_res_updown_tab_4325a0[] = { + { .resource = SSB_PMURES_4325_XTAL_PU, .updown = 0x1501, }, +}; + +static const struct pmu_res_depend_tab_entry pmu_res_depend_tab_4325a0[] = { + { + /* Adjust HT-Available dependencies. */ + .resource = SSB_PMURES_4325_HT_AVAIL, + .task = PMU_RES_DEP_ADD, + .depend = ((1 << SSB_PMURES_4325_RX_PWRSW_PU) | + (1 << SSB_PMURES_4325_TX_PWRSW_PU) | + (1 << SSB_PMURES_4325_LOGEN_PWRSW_PU) | + (1 << SSB_PMURES_4325_AFE_PWRSW_PU)), + }, +}; + +static void ssb_pmu_resources_init(struct ssb_chipcommon *cc) +{ + struct ssb_bus *bus = cc->dev->bus; + u32 min_msk = 0, max_msk = 0; + unsigned int i; + const struct pmu_res_updown_tab_entry *updown_tab = NULL; + unsigned int updown_tab_size; + const struct pmu_res_depend_tab_entry *depend_tab = NULL; + unsigned int depend_tab_size; + + switch (bus->chip_id) { + case 0x4312: + /* We keep the default settings: + * min_msk = 0xCBB + * max_msk = 0x7FFFF + */ + break; + case 0x4325: + /* Power OTP down later. 
*/ + min_msk = (1 << SSB_PMURES_4325_CBUCK_BURST) | + (1 << SSB_PMURES_4325_LNLDO2_PU); + if (chipco_read32(cc, SSB_CHIPCO_CHIPSTAT) & + SSB_CHIPCO_CHST_4325_PMUTOP_2B) + min_msk |= (1 << SSB_PMURES_4325_CLDO_CBUCK_BURST); + /* The PLL may turn on, if it decides so. */ + max_msk = 0xFFFFF; + updown_tab = pmu_res_updown_tab_4325a0; + updown_tab_size = ARRAY_SIZE(pmu_res_updown_tab_4325a0); + depend_tab = pmu_res_depend_tab_4325a0; + depend_tab_size = ARRAY_SIZE(pmu_res_depend_tab_4325a0); + break; + case 0x4328: + min_msk = (1 << SSB_PMURES_4328_EXT_SWITCHER_PWM) | + (1 << SSB_PMURES_4328_BB_SWITCHER_PWM) | + (1 << SSB_PMURES_4328_XTAL_EN); + /* The PLL may turn on, if it decides so. */ + max_msk = 0xFFFFF; + updown_tab = pmu_res_updown_tab_4328a0; + updown_tab_size = ARRAY_SIZE(pmu_res_updown_tab_4328a0); + depend_tab = pmu_res_depend_tab_4328a0; + depend_tab_size = ARRAY_SIZE(pmu_res_depend_tab_4328a0); + break; + case 0x5354: + /* The PLL may turn on, if it decides so. */ + max_msk = 0xFFFFF; + break; + default: + ssb_printk(KERN_ERR PFX + "ERROR: PMU resource config unknown for device %04X\n", + bus->chip_id); + } + + if (updown_tab) { + for (i = 0; i < updown_tab_size; i++) { + chipco_write32(cc, SSB_CHIPCO_PMU_RES_TABSEL, + updown_tab[i].resource); + chipco_write32(cc, SSB_CHIPCO_PMU_RES_UPDNTM, + updown_tab[i].updown); + } + } + if (depend_tab) { + for (i = 0; i < depend_tab_size; i++) { + chipco_write32(cc, SSB_CHIPCO_PMU_RES_TABSEL, + depend_tab[i].resource); + switch (depend_tab[i].task) { + case PMU_RES_DEP_SET: + chipco_write32(cc, SSB_CHIPCO_PMU_RES_DEPMSK, + depend_tab[i].depend); + break; + case PMU_RES_DEP_ADD: + chipco_set32(cc, SSB_CHIPCO_PMU_RES_DEPMSK, + depend_tab[i].depend); + break; + case PMU_RES_DEP_REMOVE: + chipco_mask32(cc, SSB_CHIPCO_PMU_RES_DEPMSK, + ~(depend_tab[i].depend)); + break; + default: + SSB_WARN_ON(1); + } + } + } + + /* Set the resource masks. 
*/ + if (min_msk) + chipco_write32(cc, SSB_CHIPCO_PMU_MINRES_MSK, min_msk); + if (max_msk) + chipco_write32(cc, SSB_CHIPCO_PMU_MAXRES_MSK, max_msk); +} + +void ssb_pmu_init(struct ssb_chipcommon *cc) +{ + struct ssb_bus *bus = cc->dev->bus; + u32 pmucap; + + if (!(cc->capabilities & SSB_CHIPCO_CAP_PMU)) + return; + + pmucap = chipco_read32(cc, SSB_CHIPCO_PMU_CAP); + cc->pmu.rev = (pmucap & SSB_CHIPCO_PMU_CAP_REVISION); + + ssb_dprintk(KERN_DEBUG PFX "Found rev %u PMU (capabilities 0x%08X)\n", + cc->pmu.rev, pmucap); + + if (cc->pmu.rev >= 1) { + if ((bus->chip_id == 0x4325) && (bus->chip_rev < 2)) { + chipco_mask32(cc, SSB_CHIPCO_PMU_CTL, + ~SSB_CHIPCO_PMU_CTL_NOILPONW); + } else { + chipco_set32(cc, SSB_CHIPCO_PMU_CTL, + SSB_CHIPCO_PMU_CTL_NOILPONW); + } + } + ssb_pmu_pll_init(cc); + ssb_pmu_resources_init(cc); +} diff --git a/include/linux/ssb/ssb_driver_chipcommon.h b/include/linux/ssb/ssb_driver_chipcommon.h index 7d7e03dcf77c..d3b1d18922f2 100644 --- a/include/linux/ssb/ssb_driver_chipcommon.h +++ b/include/linux/ssb/ssb_driver_chipcommon.h @@ -181,6 +181,16 @@ #define SSB_CHIPCO_PROG_WAITCNT 0x0124 #define SSB_CHIPCO_FLASH_CFG 0x0128 #define SSB_CHIPCO_FLASH_WAITCNT 0x012C +#define SSB_CHIPCO_CLKCTLST 0x01E0 /* Clock control and status (rev >= 20) */ +#define SSB_CHIPCO_CLKCTLST_FORCEALP 0x00000001 /* Force ALP request */ +#define SSB_CHIPCO_CLKCTLST_FORCEHT 0x00000002 /* Force HT request */ +#define SSB_CHIPCO_CLKCTLST_FORCEILP 0x00000004 /* Force ILP request */ +#define SSB_CHIPCO_CLKCTLST_HAVEALPREQ 0x00000008 /* ALP available request */ +#define SSB_CHIPCO_CLKCTLST_HAVEHTREQ 0x00000010 /* HT available request */ +#define SSB_CHIPCO_CLKCTLST_HWCROFF 0x00000020 /* Force HW clock request off */ +#define SSB_CHIPCO_CLKCTLST_HAVEHT 0x00010000 /* HT available */ +#define SSB_CHIPCO_CLKCTLST_HAVEALP 0x00020000 /* APL available */ +#define SSB_CHIPCO_HW_WORKAROUND 0x01E4 /* Hardware workaround (rev >= 20) */ #define SSB_CHIPCO_UART0_DATA 0x0300 #define SSB_CHIPCO_UART0_IMR 0x0304 #define SSB_CHIPCO_UART0_FCR 0x0308 @@ -197,6 +207,196 @@ #define SSB_CHIPCO_UART1_LSR 0x0414 #define SSB_CHIPCO_UART1_MSR 0x0418 #define SSB_CHIPCO_UART1_SCRATCH 0x041C +/* PMU registers (rev >= 20) */ +#define SSB_CHIPCO_PMU_CTL 0x0600 /* PMU control */ +#define SSB_CHIPCO_PMU_CTL_ILP_DIV 0xFFFF0000 /* ILP div mask */ +#define SSB_CHIPCO_PMU_CTL_ILP_DIV_SHIFT 16 +#define SSB_CHIPCO_PMU_CTL_NOILPONW 0x00000200 /* No ILP on wait */ +#define SSB_CHIPCO_PMU_CTL_HTREQEN 0x00000100 /* HT req enable */ +#define SSB_CHIPCO_PMU_CTL_ALPREQEN 0x00000080 /* ALP req enable */ +#define SSB_CHIPCO_PMU_CTL_XTALFREQ 0x0000007C /* Crystal freq */ +#define SSB_CHIPCO_PMU_CTL_XTALFREQ_SHIFT 2 +#define SSB_CHIPCO_PMU_CTL_ILPDIVEN 0x00000002 /* ILP div enable */ +#define SSB_CHIPCO_PMU_CTL_LPOSEL 0x00000001 /* LPO sel */ +#define SSB_CHIPCO_PMU_CAP 0x0604 /* PMU capabilities */ +#define SSB_CHIPCO_PMU_CAP_REVISION 0x000000FF /* Revision mask */ +#define SSB_CHIPCO_PMU_STAT 0x0608 /* PMU status */ +#define SSB_CHIPCO_PMU_STAT_INTPEND 0x00000040 /* Interrupt pending */ +#define SSB_CHIPCO_PMU_STAT_SBCLKST 0x00000030 /* Backplane clock status? 
*/ +#define SSB_CHIPCO_PMU_STAT_HAVEALP 0x00000008 /* ALP available */ +#define SSB_CHIPCO_PMU_STAT_HAVEHT 0x00000004 /* HT available */ +#define SSB_CHIPCO_PMU_STAT_RESINIT 0x00000003 /* Res init */ +#define SSB_CHIPCO_PMU_RES_STAT 0x060C /* PMU res status */ +#define SSB_CHIPCO_PMU_RES_PEND 0x0610 /* PMU res pending */ +#define SSB_CHIPCO_PMU_TIMER 0x0614 /* PMU timer */ +#define SSB_CHIPCO_PMU_MINRES_MSK 0x0618 /* PMU min res mask */ +#define SSB_CHIPCO_PMU_MAXRES_MSK 0x061C /* PMU max res mask */ +#define SSB_CHIPCO_PMU_RES_TABSEL 0x0620 /* PMU res table sel */ +#define SSB_CHIPCO_PMU_RES_DEPMSK 0x0624 /* PMU res dep mask */ +#define SSB_CHIPCO_PMU_RES_UPDNTM 0x0628 /* PMU res updown timer */ +#define SSB_CHIPCO_PMU_RES_TIMER 0x062C /* PMU res timer */ +#define SSB_CHIPCO_PMU_CLKSTRETCH 0x0630 /* PMU clockstretch */ +#define SSB_CHIPCO_PMU_WATCHDOG 0x0634 /* PMU watchdog */ +#define SSB_CHIPCO_PMU_RES_REQTS 0x0640 /* PMU res req timer sel */ +#define SSB_CHIPCO_PMU_RES_REQT 0x0644 /* PMU res req timer */ +#define SSB_CHIPCO_PMU_RES_REQM 0x0648 /* PMU res req mask */ +#define SSB_CHIPCO_CHIPCTL_ADDR 0x0650 +#define SSB_CHIPCO_CHIPCTL_DATA 0x0654 +#define SSB_CHIPCO_REGCTL_ADDR 0x0658 +#define SSB_CHIPCO_REGCTL_DATA 0x065C +#define SSB_CHIPCO_PLLCTL_ADDR 0x0660 +#define SSB_CHIPCO_PLLCTL_DATA 0x0664 + + + +/** PMU PLL registers */ + +/* PMU rev 0 PLL registers */ +#define SSB_PMU0_PLLCTL0 0 +#define SSB_PMU0_PLLCTL0_PDIV_MSK 0x00000001 +#define SSB_PMU0_PLLCTL0_PDIV_FREQ 25000 /* kHz */ +#define SSB_PMU0_PLLCTL1 1 +#define SSB_PMU0_PLLCTL1_WILD_IMSK 0xF0000000 /* Wild int mask (low nibble) */ +#define SSB_PMU0_PLLCTL1_WILD_IMSK_SHIFT 28 +#define SSB_PMU0_PLLCTL1_WILD_FMSK 0x0FFFFF00 /* Wild frac mask */ +#define SSB_PMU0_PLLCTL1_WILD_FMSK_SHIFT 8 +#define SSB_PMU0_PLLCTL1_STOPMOD 0x00000040 /* Stop mod */ +#define SSB_PMU0_PLLCTL2 2 +#define SSB_PMU0_PLLCTL2_WILD_IMSKHI 0x0000000F /* Wild int mask (high nibble) */ +#define SSB_PMU0_PLLCTL2_WILD_IMSKHI_SHIFT 0 + +/* PMU rev 1 PLL registers */ +#define SSB_PMU1_PLLCTL0 0 +#define SSB_PMU1_PLLCTL0_P1DIV 0x00F00000 /* P1 div */ +#define SSB_PMU1_PLLCTL0_P1DIV_SHIFT 20 +#define SSB_PMU1_PLLCTL0_P2DIV 0x0F000000 /* P2 div */ +#define SSB_PMU1_PLLCTL0_P2DIV_SHIFT 24 +#define SSB_PMU1_PLLCTL1 1 +#define SSB_PMU1_PLLCTL1_M1DIV 0x000000FF /* M1 div */ +#define SSB_PMU1_PLLCTL1_M1DIV_SHIFT 0 +#define SSB_PMU1_PLLCTL1_M2DIV 0x0000FF00 /* M2 div */ +#define SSB_PMU1_PLLCTL1_M2DIV_SHIFT 8 +#define SSB_PMU1_PLLCTL1_M3DIV 0x00FF0000 /* M3 div */ +#define SSB_PMU1_PLLCTL1_M3DIV_SHIFT 16 +#define SSB_PMU1_PLLCTL1_M4DIV 0xFF000000 /* M4 div */ +#define SSB_PMU1_PLLCTL1_M4DIV_SHIFT 24 +#define SSB_PMU1_PLLCTL2 2 +#define SSB_PMU1_PLLCTL2_M5DIV 0x000000FF /* M5 div */ +#define SSB_PMU1_PLLCTL2_M5DIV_SHIFT 0 +#define SSB_PMU1_PLLCTL2_M6DIV 0x0000FF00 /* M6 div */ +#define SSB_PMU1_PLLCTL2_M6DIV_SHIFT 8 +#define SSB_PMU1_PLLCTL2_NDIVMODE 0x000E0000 /* NDIV mode */ +#define SSB_PMU1_PLLCTL2_NDIVMODE_SHIFT 17 +#define SSB_PMU1_PLLCTL2_NDIVINT 0x1FF00000 /* NDIV int */ +#define SSB_PMU1_PLLCTL2_NDIVINT_SHIFT 20 +#define SSB_PMU1_PLLCTL3 3 +#define SSB_PMU1_PLLCTL3_NDIVFRAC 0x00FFFFFF /* NDIV frac */ +#define SSB_PMU1_PLLCTL3_NDIVFRAC_SHIFT 0 +#define SSB_PMU1_PLLCTL4 4 +#define SSB_PMU1_PLLCTL5 5 +#define SSB_PMU1_PLLCTL5_CLKDRV 0xFFFFFF00 /* clk drv */ +#define SSB_PMU1_PLLCTL5_CLKDRV_SHIFT 8 + +/* BCM4312 PLL resource numbers. 
*/ +#define SSB_PMURES_4312_SWITCHER_BURST 0 +#define SSB_PMURES_4312_SWITCHER_PWM 1 +#define SSB_PMURES_4312_PA_REF_LDO 2 +#define SSB_PMURES_4312_CORE_LDO_BURST 3 +#define SSB_PMURES_4312_CORE_LDO_PWM 4 +#define SSB_PMURES_4312_RADIO_LDO 5 +#define SSB_PMURES_4312_ILP_REQUEST 6 +#define SSB_PMURES_4312_BG_FILTBYP 7 +#define SSB_PMURES_4312_TX_FILTBYP 8 +#define SSB_PMURES_4312_RX_FILTBYP 9 +#define SSB_PMURES_4312_XTAL_PU 10 +#define SSB_PMURES_4312_ALP_AVAIL 11 +#define SSB_PMURES_4312_BB_PLL_FILTBYP 12 +#define SSB_PMURES_4312_RF_PLL_FILTBYP 13 +#define SSB_PMURES_4312_HT_AVAIL 14 + +/* BCM4325 PLL resource numbers. */ +#define SSB_PMURES_4325_BUCK_BOOST_BURST 0 +#define SSB_PMURES_4325_CBUCK_BURST 1 +#define SSB_PMURES_4325_CBUCK_PWM 2 +#define SSB_PMURES_4325_CLDO_CBUCK_BURST 3 +#define SSB_PMURES_4325_CLDO_CBUCK_PWM 4 +#define SSB_PMURES_4325_BUCK_BOOST_PWM 5 +#define SSB_PMURES_4325_ILP_REQUEST 6 +#define SSB_PMURES_4325_ABUCK_BURST 7 +#define SSB_PMURES_4325_ABUCK_PWM 8 +#define SSB_PMURES_4325_LNLDO1_PU 9 +#define SSB_PMURES_4325_LNLDO2_PU 10 +#define SSB_PMURES_4325_LNLDO3_PU 11 +#define SSB_PMURES_4325_LNLDO4_PU 12 +#define SSB_PMURES_4325_XTAL_PU 13 +#define SSB_PMURES_4325_ALP_AVAIL 14 +#define SSB_PMURES_4325_RX_PWRSW_PU 15 +#define SSB_PMURES_4325_TX_PWRSW_PU 16 +#define SSB_PMURES_4325_RFPLL_PWRSW_PU 17 +#define SSB_PMURES_4325_LOGEN_PWRSW_PU 18 +#define SSB_PMURES_4325_AFE_PWRSW_PU 19 +#define SSB_PMURES_4325_BBPLL_PWRSW_PU 20 +#define SSB_PMURES_4325_HT_AVAIL 21 + +/* BCM4328 PLL resource numbers. */ +#define SSB_PMURES_4328_EXT_SWITCHER_PWM 0 +#define SSB_PMURES_4328_BB_SWITCHER_PWM 1 +#define SSB_PMURES_4328_BB_SWITCHER_BURST 2 +#define SSB_PMURES_4328_BB_EXT_SWITCHER_BURST 3 +#define SSB_PMURES_4328_ILP_REQUEST 4 +#define SSB_PMURES_4328_RADIO_SWITCHER_PWM 5 +#define SSB_PMURES_4328_RADIO_SWITCHER_BURST 6 +#define SSB_PMURES_4328_ROM_SWITCH 7 +#define SSB_PMURES_4328_PA_REF_LDO 8 +#define SSB_PMURES_4328_RADIO_LDO 9 +#define SSB_PMURES_4328_AFE_LDO 10 +#define SSB_PMURES_4328_PLL_LDO 11 +#define SSB_PMURES_4328_BG_FILTBYP 12 +#define SSB_PMURES_4328_TX_FILTBYP 13 +#define SSB_PMURES_4328_RX_FILTBYP 14 +#define SSB_PMURES_4328_XTAL_PU 15 +#define SSB_PMURES_4328_XTAL_EN 16 +#define SSB_PMURES_4328_BB_PLL_FILTBYP 17 +#define SSB_PMURES_4328_RF_PLL_FILTBYP 18 +#define SSB_PMURES_4328_BB_PLL_PU 19 + +/* BCM5354 PLL resource numbers. */ +#define SSB_PMURES_5354_EXT_SWITCHER_PWM 0 +#define SSB_PMURES_5354_BB_SWITCHER_PWM 1 +#define SSB_PMURES_5354_BB_SWITCHER_BURST 2 +#define SSB_PMURES_5354_BB_EXT_SWITCHER_BURST 3 +#define SSB_PMURES_5354_ILP_REQUEST 4 +#define SSB_PMURES_5354_RADIO_SWITCHER_PWM 5 +#define SSB_PMURES_5354_RADIO_SWITCHER_BURST 6 +#define SSB_PMURES_5354_ROM_SWITCH 7 +#define SSB_PMURES_5354_PA_REF_LDO 8 +#define SSB_PMURES_5354_RADIO_LDO 9 +#define SSB_PMURES_5354_AFE_LDO 10 +#define SSB_PMURES_5354_PLL_LDO 11 +#define SSB_PMURES_5354_BG_FILTBYP 12 +#define SSB_PMURES_5354_TX_FILTBYP 13 +#define SSB_PMURES_5354_RX_FILTBYP 14 +#define SSB_PMURES_5354_XTAL_PU 15 +#define SSB_PMURES_5354_XTAL_EN 16 +#define SSB_PMURES_5354_BB_PLL_FILTBYP 17 +#define SSB_PMURES_5354_RF_PLL_FILTBYP 18 +#define SSB_PMURES_5354_BB_PLL_PU 19 + + + +/** Chip specific Chip-Status register contents. */ +#define SSB_CHIPCO_CHST_4325_SPROM_OTP_SEL 0x00000003 +#define SSB_CHIPCO_CHST_4325_DEFCIS_SEL 0 /* OTP is powered up, use def. 
CIS, no SPROM */ +#define SSB_CHIPCO_CHST_4325_SPROM_SEL 1 /* OTP is powered up, SPROM is present */ +#define SSB_CHIPCO_CHST_4325_OTP_SEL 2 /* OTP is powered up, no SPROM */ +#define SSB_CHIPCO_CHST_4325_OTP_PWRDN 3 /* OTP is powered down, SPROM is present */ +#define SSB_CHIPCO_CHST_4325_SDIO_USB_MODE 0x00000004 +#define SSB_CHIPCO_CHST_4325_SDIO_USB_MODE_SHIFT 2 +#define SSB_CHIPCO_CHST_4325_RCAL_VALID 0x00000008 +#define SSB_CHIPCO_CHST_4325_RCAL_VALID_SHIFT 3 +#define SSB_CHIPCO_CHST_4325_RCAL_VALUE 0x000001F0 +#define SSB_CHIPCO_CHST_4325_RCAL_VALUE_SHIFT 4 +#define SSB_CHIPCO_CHST_4325_PMUTOP_2B 0x00000200 /* 1 for 2b, 0 for to 2a */ @@ -353,11 +553,20 @@ struct ssb_device; struct ssb_serial_port; +/* Data for the PMU, if available. + * Check availability with ((struct ssb_chipcommon)->capabilities & SSB_CHIPCO_CAP_PMU) + */ +struct ssb_chipcommon_pmu { + u8 rev; /* PMU revision */ + u32 crystalfreq; /* The active crystal frequency (in kHz) */ +}; + struct ssb_chipcommon { struct ssb_device *dev; u32 capabilities; /* Fast Powerup Delay constant */ u16 fast_pwrup_delay; + struct ssb_chipcommon_pmu pmu; }; static inline bool ssb_chipco_available(struct ssb_chipcommon *cc) @@ -365,6 +574,17 @@ static inline bool ssb_chipco_available(struct ssb_chipcommon *cc) return (cc->dev != NULL); } +/* Register access */ +#define chipco_read32(cc, offset) ssb_read32((cc)->dev, offset) +#define chipco_write32(cc, offset, val) ssb_write32((cc)->dev, offset, val) + +#define chipco_mask32(cc, offset, mask) \ + chipco_write32(cc, offset, chipco_read32(cc, offset) & (mask)) +#define chipco_set32(cc, offset, set) \ + chipco_write32(cc, offset, chipco_read32(cc, offset) | (set)) +#define chipco_maskset32(cc, offset, mask, set) \ + chipco_write32(cc, offset, (chipco_read32(cc, offset) & (mask)) | (set)) + extern void ssb_chipcommon_init(struct ssb_chipcommon *cc); extern void ssb_chipco_suspend(struct ssb_chipcommon *cc); @@ -406,4 +626,8 @@ extern int ssb_chipco_serial_init(struct ssb_chipcommon *cc, struct ssb_serial_port *ports); #endif /* CONFIG_SSB_SERIAL */ +/* PMU support */ +extern void ssb_pmu_init(struct ssb_chipcommon *cc); + + #endif /* LINUX_SSB_CHIPCO_H_ */ -- cgit v1.2.3-71-gd317 From d54e6d872767ae6512978f86a35d623a8ed948c5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 9 Feb 2009 23:45:29 -0800 Subject: net: Kill skbuff macros from the stone ages. This kills of HAVE_ALLOC_SKB and HAVE_ALIGNABLE_SKB. Nothing in-tree uses them and nothing in-tree has used them since 2.0.x times. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5eba4007e07f..924700844580 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -29,9 +29,6 @@ #include #include -#define HAVE_ALLOC_SKB /* For the drivers to know */ -#define HAVE_ALIGNABLE_SKB /* Ditto 8) */ - /* Don't change this without changing skb_csum_unnecessary! */ #define CHECKSUM_NONE 0 #define CHECKSUM_UNNECESSARY 1 -- cgit v1.2.3-71-gd317 From c3706f005c3aaf570e71f0f083fdbb59a5a9fa2e Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 10 Feb 2009 01:03:18 -0500 Subject: tracing: fix typos in comments Impact: clean up. Fix typos in the comments. 
Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 8 ++++---- kernel/trace/trace.c | 2 +- kernel/trace/trace_hw_branches.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3c103d636da3..8e6646a54acf 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -8,7 +8,7 @@ struct ring_buffer; struct ring_buffer_iter; /* - * Don't reference this struct directly, use functions below. + * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { u32 type:2, len:3, time_delta:27; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 10d202ea06f3..fa64e1f003eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL_GPL(tracing_off); * tracing_off_permanent - permanently disable ring buffers * * This function, once called, will disable all ring buffers - * permanenty. + * permanently. */ void tracing_off_permanent(void) { @@ -210,7 +210,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t commit; /* write commited index */ + local_t commit; /* write committed index */ unsigned char data[]; /* data of buffer page */ }; @@ -260,7 +260,7 @@ struct ring_buffer_per_cpu { struct list_head pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ - struct buffer_page *commit_page; /* commited pages */ + struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; unsigned long overrun; unsigned long entries; @@ -303,7 +303,7 @@ struct ring_buffer_iter { * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * - * As a safty measure we check to make sure the data pages have not + * As a safety measure we check to make sure the data pages have not * been corrupted. */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d89821283b47..d7c175a442df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1963,7 +1963,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, struct tracer_opt *trace_opts = current_trace->flags->opts; - /* calulate max size */ + /* calculate max size */ for (i = 0; trace_options[i]; i++) { len += strlen(trace_options[i]); len += 3; /* "no" and space */ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e3e7db61c067..0794dd33f27b 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -75,7 +75,7 @@ static void bts_trace_start(struct trace_array *tr) } /* - * Start tracing on the current cpu. + * Stop tracing on the current cpu. * The argument is ignored. * * pre: bts_tracer_mutex must be locked. 
-- cgit v1.2.3-71-gd317 From ed850a52af971528b048812c4215cef298af0d3b Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 10 Feb 2009 23:01:19 -0500 Subject: integrity: shmem zero fix Based on comments from Mike Frysinger and Randy Dunlap: (http://lkml.org/lkml/2009/2/9/262) - moved ima.h include before CONFIG_SHMEM test to fix compiler error on Blackfin: mm/shmem.c: In function 'shmem_zero_setup': mm/shmem.c:2670: error: implicit declaration of function 'ima_shm_check' - added 'struct linux_binprm' in ima.h to fix compiler warning on Blackfin: In file included from mm/shmem.c:32: include/linux/ima.h:25: warning: 'struct linux_binprm' declared inside parameter list include/linux/ima.h:25: warning: its scope is only this definition or declaration, which is probably not what you want - moved fs.h include within _LINUX_IMA_H definition Signed-off-by: Mimi Zohar Signed-off-by: Mike Frysinger Signed-off-by: James Morris --- include/linux/ima.h | 5 +++-- mm/shmem.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 6db30a328d98..0e2aa45cb0ce 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -7,11 +7,12 @@ * the Free Software Foundation, version 2 of the License. */ -#include - #ifndef _LINUX_IMA_H #define _LINUX_IMA_H +#include +struct linux_binprm; + #ifdef CONFIG_IMA extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_inode_alloc(struct inode *inode); diff --git a/mm/shmem.c b/mm/shmem.c index 75199888a6bd..8135fac294ee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include #include #include +#include static struct vfsmount *shm_mnt; @@ -59,7 +60,6 @@ static struct vfsmount *shm_mnt; #include #include #include -#include #include #include -- cgit v1.2.3-71-gd317 From ad0b0fd554dfc126b5750d14908dccc3bbf602be Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 10 Feb 2009 11:42:26 -0800 Subject: sched, latencytop: incorporate review feedback from Andrew Morton Andrew had some suggestions for the latencytop file; this patch takes care of most of these: * Add documentation * Turn account_scheduler_latency into an inline function * Don't report negative values to userspace * Make the file operations struct const * Fix a few checkpatch.pl warnings Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/latencytop.h | 10 +++++- kernel/latencytop.c | 83 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 80 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 901c2d6377a8..b0e99898527c 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -9,6 +9,7 @@ #ifndef _INCLUDE_GUARD_LATENCYTOP_H_ #define _INCLUDE_GUARD_LATENCYTOP_H_ +#include #ifdef CONFIG_LATENCYTOP #define LT_SAVECOUNT 32 @@ -24,7 +25,14 @@ struct latency_record { struct task_struct; -void account_scheduler_latency(struct task_struct *task, int usecs, int inter); +extern int latencytop_enabled; +void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); +static inline void +account_scheduler_latency(struct task_struct *task, int usecs, int inter) +{ + if (unlikely(latencytop_enabled)) + __account_scheduler_latency(task, usecs, inter); +} void clear_all_latency_tracing(struct task_struct *p); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 449db466bdbc..ca07c5c0c914 100644 --- a/kernel/latencytop.c +++ 
b/kernel/latencytop.c @@ -9,6 +9,44 @@ * as published by the Free Software Foundation; version 2 * of the License. */ + +/* + * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is + * used by the "latencytop" userspace tool. The latency that is tracked is not + * the 'traditional' interrupt latency (which is primarily caused by something + * else consuming CPU), but instead, it is the latency an application encounters + * because the kernel sleeps on its behalf for various reasons. + * + * This code tracks 2 levels of statistics: + * 1) System level latency + * 2) Per process latency + * + * The latency is stored in fixed sized data structures in an accumulated form; + * if the "same" latency cause is hit twice, this will be tracked as one entry + * in the data structure. Both the count, total accumulated latency and maximum + * latency are tracked in this data structure. When the fixed size structure is + * full, no new causes are tracked until the buffer is flushed by writing to + * the /proc file; the userspace tool does this on a regular basis. + * + * A latency cause is identified by a stringified backtrace at the point that + * the scheduler gets invoked. The userland tool will use this string to + * identify the cause of the latency in human readable form. + * + * The information is exported via /proc/latency_stats and /proc//latency. + * These files look like this: + * + * Latency Top version : v0.1 + * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl + * | | | | + * | | | +----> the stringified backtrace + * | | +---------> The maximum latency for this entry in microseconds + * | +--------------> The accumulated latency for this entry (microseconds) + * +-------------------> The number of times this entry is hit + * + * (note: the average latency is the accumulated latency divided by the number + * of times) + */ + #include #include #include @@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record firstnonnull = i; continue; } - for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long record = lat->backtrace[q]; if (latency_record[i].backtrace[q] != record) { @@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record memcpy(&latency_record[i], lat, sizeof(struct latency_record)); } -static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) +/* + * Iterator to store a backtrace into a latency record entry + */ +static inline void store_stacktrace(struct task_struct *tsk, + struct latency_record *lat) { struct stack_trace trace; memset(&trace, 0, sizeof(trace)); trace.max_entries = LT_BACKTRACEDEPTH; trace.entries = &lat->backtrace[0]; - trace.skip = 0; save_stack_trace_tsk(tsk, &trace); } +/** + * __account_scheduler_latency - record an occured latency + * @tsk - the task struct of the task hitting the latency + * @usecs - the duration of the latency in microseconds + * @inter - 1 if the sleep was interruptible, 0 if uninterruptible + * + * This function is the main entry point for recording latency entries + * as called by the scheduler. + * + * This function has a few special cases to deal with normal 'non-latency' + * sleeps: specifically, interruptible sleep longer than 5 msec is skipped + * since this usually is caused by waiting for events via select() and co. + * + * Negative latencies (caused by time going backwards) are also explicitly + * skipped. 
+ */ void __sched -account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) +__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) { unsigned long flags; int i, q; struct latency_record lat; - if (!latencytop_enabled) - return; - /* Long interruptible waits are generally user requested... */ if (inter && usecs > 5000) return; + /* Negative sleeps are time going backwards */ + /* Zero-time sleeps are non-interesting */ + if (usecs <= 0) + return; + memset(&lat, 0, sizeof(lat)); lat.count = 1; lat.time = usecs; @@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) if (tsk->latency_record_count >= LT_SAVECOUNT) goto out_unlock; - for (i = 0; i < LT_SAVECOUNT ; i++) { + for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *mylat; int same = 1; mylat = &tsk->latency_record[i]; - for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long record = lat.backtrace[q]; if (mylat->backtrace[q] != record) { @@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v) for (i = 0; i < MAXLR; i++) { if (latency_record[i].backtrace[0]) { int q; - seq_printf(m, "%i %li %li ", + seq_printf(m, "%i %lu %lu ", latency_record[i].count, latency_record[i].time, latency_record[i].max); @@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp) return single_open(filp, lstats_show, NULL); } -static struct file_operations lstats_fops = { +static const struct file_operations lstats_fops = { .open = lstats_open, .read = seq_read, .write = lstats_write, @@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } -__initcall(init_lstats_procfs); +device_initcall(init_lstats_procfs); -- cgit v1.2.3-71-gd317 From 523979adfa0b79d4e3aa053220c37a9233294206 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 11 Feb 2009 11:12:28 -0500 Subject: integrity: audit update Based on discussions on linux-audit, as per Steve Grubb's request http://lkml.org/lkml/2009/2/6/269, the following changes were made: - forced audit result to be either 0 or 1. 
- made template names const - Added new stand-alone message type: AUDIT_INTEGRITY_RULE Signed-off-by: Mimi Zohar Acked-by: Steve Grubb Signed-off-by: James Morris --- include/linux/audit.h | 4 +++- security/integrity/ima/ima.h | 2 +- security/integrity/ima/ima_api.c | 2 +- security/integrity/ima/ima_audit.c | 21 ++++++++++++--------- security/integrity/ima/ima_fs.c | 2 +- security/integrity/ima/ima_init.c | 2 +- security/integrity/ima/ima_policy.c | 17 +++++++++-------- 7 files changed, 28 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 930939abfbc6..4fa2810b675e 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -36,7 +36,8 @@ * 1500 - 1599 kernel LSPP events * 1600 - 1699 kernel crypto events * 1700 - 1799 kernel anomaly records - * 1800 - 1999 future kernel use (maybe integrity labels and related events) + * 1800 - 1899 kernel integrity events + * 1900 - 1999 future kernel use * 2000 is for otherwise unclassified kernel audit messages (legacy) * 2001 - 2099 unused (kernel) * 2100 - 2199 user space anomaly records @@ -130,6 +131,7 @@ #define AUDIT_INTEGRITY_STATUS 1802 /* Integrity enable status */ #define AUDIT_INTEGRITY_HASH 1803 /* Integrity HASH type */ #define AUDIT_INTEGRITY_PCR 1804 /* PCR invalidation msgs */ +#define AUDIT_INTEGRITY_RULE 1805 /* policy rule */ #define AUDIT_KERNEL 2000 /* Asynchronous audit record. NOT A REQUEST. */ diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index e3c16a21a38e..165eb5397ea5 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -47,7 +47,7 @@ struct ima_template_data { struct ima_template_entry { u8 digest[IMA_DIGEST_SIZE]; /* sha1 or md5 measurement hash */ - char *template_name; + const char *template_name; int template_len; struct ima_template_data template; }; diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index a148a25804f6..3cd58b60afd2 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -15,7 +15,7 @@ #include #include "ima.h" -static char *IMA_TEMPLATE_NAME = "ima"; +static const char *IMA_TEMPLATE_NAME = "ima"; /* * ima_store_template - store ima template measurements diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c index 8a0f1e23ccf1..1e082bb987be 100644 --- a/security/integrity/ima/ima_audit.c +++ b/security/integrity/ima/ima_audit.c @@ -22,16 +22,18 @@ static int ima_audit; static int __init ima_audit_setup(char *str) { unsigned long audit; - int rc; - char *op; + int rc, result = 0; + char *op = "ima_audit"; + char *cause; rc = strict_strtoul(str, 0, &audit); if (rc || audit > 1) - printk(KERN_INFO "ima: invalid ima_audit value\n"); + result = 1; else ima_audit = audit; - op = ima_audit ? "ima_audit_enabled" : "ima_audit_not_enabled"; - integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, NULL, op, 0, 0); + cause = ima_audit ? 
"enabled" : "not_enabled"; + integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, + op, cause, result, 0); return 1; } __setup("ima_audit=", ima_audit_setup); @@ -47,20 +49,21 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, return; ab = audit_log_start(current->audit_context, GFP_KERNEL, audit_msgno); - audit_log_format(ab, "integrity: pid=%d uid=%u auid=%u", + audit_log_format(ab, "integrity: pid=%d uid=%u auid=%u ses=%u", current->pid, current->cred->uid, - audit_get_loginuid(current)); + audit_get_loginuid(current), + audit_get_sessionid(current)); audit_log_task_context(ab); switch (audit_msgno) { case AUDIT_INTEGRITY_DATA: case AUDIT_INTEGRITY_METADATA: case AUDIT_INTEGRITY_PCR: + case AUDIT_INTEGRITY_STATUS: audit_log_format(ab, " op=%s cause=%s", op, cause); break; case AUDIT_INTEGRITY_HASH: audit_log_format(ab, " op=%s hash=%s", op, cause); break; - case AUDIT_INTEGRITY_STATUS: default: audit_log_format(ab, " op=%s", op); } @@ -73,6 +76,6 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, if (inode) audit_log_format(ab, " dev=%s ino=%lu", inode->i_sb->s_id, inode->i_ino); - audit_log_format(ab, " res=%d", result); + audit_log_format(ab, " res=%d", !result ? 0 : 1); audit_log_end(ab); } diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 573780c76f1f..ffbe259700b1 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -137,7 +137,7 @@ static int ima_measurements_show(struct seq_file *m, void *v) ima_putc(m, &namelen, sizeof namelen); /* 4th: template name */ - ima_putc(m, e->template_name, namelen); + ima_putc(m, (void *)e->template_name, namelen); /* 5th: template specific data */ ima_template_show(m, (struct ima_template_data *)&e->template, diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c index cf227dbfac2c..0b0bb8c978cc 100644 --- a/security/integrity/ima/ima_init.c +++ b/security/integrity/ima/ima_init.c @@ -20,7 +20,7 @@ #include "ima.h" /* name for boot aggregate entry */ -static char *boot_aggregate_name = "boot_aggregate"; +static const char *boot_aggregate_name = "boot_aggregate"; int ima_used_chip; /* Add the boot aggregate to the IMA measurement list and extend diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 23810e0bfc68..b5291ad5ef56 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -12,7 +12,6 @@ */ #include #include -#include #include #include #include @@ -239,8 +238,7 @@ static int ima_parse_rule(char *rule, struct ima_measure_rule_entry *entry) char *p; int result = 0; - ab = audit_log_start(current->audit_context, GFP_KERNEL, - AUDIT_INTEGRITY_STATUS); + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_INTEGRITY_RULE); entry->action = -1; while ((p = strsep(&rule, " \n")) != NULL) { @@ -345,15 +343,14 @@ static int ima_parse_rule(char *rule, struct ima_measure_rule_entry *entry) AUDIT_SUBJ_TYPE); break; case Opt_err: - printk(KERN_INFO "%s: unknown token: %s\n", - __FUNCTION__, p); + audit_log_format(ab, "UNKNOWN=%s ", p); break; } } if (entry->action == UNKNOWN) result = -EINVAL; - audit_log_format(ab, "res=%d", result); + audit_log_format(ab, "res=%d", !result ? 
0 : 1); audit_log_end(ab); return result; } @@ -367,7 +364,7 @@ static int ima_parse_rule(char *rule, struct ima_measure_rule_entry *entry) */ int ima_parse_add_rule(char *rule) { - const char *op = "add_rule"; + const char *op = "update_policy"; struct ima_measure_rule_entry *entry; int result = 0; int audit_info = 0; @@ -394,8 +391,12 @@ int ima_parse_add_rule(char *rule) mutex_lock(&ima_measure_mutex); list_add_tail(&entry->list, &measure_policy_rules); mutex_unlock(&ima_measure_mutex); - } else + } else { kfree(entry); + integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, + NULL, op, "invalid policy", result, + audit_info); + } return result; } -- cgit v1.2.3-71-gd317 From 1d93e52eb48df986a3c4d5ad8a520bf1f6837367 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2009 08:47:19 -0700 Subject: dmaengine: update kerneldoc Some of the kerneldoc comments in the dmaengine header describe already removed structure members. Remove them. Also add a short description for dma_device->device_is_tx_complete. Signed-off-by: Johannes Weiner Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 3e68469c1885..087e79acf8c7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -97,7 +97,6 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; /** * struct dma_chan_percpu - the per-CPU part of struct dma_chan - * @refcount: local_t used for open-coded "bigref" counting * @memcpy_count: transaction counter * @bytes_transferred: byte counter */ @@ -114,9 +113,6 @@ struct dma_chan_percpu { * @cookie: last cookie value returned to client * @chan_id: channel ID for sysfs * @dev: class device for sysfs - * @refcount: kref, used in "bigref" slow-mode - * @slow_ref: indicates that the DMA channel is free - * @rcu: the DMA channel's RCU head * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client-count: how many clients are using this channel @@ -211,8 +207,6 @@ struct dma_async_tx_descriptor { * @global_node: list_head for global dma_device_list * @cap_mask: one or more dma_capability flags * @max_xor: maximum number of xor sources, 0 if no capability - * @refcount: reference count - * @done: IO completion struct * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @device_alloc_chan_resources: allocate resources and return the @@ -225,6 +219,7 @@ struct dma_async_tx_descriptor { * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation * @device_terminate_all: terminate all pending operations + * @device_is_tx_complete: poll for transaction completion * @device_issue_pending: push pending transactions to hardware */ struct dma_device { -- cgit v1.2.3-71-gd317 From f9ce1f1cda8b73a36f47e424975a9dfa78b7840c Mon Sep 17 00:00:00 2001 From: Kentaro Takeda Date: Thu, 5 Feb 2009 17:18:11 +0900 Subject: Add in_execve flag into task_struct. This patch allows LSM modules to determine whether current process is in an execve operation or not so that they can behave differently while an execve operation is in progress. This patch is needed by TOMOYO. Please see another patch titled "LSM adapter functions." for backgrounds. 
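As a hypothetical sketch of how a security module might consume the new flag (the example_* functions are invented purely for illustration; only current->in_execve comes from this patch):

	static int example_inode_permission(struct inode *inode, int mask)
	{
		/* Apply a different policy while current is inside execve. */
		if (current->in_execve)
			return example_execve_policy(inode, mask);	/* assumed helper */
		return example_normal_policy(inode, mask);		/* assumed helper */
	}
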
Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: James Morris --- fs/compat.c | 3 +++ fs/exec.c | 3 +++ include/linux/sched.h | 2 ++ 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 65a070e705ab..25589f8322f2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1402,6 +1402,7 @@ int compat_do_execve(char * filename, retval = mutex_lock_interruptible(¤t->cred_exec_mutex); if (retval < 0) goto out_free; + current->in_execve = 1; retval = -ENOMEM; bprm->cred = prepare_exec_creds(); @@ -1454,6 +1455,7 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ + current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); free_bprm(bprm); @@ -1470,6 +1472,7 @@ out_file: } out_unlock: + current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); out_free: diff --git a/fs/exec.c b/fs/exec.c index febfd8ed6ad1..9881dc3bb488 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1278,6 +1278,7 @@ int do_execve(char * filename, retval = mutex_lock_interruptible(¤t->cred_exec_mutex); if (retval < 0) goto out_free; + current->in_execve = 1; retval = -ENOMEM; bprm->cred = prepare_exec_creds(); @@ -1331,6 +1332,7 @@ int do_execve(char * filename, goto out; /* execve succeeded */ + current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); free_bprm(bprm); @@ -1349,6 +1351,7 @@ out_file: } out_unlock: + current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); out_free: diff --git a/include/linux/sched.h b/include/linux/sched.h index 2127e959e0f4..397c20cfb6a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1158,6 +1158,8 @@ struct task_struct { /* ??? */ unsigned int personality; unsigned did_exec:1; + unsigned in_execve:1; /* Tell the LSMs that the process is doing an + * execve */ pid_t pid; pid_t tgid; -- cgit v1.2.3-71-gd317 From 5a5fb7dbe88dd57dc2bef0f3be9da991e789612d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Feb 2009 10:53:37 -0500 Subject: preempt-count: force hardirq-count to max of 10 To add a bit in the preempt_count to be set when in NMI context, we found that some archs did not have enough bits to spare. This is due to the hardirq_count being a mask that can hold NR_IRQS. Some archs allow for over 16000 IRQs, and that would require a mask of 14 bits. The sofitrq mask is 8 bits and the preempt disable mask is also 8 bits. The PREEMP_ACTIVE bit is bit 30, and bit 31 would make the preempt_count (which is type int) a negative number. A negative preempt_count is a sign of failure. Add them up 14+8+8+1+1 you get 32 bits. No room for the NMI bit. But the hardirq_count is to track the number of nested IRQs, not the number of total IRQs. This originally took the paranoid approach of setting the max nesting to NR_IRQS. But when we have archs with over 1000 IRQs, it is not practical to think they will ever all nest on a single CPU. Not to mention that this would most definitely cause a stack overflow. This patch sets a max of 10 bits to be used for IRQ nesting. I did a 'git grep HARDIRQ' to examine all users of HARDIRQ_BITS and HARDIRQ_MASK, and found that making it a max of 10 would not hurt anyone. I did find that the m68k expected it to be 8 bits, so I allow for the archs to set the number to be less than 10. I removed the setting of HARDIRQ_BITS from the archs that set it to more than 10. This includes ALPHA, ia64 and avr32. 
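For reference, and assuming an arch keeps the default of 10 hardirq bits, this just restates the masks added below: the preempt_count now splits as

	bits  0- 7   preemption count   PREEMPT_MASK  0x000000ff
	bits  8-15   softirq count      SOFTIRQ_MASK  0x0000ff00
	bits 16-25   hardirq count      HARDIRQ_MASK  0x03ff0000
	bit  26      NMI                NMI_MASK      0x04000000
	bit  28      PREEMPT_ACTIVE

i.e. 8 + 8 + 10 + 1 = 27 counter bits, leaving bit 28 free for PREEMPT_ACTIVE while keeping the (signed int) preempt_count non-negative.
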
This will always allow room for the NMI bit, and if we need to allow for NMI nesting, we have 4 bits to play with. Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/hardirq.h | 13 ----------- arch/avr32/include/asm/hardirq.h | 11 --------- arch/ia64/include/asm/hardirq.h | 10 --------- include/linux/hardirq.h | 48 ++++++++++++++++++++-------------------- 4 files changed, 24 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/hardirq.h b/arch/alpha/include/asm/hardirq.h index d953e234daa8..88971460fa6c 100644 --- a/arch/alpha/include/asm/hardirq.h +++ b/arch/alpha/include/asm/hardirq.h @@ -14,17 +14,4 @@ typedef struct { void ack_bad_irq(unsigned int irq); -#define HARDIRQ_BITS 12 - -/* - * The hardirq mask has to be large enough to have - * space for potentially nestable IRQ sources in the system - * to nest on a single CPU. On Alpha, interrupts are masked at the CPU - * by IPL as well as at the system level. We only have 8 IPLs (UNIX PALcode) - * so we really only have 8 nestable IRQs, but allow some overhead - */ -#if (1 << HARDIRQ_BITS) < 16 -#error HARDIRQ_BITS is too low! -#endif - #endif /* _ALPHA_HARDIRQ_H */ diff --git a/arch/avr32/include/asm/hardirq.h b/arch/avr32/include/asm/hardirq.h index 267354356f60..015bc75ea798 100644 --- a/arch/avr32/include/asm/hardirq.h +++ b/arch/avr32/include/asm/hardirq.h @@ -20,15 +20,4 @@ void ack_bad_irq(unsigned int irq); #endif /* __ASSEMBLY__ */ -#define HARDIRQ_BITS 12 - -/* - * The hardirq mask has to be large enough to have - * space for potentially all IRQ sources in the system - * nesting on a single CPU: - */ -#if (1 << HARDIRQ_BITS) < NR_IRQS -# error HARDIRQ_BITS is too low! -#endif - #endif /* __ASM_AVR32_HARDIRQ_H */ diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h index 140e495b8e0e..d514cd9edb49 100644 --- a/arch/ia64/include/asm/hardirq.h +++ b/arch/ia64/include/asm/hardirq.h @@ -20,16 +20,6 @@ #define local_softirq_pending() (local_cpu_data->softirq_pending) -#define HARDIRQ_BITS 14 - -/* - * The hardirq mask has to be large enough to have space for potentially all IRQ sources - * in the system nesting on a single CPU: - */ -#if (1 << HARDIRQ_BITS) < NR_IRQS -# error HARDIRQ_BITS is too low! -#endif - extern void __iomem *ipi_base_addr; void ack_bad_irq(unsigned int irq); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f3cf86e1465b..9841221f53f2 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -15,61 +15,61 @@ * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * - * The hardirq count can be overridden per architecture, the default is: + * The hardirq count can in theory reach the same as NR_IRQS. + * In reality, the number of nested IRQS is limited to the stack + * size as well. For archs with over 1000 IRQS it is not practical + * to expect that they will all nest. We give a max of 10 bits for + * hardirq nesting. An arch may choose to give less than 10 bits. + * m68k expects it to be 8. * - * - bits 16-27 are the hardirq count (max # of hardirqs: 4096) - * - ( bit 28 is the PREEMPT_ACTIVE flag. 
) + * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024) + * - bit 26 is the NMI_MASK + * - bit 28 is the PREEMPT_ACTIVE flag * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x0fff0000 + * HARDIRQ_MASK: 0x03ff0000 + * NMI_MASK: 0x04000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 +#define NMI_BITS 1 -#ifndef HARDIRQ_BITS -#define HARDIRQ_BITS 12 +#define MAX_HARDIRQ_BITS 10 -#ifndef MAX_HARDIRQS_PER_CPU -#define MAX_HARDIRQS_PER_CPU NR_IRQS +#ifndef HARDIRQ_BITS +# define HARDIRQ_BITS MAX_HARDIRQ_BITS #endif -/* - * The hardirq mask has to be large enough to have space for potentially - * all IRQ sources in the system nesting on a single CPU. - */ -#if (1 << HARDIRQ_BITS) < MAX_HARDIRQS_PER_CPU -# error HARDIRQ_BITS is too low! -#endif +#if HARDIRQ_BITS > MAX_HARDIRQ_BITS +#error HARDIRQ_BITS too high! #endif #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define NMI_OFFSET (1UL << NMI_SHIFT) -#if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS)) +#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #error PREEMPT_ACTIVE is too low! #endif -#define NMI_OFFSET (PREEMPT_ACTIVE << 1) - -#if NMI_OFFSET >= 0x80000000 -#error PREEMPT_ACTIVE too high! -#endif - #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ + | NMI_MASK)) /* * Are we doing bottom half or hardware interrupt processing? @@ -82,7 +82,7 @@ /* * Are we in NMI context? */ -#define in_nmi() (preempt_count() & NMI_OFFSET) +#define in_nmi() (preempt_count() & NMI_MASK) #if defined(CONFIG_PREEMPT) # define PREEMPT_INATOMIC_BASE kernel_locked() -- cgit v1.2.3-71-gd317 From 2a7b8df04c11a70105c1abe67d006455d3bdc944 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Feb 2009 14:16:46 -0500 Subject: sched: do not account for NMIs Impact: avoid corruption in system time accounting Martin Schwidefsky told me that there was an issue with NMIs and system accounting. The problem is that the accounting code is not reentrant, and if an NMI goes off after an interrupt it can corrupt the accounting. For now, the best we can do is to treat NMIs like SMIs and they are not accounted for. This patch changes nmi_enter to not call __irq_enter and to do the preempt-count and tracing calls directly. 
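A hedged sketch of how non-reentrant bookkeeping can guard itself under this scheme (the function below is invented for illustration; only in_nmi() is real, as defined earlier in this series):

	static void example_account_delta(struct task_struct *tsk, cputime_t delta)
	{
		/* NMIs are treated like SMIs: skip the non-reentrant accounting. */
		if (in_nmi())
			return;
		/* ... update accounting state here ... */
	}
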
Signed-off-by: Steven Rostedt --- include/linux/hardirq.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 9841221f53f2..faa1cf848bcd 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -175,24 +175,24 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() \ - do { \ - ftrace_nmi_enter(); \ - BUG_ON(in_nmi()); \ - add_preempt_count(NMI_OFFSET); \ - lockdep_off(); \ - rcu_nmi_enter(); \ - __irq_enter(); \ +#define nmi_enter() \ + do { \ + ftrace_nmi_enter(); \ + BUG_ON(in_nmi()); \ + add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + lockdep_off(); \ + rcu_nmi_enter(); \ + trace_hardirq_enter(); \ } while (0) -#define nmi_exit() \ - do { \ - __irq_exit(); \ - rcu_nmi_exit(); \ - lockdep_on(); \ - BUG_ON(!in_nmi()); \ - sub_preempt_count(NMI_OFFSET); \ - ftrace_nmi_exit(); \ +#define nmi_exit() \ + do { \ + trace_hardirq_exit(); \ + rcu_nmi_exit(); \ + lockdep_on(); \ + BUG_ON(!in_nmi()); \ + sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + ftrace_nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ -- cgit v1.2.3-71-gd317 From 2a5193119269062608582418deba7af82844159a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 10 Feb 2009 21:25:55 +0100 Subject: cfg80211/nl80211: scanning (and mac80211 update to use it) This patch adds basic scan capability to cfg80211/nl80211 and changes mac80211 to use it. The BSS list that cfg80211 maintains is made driver-accessible with a private area in each BSS struct, but mac80211 doesn't yet use it. That's another large project. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-agn.c | 12 +- drivers/net/wireless/iwlwifi/iwl-core.c | 1 + drivers/net/wireless/iwlwifi/iwl-scan.c | 2 +- drivers/net/wireless/iwlwifi/iwl3945-base.c | 17 +- include/linux/nl80211.h | 65 +++ include/net/cfg80211.h | 131 +++++ include/net/mac80211.h | 6 +- include/net/wireless.h | 3 + net/mac80211/cfg.c | 20 + net/mac80211/ieee80211_i.h | 18 +- net/mac80211/iface.c | 2 +- net/mac80211/main.c | 32 +- net/mac80211/mlme.c | 37 +- net/mac80211/scan.c | 356 +++--------- net/mac80211/wext.c | 59 +- net/wireless/Makefile | 2 +- net/wireless/core.c | 8 + net/wireless/core.h | 20 + net/wireless/nl80211.c | 323 +++++++++++ net/wireless/nl80211.h | 8 + net/wireless/scan.c | 807 ++++++++++++++++++++++++++++ 21 files changed, 1546 insertions(+), 383 deletions(-) create mode 100644 net/wireless/scan.c (limited to 'include/linux') diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index c196abc6db7a..539960da7e13 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -2678,11 +2678,19 @@ static void iwl_bss_info_changed(struct ieee80211_hw *hw, } -static int iwl_mac_hw_scan(struct ieee80211_hw *hw, u8 *ssid, size_t ssid_len) +static int iwl_mac_hw_scan(struct ieee80211_hw *hw, + struct cfg80211_scan_request *req) { unsigned long flags; struct iwl_priv *priv = hw->priv; int ret; + u8 *ssid = NULL; + size_t ssid_len = 0; + + if (req->n_ssids) { + ssid = req->ssids[0].ssid; + ssid_len = req->ssids[0].ssid_len; + } IWL_DEBUG_MAC80211(priv, "enter\n"); @@ -2718,7 +2726,7 @@ static int iwl_mac_hw_scan(struct ieee80211_hw *hw, u8 *ssid, size_t ssid_len) if (ssid_len) { priv->one_direct_scan = 1; - priv->direct_ssid_len = min_t(u8, ssid_len, IW_ESSID_MAX_SIZE); + 
priv->direct_ssid_len = ssid_len; memcpy(priv->direct_ssid, ssid, priv->direct_ssid_len); } else { priv->one_direct_scan = 0; diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c index e18c3f326f71..260bf903cb71 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.c +++ b/drivers/net/wireless/iwlwifi/iwl-core.c @@ -1271,6 +1271,7 @@ int iwl_setup_mac(struct iwl_priv *priv) BIT(NL80211_IFTYPE_ADHOC); hw->wiphy->custom_regulatory = true; + hw->wiphy->max_scan_ssids = 1; /* Default value; 4 EDCA QOS priorities */ hw->queues = 4; diff --git a/drivers/net/wireless/iwlwifi/iwl-scan.c b/drivers/net/wireless/iwlwifi/iwl-scan.c index 22bad3ce7d6a..1ec2b20eb37c 100644 --- a/drivers/net/wireless/iwlwifi/iwl-scan.c +++ b/drivers/net/wireless/iwlwifi/iwl-scan.c @@ -860,7 +860,7 @@ void iwl_bg_scan_completed(struct work_struct *work) if (test_bit(STATUS_EXIT_PENDING, &priv->status)) return; - ieee80211_scan_completed(priv->hw); + ieee80211_scan_completed(priv->hw, false); /* Since setting the TXPOWER may have been deferred while * performing the scan, fire one off */ diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 42cc2884971c..0cd8cb96a5ef 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -4442,15 +4442,23 @@ static void iwl3945_bss_info_changed(struct ieee80211_hw *hw, } -static int iwl3945_mac_hw_scan(struct ieee80211_hw *hw, u8 *ssid, size_t len) +static int iwl3945_mac_hw_scan(struct ieee80211_hw *hw, + struct cfg80211_scan_request *req) { int rc = 0; unsigned long flags; struct iwl_priv *priv = hw->priv; + size_t len = 0; + u8 *ssid = NULL; DECLARE_SSID_BUF(ssid_buf); IWL_DEBUG_MAC80211(priv, "enter\n"); + if (req->n_ssids) { + ssid = req->ssids[0].ssid; + len = req->ssids[0].ssid_len; + } + mutex_lock(&priv->mutex); spin_lock_irqsave(&priv->lock, flags); @@ -4478,9 +4486,8 @@ static int iwl3945_mac_hw_scan(struct ieee80211_hw *hw, u8 *ssid, size_t len) print_ssid(ssid_buf, ssid, len), len); priv->one_direct_scan = 1; - priv->direct_ssid_len = (u8) - min((u8) len, (u8) IW_ESSID_MAX_SIZE); - memcpy(priv->direct_ssid, ssid, priv->direct_ssid_len); + priv->direct_ssid_len = len; + memcpy(priv->direct_ssid, ssid, len); } else priv->one_direct_scan = 0; @@ -5412,6 +5419,8 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e hw->wiphy->custom_regulatory = true; + hw->wiphy->max_scan_ssids = 1; + /* 4 EDCA QOS priorities */ hw->queues = 4; diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 4bc27049f4e5..8802d1bda382 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -143,6 +143,13 @@ * added to all specified management frames generated by * kernel/firmware/driver. 
* + * @NL80211_CMD_GET_SCAN: get scan results + * @NL80211_CMD_TRIGGER_SCAN: trigger a new scan with the given parameters + * @NL80211_CMD_NEW_SCAN_RESULTS: scan notification (as a reply to + * NL80211_CMD_GET_SCAN and on the "scan" multicast group) + * @NL80211_CMD_SCAN_ABORTED: scan was aborted, for unspecified reasons, + * partial scan results may be available + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -192,6 +199,11 @@ enum nl80211_commands { NL80211_CMD_GET_REG, + NL80211_CMD_GET_SCAN, + NL80211_CMD_TRIGGER_SCAN, + NL80211_CMD_NEW_SCAN_RESULTS, + NL80211_CMD_SCAN_ABORTED, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -305,6 +317,18 @@ enum nl80211_commands { * @NL80211_ATTR_IE: Information element(s) data (used, e.g., with * %NL80211_CMD_SET_MGMT_EXTRA_IE). * + * @NL80211_ATTR_MAX_NUM_SCAN_SSIDS: number of SSIDs you can scan with + * a single scan request, a wiphy attribute. + * + * @NL80211_ATTR_SCAN_FREQUENCIES: nested attribute with frequencies (in MHz) + * @NL80211_ATTR_SCAN_SSIDS: nested attribute with SSIDs, leave out for passive + * scanning and include a zero-length SSID (wildcard) for wildcard scan + * @NL80211_ATTR_SCAN_GENERATION: the scan generation increases whenever the + * scan result list changes (BSS expired or added) so that applications + * can verify that they got a single, consistent snapshot (when all dump + * messages carried the same generation number) + * @NL80211_ATTR_BSS: scan result BSS + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -372,6 +396,13 @@ enum nl80211_attrs { NL80211_ATTR_MGMT_SUBTYPE, NL80211_ATTR_IE, + NL80211_ATTR_MAX_NUM_SCAN_SSIDS, + + NL80211_ATTR_SCAN_FREQUENCIES, + NL80211_ATTR_SCAN_SSIDS, + NL80211_ATTR_SCAN_GENERATION, + NL80211_ATTR_BSS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -841,4 +872,38 @@ enum nl80211_channel_type { NL80211_CHAN_HT40MINUS, NL80211_CHAN_HT40PLUS }; + +/** + * enum nl80211_bss - netlink attributes for a BSS + * + * @__NL80211_BSS_INVALID: invalid + * @NL80211_BSS_FREQUENCY: frequency in MHz (u32) + * @NL80211_BSS_TSF: TSF of the received probe response/beacon (u64) + * @NL80211_BSS_BEACON_INTERVAL: beacon interval of the (I)BSS (u16) + * @NL80211_BSS_CAPABILITY: capability field (CPU order, u16) + * @NL80211_BSS_INFORMATION_ELEMENTS: binary attribute containing the + * raw information elements from the probe response/beacon (bin) + * @NL80211_BSS_SIGNAL_MBM: signal strength of probe response/beacon + * in mBm (100 * dBm) (s32) + * @NL80211_BSS_SIGNAL_UNSPEC: signal strength of the probe response/beacon + * in unspecified units, scaled to 0..100 (u8) + * @__NL80211_BSS_AFTER_LAST: internal + * @NL80211_BSS_MAX: highest BSS attribute + */ +enum nl80211_bss { + __NL80211_BSS_INVALID, + NL80211_BSS_BSSID, + NL80211_BSS_FREQUENCY, + NL80211_BSS_TSF, + NL80211_BSS_BEACON_INTERVAL, + NL80211_BSS_CAPABILITY, + NL80211_BSS_INFORMATION_ELEMENTS, + NL80211_BSS_SIGNAL_MBM, + NL80211_BSS_SIGNAL_UNSPEC, + + /* keep last */ + __NL80211_BSS_AFTER_LAST, + NL80211_BSS_MAX = __NL80211_BSS_AFTER_LAST - 1 +}; + #endif /* __LINUX_NL80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index dd1fd51638fc..09a0b268e5cf 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4,6 +4,10 @@ #include #include #include +#include +#include +#include +#include #include /* remove once we remove the wext 
stuff */ #include @@ -504,6 +508,83 @@ struct wiphy; /* from net/ieee80211.h */ struct ieee80211_channel; +/** + * struct cfg80211_ssid - SSID description + * @ssid: the SSID + * @ssid_len: length of the ssid + */ +struct cfg80211_ssid { + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; +}; + +/** + * struct cfg80211_scan_request - scan request description + * + * @ssids: SSIDs to scan for (active scan only) + * @n_ssids: number of SSIDs + * @channels: channels to scan on. + * @n_channels: number of channels for each band + * @wiphy: the wiphy this was for + * @ifidx: the interface index + */ +struct cfg80211_scan_request { + struct cfg80211_ssid *ssids; + int n_ssids; + struct ieee80211_channel **channels; + u32 n_channels; + + /* internal */ + struct wiphy *wiphy; + int ifidx; +}; + +/** + * enum cfg80211_signal_type - signal type + * + * @CFG80211_SIGNAL_TYPE_NONE: no signal strength information available + * @CFG80211_SIGNAL_TYPE_MBM: signal strength in mBm (100*dBm) + * @CFG80211_SIGNAL_TYPE_UNSPEC: signal strength, increasing from 0 through 100 + */ +enum cfg80211_signal_type { + CFG80211_SIGNAL_TYPE_NONE, + CFG80211_SIGNAL_TYPE_MBM, + CFG80211_SIGNAL_TYPE_UNSPEC, +}; + +/** + * struct cfg80211_bss - BSS description + * + * This structure describes a BSS (which may also be a mesh network) + * for use in scan results and similar. + * + * @bssid: BSSID of the BSS + * @tsf: timestamp of last received update + * @beacon_interval: the beacon interval as from the frame + * @capability: the capability field in host byte order + * @information_elements: the information elements (Note that there + * is no guarantee that these are well-formed!) + * @len_information_elements: total length of the information elements + * @signal: signal strength value + * @signal_type: signal type + * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes + */ +struct cfg80211_bss { + struct ieee80211_channel *channel; + + u8 bssid[ETH_ALEN]; + u64 tsf; + u16 beacon_interval; + u16 capability; + u8 *information_elements; + size_t len_information_elements; + + s32 signal; + enum cfg80211_signal_type signal_type; + + u8 priv[0] __attribute__((__aligned__(sizeof(void *)))); +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -571,6 +652,11 @@ struct ieee80211_channel; * @set_channel: Set channel * * @set_mgmt_extra_ie: Set extra IE data for management frames + * + * @scan: Request to do a scan. If returning zero, the scan request is given + * the driver, and will be valid until passed to cfg80211_scan_done(). + * For scan results, call cfg80211_inform_bss(); you can call this outside + * the scan/scan_done bracket too. 
*/ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy); @@ -648,6 +734,9 @@ struct cfg80211_ops { int (*set_mgmt_extra_ie)(struct wiphy *wiphy, struct net_device *dev, struct mgmt_extra_ie_params *params); + + int (*scan)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_scan_request *request); }; /* temporary wext handlers */ @@ -658,5 +747,47 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, u32 *mode, char *extra); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, u32 *mode, char *extra); +int cfg80211_wext_siwscan(struct net_device *dev, + struct iw_request_info *info, + union iwreq_data *wrqu, char *extra); +int cfg80211_wext_giwscan(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra); + +/** + * cfg80211_scan_done - notify that scan finished + * + * @request: the corresponding scan request + * @aborted: set to true if the scan was aborted for any reason, + * userspace will be notified of that + */ +void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted); + +/** + * cfg80211_inform_bss - inform cfg80211 of a new BSS + * + * @wiphy: the wiphy reporting the BSS + * @bss: the found BSS + * @gfp: context flags + * + * This informs cfg80211 that BSS information was found and + * the BSS should be updated/added. + */ +struct cfg80211_bss* +cfg80211_inform_bss_frame(struct wiphy *wiphy, + struct ieee80211_channel *channel, + struct ieee80211_mgmt *mgmt, size_t len, + s32 signal, enum cfg80211_signal_type sigtype, + gfp_t gfp); + +struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len); +struct cfg80211_bss *cfg80211_get_mesh(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *meshid, size_t meshidlen, + const u8 *meshcfg); +void cfg80211_put_bss(struct cfg80211_bss *bss); #endif /* __NET_CFG80211_H */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 341f3e595ebd..88fa3e03e3e9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1406,7 +1406,8 @@ struct ieee80211_ops { void (*update_tkip_key)(struct ieee80211_hw *hw, struct ieee80211_key_conf *conf, const u8 *address, u32 iv32, u16 *phase1key); - int (*hw_scan)(struct ieee80211_hw *hw, u8 *ssid, size_t len); + int (*hw_scan)(struct ieee80211_hw *hw, + struct cfg80211_scan_request *req); int (*get_stats)(struct ieee80211_hw *hw, struct ieee80211_low_level_stats *stats); void (*get_tkip_seq)(struct ieee80211_hw *hw, u8 hw_key_idx, @@ -1844,8 +1845,9 @@ void ieee80211_wake_queues(struct ieee80211_hw *hw); * mac80211 that the scan finished. * * @hw: the hardware that finished the scan + * @aborted: set to true if scan was aborted */ -void ieee80211_scan_completed(struct ieee80211_hw *hw); +void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted); /** * ieee80211_iterate_active_interfaces - iterate active interfaces diff --git a/include/net/wireless.h b/include/net/wireless.h index a42c1562d52b..1c6285eb1666 100644 --- a/include/net/wireless.h +++ b/include/net/wireless.h @@ -213,6 +213,9 @@ struct wiphy { bool custom_regulatory; bool strict_regulatory; + int bss_priv_size; + u8 max_scan_ssids; + /* If multiple wiphys are registered and you're handed e.g. 
* a regular netdev with assigned ieee80211_ptr, you won't * know whether it points to a wiphy your driver has registered diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 42d692fd9bec..c8d969be440b 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1277,6 +1277,25 @@ static int ieee80211_resume(struct wiphy *wiphy) #define ieee80211_resume NULL #endif +static int ieee80211_scan(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_scan_request *req) +{ + struct ieee80211_sub_if_data *sdata; + + if (!netif_running(dev)) + return -ENETDOWN; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_ADHOC && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + return ieee80211_request_scan(sdata, req); +} + struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -1309,4 +1328,5 @@ struct cfg80211_ops mac80211_config_ops = { .set_mgmt_extra_ie = ieee80211_set_mgmt_extra_ie, .suspend = ieee80211_suspend, .resume = ieee80211_resume, + .scan = ieee80211_scan, }; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9122416fd6af..cbc0b7d647f9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -294,8 +294,6 @@ struct ieee80211_if_sta { u8 ssid[IEEE80211_MAX_SSID_LEN]; enum ieee80211_sta_mlme_state state; size_t ssid_len; - u8 scan_ssid[IEEE80211_MAX_SSID_LEN]; - size_t scan_ssid_len; u16 aid; u16 ap_capab, capab; u8 *extra_ie; /* to be added to the end of AssocReq */ @@ -658,17 +656,18 @@ struct ieee80211_local { /* Scanning and BSS list */ bool sw_scanning, hw_scanning; + struct cfg80211_ssid scan_ssid; + struct cfg80211_scan_request int_scan_req; + struct cfg80211_scan_request *scan_req; + struct ieee80211_channel *scan_channel; int scan_channel_idx; - enum ieee80211_band scan_band; enum { SCAN_SET_CHANNEL, SCAN_SEND_PROBE } scan_state; unsigned long last_scan_completed; struct delayed_work scan_work; struct ieee80211_sub_if_data *scan_sdata; - struct ieee80211_channel *oper_channel, *scan_channel, *csa_channel; enum nl80211_channel_type oper_channel_type; - u8 scan_ssid[IEEE80211_MAX_SSID_LEN]; - size_t scan_ssid_len; + struct ieee80211_channel *oper_channel, *csa_channel; struct list_head bss_list; struct ieee80211_bss *bss_hash[STA_HASH_SIZE]; spinlock_t bss_lock; @@ -929,7 +928,7 @@ void ieee80211_send_pspoll(struct ieee80211_local *local, /* scan/BSS handling */ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, - u8 *ssid, size_t ssid_len); + struct cfg80211_scan_request *req); int ieee80211_scan_results(struct ieee80211_local *local, struct iw_request_info *info, char *buf, size_t len); @@ -944,14 +943,15 @@ int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local); int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, - u8 *ssid, size_t ssid_len); + struct cfg80211_scan_request *req); struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee802_11_elems *elems, - int freq, bool beacon); + struct ieee80211_channel *channel, + bool beacon); struct ieee80211_bss * ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq, u8 *ssid, u8 ssid_len); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 
1c17fb8e4058..df94b9365264 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -522,7 +522,7 @@ static int ieee80211_stop(struct net_device *dev) * scan event to userspace -- the scan is incomplete. */ if (local->sw_scanning) - ieee80211_scan_completed(&local->hw); + ieee80211_scan_completed(&local->hw, true); } conf.vif = &sdata->vif; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 956afea4214d..954edfbb6b6f 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -733,6 +733,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, return NULL; wiphy->privid = mac80211_wiphy_privid; + wiphy->max_scan_ssids = 4; local = wiphy_priv(wiphy); local->hw.wiphy = wiphy; @@ -817,25 +818,33 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) enum ieee80211_band band; struct net_device *mdev; struct ieee80211_master_priv *mpriv; + int channels, i, j; /* * generic code guarantees at least one band, * set this very early because much code assumes * that hw.conf.channel is assigned */ + channels = 0; for (band = 0; band < IEEE80211_NUM_BANDS; band++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[band]; - if (sband) { + if (sband && !local->oper_channel) { /* init channel we're on */ local->hw.conf.channel = local->oper_channel = local->scan_channel = &sband->channels[0]; - break; } + if (sband) + channels += sband->n_channels; } + local->int_scan_req.n_channels = channels; + local->int_scan_req.channels = kzalloc(sizeof(void *) * channels, GFP_KERNEL); + if (!local->int_scan_req.channels) + return -ENOMEM; + /* if low-level driver supports AP, we also support VLAN */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP)) local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN); @@ -845,7 +854,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) result = wiphy_register(local->hw.wiphy); if (result < 0) - return result; + goto fail_wiphy_register; /* * We use the number of queues for feature tests (QoS, HT) internally @@ -948,6 +957,20 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_led_init(local); + /* alloc internal scan request */ + i = 0; + local->int_scan_req.ssids = &local->scan_ssid; + local->int_scan_req.n_ssids = 1; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (!hw->wiphy->bands[band]) + continue; + for (j = 0; j < hw->wiphy->bands[band]->n_channels; j++) { + local->int_scan_req.channels[i] = + &hw->wiphy->bands[band]->channels[j]; + i++; + } + } + return 0; fail_wep: @@ -966,6 +989,8 @@ fail_workqueue: free_netdev(local->mdev); fail_mdev_alloc: wiphy_unregister(local->hw.wiphy); +fail_wiphy_register: + kfree(local->int_scan_req.channels); return result; } EXPORT_SYMBOL(ieee80211_register_hw); @@ -1011,6 +1036,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) ieee80211_wep_free(local); ieee80211_led_exit(local); free_netdev(local->mdev); + kfree(local->int_scan_req.channels); } EXPORT_SYMBOL(ieee80211_unregister_hw); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index bfc47b330687..46b4817cdea9 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1743,7 +1743,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, } bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, - freq, beacon); + channel, beacon); if (!bss) return; @@ -2162,7 +2162,15 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata, printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other " "IBSS networks with same SSID 
(merge)\n", sdata->dev->name); - ieee80211_request_scan(sdata, ifsta->ssid, ifsta->ssid_len); + + /* XXX maybe racy? */ + if (sdata->local->scan_req) + return; + + memcpy(sdata->local->int_scan_req.ssids[0].ssid, + ifsta->ssid, IEEE80211_MAX_SSID_LEN); + sdata->local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len; + ieee80211_request_scan(sdata, &sdata->local->int_scan_req); } @@ -2378,8 +2386,15 @@ dont_join: IEEE80211_SCAN_INTERVAL)) { printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to " "join\n", sdata->dev->name); - return ieee80211_request_scan(sdata, ifsta->ssid, - ifsta->ssid_len); + + /* XXX maybe racy? */ + if (local->scan_req) + return -EBUSY; + + memcpy(local->int_scan_req.ssids[0].ssid, + ifsta->ssid, IEEE80211_MAX_SSID_LEN); + local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len; + return ieee80211_request_scan(sdata, &local->int_scan_req); } else if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED) { int interval = IEEE80211_SCAN_INTERVAL; @@ -2478,11 +2493,16 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata, } else { if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) { ifsta->assoc_scan_tries++; + /* XXX maybe racy? */ + if (local->scan_req) + return -1; + memcpy(local->int_scan_req.ssids[0].ssid, + ifsta->ssid, IEEE80211_MAX_SSID_LEN); if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) - ieee80211_start_scan(sdata, NULL, 0); + local->int_scan_req.ssids[0].ssid_len = 0; else - ieee80211_start_scan(sdata, ifsta->ssid, - ifsta->ssid_len); + local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len; + ieee80211_start_scan(sdata, &local->int_scan_req); ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request); } else { @@ -2520,8 +2540,7 @@ static void ieee80211_sta_work(struct work_struct *work) ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE && ifsta->state != IEEE80211_STA_MLME_ASSOCIATE && test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) { - ieee80211_start_scan(sdata, ifsta->scan_ssid, - ifsta->scan_ssid_len); + ieee80211_start_scan(sdata, local->scan_req); return; } diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index eddca4e1e13c..c6b275b10cf9 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -13,6 +13,9 @@ */ /* TODO: + * figure out how to avoid that the "current BSS" expires + * clean up IBSS code (in MLME), see why it adds a BSS to the list + * use cfg80211's BSS handling (depends on IBSS TODO above) * order BSS list by RSSI(?) 
("quality of AP") * scan result table filtering (by capability (privacy, IBSS/BSS, WPA/RSN IE, * SSID) @@ -225,10 +228,26 @@ ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_mgmt *mgmt, size_t len, struct ieee802_11_elems *elems, - int freq, bool beacon) + struct ieee80211_channel *channel, + bool beacon) { struct ieee80211_bss *bss; - int clen; + int clen, freq = channel->center_freq; + enum cfg80211_signal_type sigtype = CFG80211_SIGNAL_TYPE_NONE; + s32 signal = 0; + + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { + sigtype = CFG80211_SIGNAL_TYPE_MBM; + signal = rx_status->signal * 100; + } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) { + sigtype = CFG80211_SIGNAL_TYPE_UNSPEC; + signal = (rx_status->signal * 100) / local->hw.max_signal; + } + + cfg80211_put_bss( + cfg80211_inform_bss_frame(local->hw.wiphy, channel, + mgmt, len, signal, sigtype, + GFP_ATOMIC)); #ifdef CONFIG_MAC80211_MESH if (elems->mesh_config) @@ -401,7 +420,7 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bss = ieee80211_bss_info_update(sdata->local, rx_status, mgmt, skb->len, &elems, - freq, beacon); + channel, beacon); if (bss) ieee80211_rx_bss_put(sdata->local, bss); @@ -439,26 +458,22 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, ieee80211_tx_skb(sdata, skb, 0); } -void ieee80211_scan_completed(struct ieee80211_hw *hw) +void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; - union iwreq_data wrqu; if (WARN_ON(!local->hw_scanning && !local->sw_scanning)) return; - local->last_scan_completed = jiffies; - memset(&wrqu, 0, sizeof(wrqu)); + if (WARN_ON(!local->scan_req)) + return; - /* - * local->scan_sdata could have been NULLed by the interface - * down code in case we were scanning on an interface that is - * being taken down. - */ - sdata = local->scan_sdata; - if (sdata) - wireless_send_event(sdata->dev, SIOCGIWSCAN, &wrqu, NULL); + if (local->scan_req != &local->int_scan_req) + cfg80211_scan_done(local->scan_req, aborted); + local->scan_req = NULL; + + local->last_scan_completed = jiffies; if (local->hw_scanning) { local->hw_scanning = false; @@ -520,9 +535,8 @@ void ieee80211_scan_work(struct work_struct *work) struct ieee80211_local *local = container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata = local->scan_sdata; - struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; - int skip; + int skip, i; unsigned long next_delay = 0; /* @@ -533,33 +547,13 @@ void ieee80211_scan_work(struct work_struct *work) switch (local->scan_state) { case SCAN_SET_CHANNEL: - /* - * Get current scan band. scan_band may be IEEE80211_NUM_BANDS - * after we successfully scanned the last channel of the last - * band (and the last band is supported by the hw) - */ - if (local->scan_band < IEEE80211_NUM_BANDS) - sband = local->hw.wiphy->bands[local->scan_band]; - else - sband = NULL; - - /* - * If we are at an unsupported band and have more bands - * left to scan, advance to the next supported one. 
- */ - while (!sband && local->scan_band < IEEE80211_NUM_BANDS - 1) { - local->scan_band++; - sband = local->hw.wiphy->bands[local->scan_band]; - local->scan_channel_idx = 0; - } - /* if no more bands/channels left, complete scan */ - if (!sband || local->scan_channel_idx >= sband->n_channels) { - ieee80211_scan_completed(local_to_hw(local)); + if (local->scan_channel_idx >= local->scan_req->n_channels) { + ieee80211_scan_completed(local_to_hw(local), false); return; } skip = 0; - chan = &sband->channels[local->scan_channel_idx]; + chan = local->scan_req->channels[local->scan_channel_idx]; if (chan->flags & IEEE80211_CHAN_DISABLED || (sdata->vif.type == NL80211_IFTYPE_ADHOC && @@ -575,15 +569,6 @@ void ieee80211_scan_work(struct work_struct *work) /* advance state machine to next channel/band */ local->scan_channel_idx++; - if (local->scan_channel_idx >= sband->n_channels) { - /* - * scan_band may end up == IEEE80211_NUM_BANDS, but - * we'll catch that case above and complete the scan - * if that is the case. - */ - local->scan_band++; - local->scan_channel_idx = 0; - } if (skip) break; @@ -596,10 +581,14 @@ void ieee80211_scan_work(struct work_struct *work) next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; local->scan_state = SCAN_SET_CHANNEL; - if (local->scan_channel->flags & IEEE80211_CHAN_PASSIVE_SCAN) + if (local->scan_channel->flags & IEEE80211_CHAN_PASSIVE_SCAN || + !local->scan_req->n_ssids) break; - ieee80211_send_probe_req(sdata, NULL, local->scan_ssid, - local->scan_ssid_len); + for (i = 0; i < local->scan_req->n_ssids; i++) + ieee80211_send_probe_req( + sdata, NULL, + local->scan_req->ssids[i].ssid, + local->scan_req->ssids[i].ssid_len); next_delay = IEEE80211_CHANNEL_TIME; break; } @@ -610,14 +599,19 @@ void ieee80211_scan_work(struct work_struct *work) int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, - u8 *ssid, size_t ssid_len) + struct cfg80211_scan_request *req) { struct ieee80211_local *local = scan_sdata->local; struct ieee80211_sub_if_data *sdata; - if (ssid_len > IEEE80211_MAX_SSID_LEN) + if (!req) return -EINVAL; + if (local->scan_req && local->scan_req != req) + return -EBUSY; + + local->scan_req = req; + /* MLME-SCAN.request (page 118) page 144 (11.1.3.1) * BSSType: INFRASTRUCTURE, INDEPENDENT, ANY_BSS * BSSID: MACAddress @@ -645,7 +639,7 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, int rc; local->hw_scanning = true; - rc = local->ops->hw_scan(local_to_hw(local), ssid, ssid_len); + rc = local->ops->hw_scan(local_to_hw(local), req); if (rc) { local->hw_scanning = false; return rc; @@ -678,15 +672,10 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, } mutex_unlock(&local->iflist_mtx); - if (ssid) { - local->scan_ssid_len = ssid_len; - memcpy(local->scan_ssid, ssid, ssid_len); - } else - local->scan_ssid_len = 0; local->scan_state = SCAN_SET_CHANNEL; local->scan_channel_idx = 0; - local->scan_band = IEEE80211_BAND_2GHZ; local->scan_sdata = scan_sdata; + local->scan_req = req; netif_addr_lock_bh(local->mdev); local->filter_flags |= FIF_BCN_PRBRESP_PROMISC; @@ -706,13 +695,21 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, - u8 *ssid, size_t ssid_len) + struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_sta *ifsta; + if (!req) + return -EINVAL; + + if (local->scan_req && local->scan_req != req) + return -EBUSY; + + local->scan_req = req; + if (sdata->vif.type != 
NL80211_IFTYPE_STATION) - return ieee80211_start_scan(sdata, ssid, ssid_len); + return ieee80211_start_scan(sdata, req); /* * STA has a state machine that might need to defer scanning @@ -727,241 +724,8 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, } ifsta = &sdata->u.sta; - - ifsta->scan_ssid_len = ssid_len; - if (ssid_len) - memcpy(ifsta->scan_ssid, ssid, ssid_len); set_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request); queue_work(local->hw.workqueue, &ifsta->work); return 0; } - - -static void ieee80211_scan_add_ies(struct iw_request_info *info, - struct ieee80211_bss *bss, - char **current_ev, char *end_buf) -{ - u8 *pos, *end, *next; - struct iw_event iwe; - - if (bss == NULL || bss->ies == NULL) - return; - - /* - * If needed, fragment the IEs buffer (at IE boundaries) into short - * enough fragments to fit into IW_GENERIC_IE_MAX octet messages. - */ - pos = bss->ies; - end = pos + bss->ies_len; - - while (end - pos > IW_GENERIC_IE_MAX) { - next = pos + 2 + pos[1]; - while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX) - next = next + 2 + next[1]; - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVGENIE; - iwe.u.data.length = next - pos; - *current_ev = iwe_stream_add_point(info, *current_ev, - end_buf, &iwe, pos); - - pos = next; - } - - if (end > pos) { - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVGENIE; - iwe.u.data.length = end - pos; - *current_ev = iwe_stream_add_point(info, *current_ev, - end_buf, &iwe, pos); - } -} - - -static char * -ieee80211_scan_result(struct ieee80211_local *local, - struct iw_request_info *info, - struct ieee80211_bss *bss, - char *current_ev, char *end_buf) -{ - struct iw_event iwe; - char *buf; - - if (time_after(jiffies, - bss->last_update + IEEE80211_SCAN_RESULT_EXPIRE)) - return current_ev; - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWAP; - iwe.u.ap_addr.sa_family = ARPHRD_ETHER; - memcpy(iwe.u.ap_addr.sa_data, bss->bssid, ETH_ALEN); - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_ADDR_LEN); - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWESSID; - if (bss_mesh_cfg(bss)) { - iwe.u.data.length = bss_mesh_id_len(bss); - iwe.u.data.flags = 1; - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, bss_mesh_id(bss)); - } else { - iwe.u.data.length = bss->ssid_len; - iwe.u.data.flags = 1; - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, bss->ssid); - } - - if (bss->capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) - || bss_mesh_cfg(bss)) { - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWMODE; - if (bss_mesh_cfg(bss)) - iwe.u.mode = IW_MODE_MESH; - else if (bss->capability & WLAN_CAPABILITY_ESS) - iwe.u.mode = IW_MODE_MASTER; - else - iwe.u.mode = IW_MODE_ADHOC; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, - &iwe, IW_EV_UINT_LEN); - } - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWFREQ; - iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq); - iwe.u.freq.e = 0; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_FREQ_LEN); - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWFREQ; - iwe.u.freq.m = bss->freq; - iwe.u.freq.e = 6; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_FREQ_LEN); - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVQUAL; - iwe.u.qual.qual = bss->qual; - iwe.u.qual.level = bss->signal; - iwe.u.qual.noise = bss->noise; - iwe.u.qual.updated = local->wstats_flags; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_QUAL_LEN); - - 
memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWENCODE; - if (bss->capability & WLAN_CAPABILITY_PRIVACY) - iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY; - else - iwe.u.data.flags = IW_ENCODE_DISABLED; - iwe.u.data.length = 0; - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, ""); - - ieee80211_scan_add_ies(info, bss, ¤t_ev, end_buf); - - if (bss->supp_rates_len > 0) { - /* display all supported rates in readable format */ - char *p = current_ev + iwe_stream_lcp_len(info); - int i; - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWRATE; - /* Those two flags are ignored... */ - iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0; - - for (i = 0; i < bss->supp_rates_len; i++) { - iwe.u.bitrate.value = ((bss->supp_rates[i] & - 0x7f) * 500000); - p = iwe_stream_add_value(info, current_ev, p, - end_buf, &iwe, IW_EV_PARAM_LEN); - } - current_ev = p; - } - - buf = kmalloc(30, GFP_ATOMIC); - if (buf) { - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->timestamp)); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, buf); - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, " Last beacon: %dms ago", - jiffies_to_msecs(jiffies - bss->last_update)); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, &iwe, buf); - kfree(buf); - } - - if (bss_mesh_cfg(bss)) { - u8 *cfg = bss_mesh_cfg(bss); - buf = kmalloc(50, GFP_ATOMIC); - if (buf) { - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, "Mesh network (version %d)", cfg[0]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - sprintf(buf, "Path Selection Protocol ID: " - "0x%02X%02X%02X%02X", cfg[1], cfg[2], cfg[3], - cfg[4]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - sprintf(buf, "Path Selection Metric ID: " - "0x%02X%02X%02X%02X", cfg[5], cfg[6], cfg[7], - cfg[8]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - sprintf(buf, "Congestion Control Mode ID: " - "0x%02X%02X%02X%02X", cfg[9], cfg[10], - cfg[11], cfg[12]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - sprintf(buf, "Channel Precedence: " - "0x%02X%02X%02X%02X", cfg[13], cfg[14], - cfg[15], cfg[16]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - kfree(buf); - } - } - - return current_ev; -} - - -int ieee80211_scan_results(struct ieee80211_local *local, - struct iw_request_info *info, - char *buf, size_t len) -{ - char *current_ev = buf; - char *end_buf = buf + len; - struct ieee80211_bss *bss; - - spin_lock_bh(&local->bss_lock); - list_for_each_entry(bss, &local->bss_list, list) { - if (buf + len - current_ev <= IW_EV_ADDR_LEN) { - spin_unlock_bh(&local->bss_lock); - return -E2BIG; - } - current_ev = ieee80211_scan_result(local, info, bss, - current_ev, end_buf); - } - spin_unlock_bh(&local->bss_lock); - return current_ev - buf; -} diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index acd5808b87f4..b337d7d5edb3 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -173,8 +173,9 @@ static int ieee80211_ioctl_giwrange(struct net_device *dev, range->num_encoding_sizes = 2; range->max_encoding_tokens = 
NUM_DEFAULT_KEYS; + /* cfg80211 requires this, and enforces 0..100 */ if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) - range->max_qual.level = local->hw.max_signal; + range->max_qual.level = 100; else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) range->max_qual.level = -110; else @@ -415,58 +416,6 @@ static int ieee80211_ioctl_giwap(struct net_device *dev, } -static int ieee80211_ioctl_siwscan(struct net_device *dev, - struct iw_request_info *info, - union iwreq_data *wrqu, char *extra) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - struct iw_scan_req *req = NULL; - u8 *ssid = NULL; - size_t ssid_len = 0; - - if (!netif_running(dev)) - return -ENETDOWN; - - if (sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_ADHOC && - sdata->vif.type != NL80211_IFTYPE_MESH_POINT) - return -EOPNOTSUPP; - - /* if SSID was specified explicitly then use that */ - if (wrqu->data.length == sizeof(struct iw_scan_req) && - wrqu->data.flags & IW_SCAN_THIS_ESSID) { - req = (struct iw_scan_req *)extra; - ssid = req->essid; - ssid_len = req->essid_len; - } - - return ieee80211_request_scan(sdata, ssid, ssid_len); -} - - -static int ieee80211_ioctl_giwscan(struct net_device *dev, - struct iw_request_info *info, - struct iw_point *data, char *extra) -{ - int res; - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - struct ieee80211_sub_if_data *sdata; - - sdata = IEEE80211_DEV_TO_SUB_IF(dev); - - if (local->sw_scanning || local->hw_scanning) - return -EAGAIN; - - res = ieee80211_scan_results(local, info, extra, data->length); - if (res >= 0) { - data->length = res; - return 0; - } - data->length = 0; - return res; -} - - static int ieee80211_ioctl_siwrate(struct net_device *dev, struct iw_request_info *info, struct iw_param *rate, char *extra) @@ -1165,8 +1114,8 @@ static const iw_handler ieee80211_handler[] = (iw_handler) ieee80211_ioctl_giwap, /* SIOCGIWAP */ (iw_handler) ieee80211_ioctl_siwmlme, /* SIOCSIWMLME */ (iw_handler) NULL, /* SIOCGIWAPLIST */ - (iw_handler) ieee80211_ioctl_siwscan, /* SIOCSIWSCAN */ - (iw_handler) ieee80211_ioctl_giwscan, /* SIOCGIWSCAN */ + (iw_handler) cfg80211_wext_siwscan, /* SIOCSIWSCAN */ + (iw_handler) cfg80211_wext_giwscan, /* SIOCGIWSCAN */ (iw_handler) ieee80211_ioctl_siwessid, /* SIOCSIWESSID */ (iw_handler) ieee80211_ioctl_giwessid, /* SIOCGIWESSID */ (iw_handler) NULL, /* SIOCSIWNICKN */ diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 938a334c8dbc..dad43c24f695 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_LIB80211_CRYPT_WEP) += lib80211_crypt_wep.o obj-$(CONFIG_LIB80211_CRYPT_CCMP) += lib80211_crypt_ccmp.o obj-$(CONFIG_LIB80211_CRYPT_TKIP) += lib80211_crypt_tkip.o -cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o +cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o cfg80211-$(CONFIG_WIRELESS_EXT) += wext-compat.o cfg80211-$(CONFIG_NL80211) += nl80211.o diff --git a/net/wireless/core.c b/net/wireless/core.c index 125226476089..3cccd1390cea 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -240,6 +240,8 @@ struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv) mutex_init(&drv->mtx); mutex_init(&drv->devlist_mtx); INIT_LIST_HEAD(&drv->netdev_list); + spin_lock_init(&drv->bss_lock); + INIT_LIST_HEAD(&drv->bss_list); device_initialize(&drv->wiphy.dev); drv->wiphy.dev.class = &ieee80211_class; @@ -259,6 +261,9 @@ int wiphy_register(struct wiphy *wiphy) int i; u16 ifmodes = wiphy->interface_modes; + if 
(WARN_ON(wiphy->max_scan_ssids < 1)) + return -EINVAL; + /* sanity check ifmodes */ WARN_ON(!ifmodes); ifmodes &= ((1 << __NL80211_IFTYPE_AFTER_LAST) - 1) & ~1; @@ -367,8 +372,11 @@ EXPORT_SYMBOL(wiphy_unregister); void cfg80211_dev_free(struct cfg80211_registered_device *drv) { + struct cfg80211_internal_bss *scan, *tmp; mutex_destroy(&drv->mtx); mutex_destroy(&drv->devlist_mtx); + list_for_each_entry_safe(scan, tmp, &drv->bss_list, list) + kfree(scan); kfree(drv); } diff --git a/net/wireless/core.h b/net/wireless/core.h index f7fb9f413028..e29ad4cd464f 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -41,6 +43,13 @@ struct cfg80211_registered_device { struct mutex devlist_mtx; struct list_head netdev_list; + /* BSSes/scanning */ + spinlock_t bss_lock; + struct list_head bss_list; + struct rb_root bss_tree; + u32 bss_generation; + struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wiphy wiphy __attribute__((__aligned__(NETDEV_ALIGN))); @@ -56,6 +65,15 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy) extern struct mutex cfg80211_drv_mutex; extern struct list_head cfg80211_drv_list; +struct cfg80211_internal_bss { + struct list_head list; + struct rb_node rbn; + unsigned long ts; + struct kref ref; + /* must be last because of priv member */ + struct cfg80211_bss pub; +}; + /* * This function returns a pointer to the driver * that the genl_info item that is passed refers to. @@ -94,4 +112,6 @@ extern int cfg80211_dev_rename(struct cfg80211_registered_device *drv, void ieee80211_set_bitrate_flags(struct wiphy *wiphy); void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby); +void cfg80211_bss_expire(struct cfg80211_registered_device *dev); + #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d452396006ee..298a4de59948 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -109,6 +110,8 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, [NL80211_ATTR_IE] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED }, + [NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED }, }; /* message building helper */ @@ -141,6 +144,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->idx); NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy)); + NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, + dev->wiphy.max_scan_ssids); nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES); if (!nl_modes) @@ -2270,6 +2275,246 @@ static int nl80211_set_mgmt_extra_ie(struct sk_buff *skb, return err; } +static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + struct net_device *dev; + struct cfg80211_scan_request *request; + struct cfg80211_ssid *ssid; + struct ieee80211_channel *channel; + struct nlattr *attr; + struct wiphy *wiphy; + int err, tmp, n_ssids = 0, n_channels = 0, i; + enum ieee80211_band band; + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + return err; + + wiphy = 
&drv->wiphy; + + if (!drv->ops->scan) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + + if (drv->scan_req) { + err = -EBUSY; + goto out_unlock; + } + + if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) + n_channels++; + if (!n_channels) { + err = -EINVAL; + goto out_unlock; + } + } else { + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + if (wiphy->bands[band]) + n_channels += wiphy->bands[band]->n_channels; + } + + if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) + n_ssids++; + + if (n_ssids > wiphy->max_scan_ssids) { + err = -EINVAL; + goto out_unlock; + } + + request = kzalloc(sizeof(*request) + + sizeof(*ssid) * n_ssids + + sizeof(channel) * n_channels, GFP_KERNEL); + if (!request) { + err = -ENOMEM; + goto out_unlock; + } + + request->channels = (void *)((char *)request + sizeof(*request)); + request->n_channels = n_channels; + if (n_ssids) + request->ssids = (void *)(request->channels + n_channels); + request->n_ssids = n_ssids; + + if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + /* user specified, bail out if channel not found */ + request->n_channels = n_channels; + i = 0; + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) { + request->channels[i] = ieee80211_get_channel(wiphy, nla_get_u32(attr)); + if (!request->channels[i]) { + err = -EINVAL; + goto out_free; + } + i++; + } + } else { + /* all channels */ + i = 0; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + int j; + if (!wiphy->bands[band]) + continue; + for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + request->channels[i] = &wiphy->bands[band]->channels[j]; + i++; + } + } + } + + i = 0; + if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) { + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) { + if (request->ssids[i].ssid_len > IEEE80211_MAX_SSID_LEN) { + err = -EINVAL; + goto out_free; + } + memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr)); + request->ssids[i].ssid_len = nla_len(attr); + i++; + } + } + + request->ifidx = dev->ifindex; + request->wiphy = &drv->wiphy; + + drv->scan_req = request; + err = drv->ops->scan(&drv->wiphy, dev, request); + + out_free: + if (err) { + drv->scan_req = NULL; + kfree(request); + } + out_unlock: + rtnl_unlock(); + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags, + struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_bss *res) +{ + void *hdr; + struct nlattr *bss; + + hdr = nl80211hdr_put(msg, pid, seq, flags, + NL80211_CMD_NEW_SCAN_RESULTS); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_SCAN_GENERATION, + rdev->bss_generation); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + + bss = nla_nest_start(msg, NL80211_ATTR_BSS); + if (!bss) + goto nla_put_failure; + if (!is_zero_ether_addr(res->bssid)) + NLA_PUT(msg, NL80211_BSS_BSSID, ETH_ALEN, res->bssid); + if (res->information_elements && res->len_information_elements) + NLA_PUT(msg, NL80211_BSS_INFORMATION_ELEMENTS, + res->len_information_elements, + res->information_elements); + if (res->tsf) + NLA_PUT_U64(msg, NL80211_BSS_TSF, res->tsf); + if (res->beacon_interval) + NLA_PUT_U16(msg, NL80211_BSS_BEACON_INTERVAL, res->beacon_interval); + NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability); + NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq); + + 
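	/*
	 * Only one of the two signal attributes is emitted, depending on
	 * what the driver reported: NL80211_BSS_SIGNAL_MBM carries an s32
	 * in mBm (100 * dBm), so a -65 dBm probe response is exported as
	 * -6500, while NL80211_BSS_SIGNAL_UNSPEC carries a driver-scaled
	 * 0..100 value.  Userspace should use whichever one is present.
	 */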
switch (res->signal_type) { + case CFG80211_SIGNAL_TYPE_MBM: + NLA_PUT_U32(msg, NL80211_BSS_SIGNAL_MBM, res->signal); + break; + case CFG80211_SIGNAL_TYPE_UNSPEC: + NLA_PUT_U8(msg, NL80211_BSS_SIGNAL_UNSPEC, res->signal); + break; + default: + break; + } + + nla_nest_end(msg, bss); + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nl80211_dump_scan(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct cfg80211_registered_device *dev; + struct net_device *netdev; + struct cfg80211_internal_bss *scan; + int ifidx = cb->args[0]; + int start = cb->args[1], idx = 0; + int err; + + if (!ifidx) { + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, + nl80211_fam.attrbuf, nl80211_fam.maxattr, + nl80211_policy); + if (err) + return err; + + if (!nl80211_fam.attrbuf[NL80211_ATTR_IFINDEX]) + return -EINVAL; + + ifidx = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_IFINDEX]); + if (!ifidx) + return -EINVAL; + cb->args[0] = ifidx; + } + + netdev = dev_get_by_index(&init_net, ifidx); + if (!netdev) + return -ENODEV; + + dev = cfg80211_get_dev_from_ifindex(ifidx); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out_put_netdev; + } + + spin_lock_bh(&dev->bss_lock); + cfg80211_bss_expire(dev); + + list_for_each_entry(scan, &dev->bss_list, list) { + if (++idx <= start) + continue; + if (nl80211_send_bss(skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + dev, netdev, &scan->pub) < 0) { + idx--; + goto out; + } + } + + out: + spin_unlock_bh(&dev->bss_lock); + + cb->args[1] = idx; + err = skb->len; + cfg80211_put_dev(dev); + out_put_netdev: + dev_put(netdev); + + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -2443,12 +2688,26 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_TRIGGER_SCAN, + .doit = nl80211_trigger_scan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_GET_SCAN, + .policy = nl80211_policy, + .dumpit = nl80211_dump_scan, + }, }; /* multicast groups */ static struct genl_multicast_group nl80211_config_mcgrp = { .name = "config", }; +static struct genl_multicast_group nl80211_scan_mcgrp = { + .name = "scan", +}; /* notification functions */ @@ -2468,6 +2727,66 @@ void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev) genlmsg_multicast(msg, 0, nl80211_config_mcgrp.id, GFP_KERNEL); } +static int nl80211_send_scan_donemsg(struct sk_buff *msg, + struct cfg80211_registered_device *rdev, + struct net_device *netdev, + u32 pid, u32 seq, int flags, + u32 cmd) +{ + void *hdr; + + hdr = nl80211hdr_put(msg, pid, seq, flags, cmd); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + + /* XXX: we should probably bounce back the request? 
*/ + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_scan_donemsg(msg, rdev, netdev, 0, 0, 0, + NL80211_CMD_NEW_SCAN_RESULTS) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL); +} + +void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_scan_donemsg(msg, rdev, netdev, 0, 0, 0, + NL80211_CMD_SCAN_ABORTED) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL); +} + /* initialisation/exit functions */ int nl80211_init(void) @@ -2488,6 +2807,10 @@ int nl80211_init(void) if (err) goto err_out; + err = genl_register_mc_group(&nl80211_fam, &nl80211_scan_mcgrp); + if (err) + goto err_out; + return 0; err_out: genl_unregister_family(&nl80211_fam); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index f3ea5c029aee..b565a5f84e97 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -7,6 +7,10 @@ extern int nl80211_init(void); extern void nl80211_exit(void); extern void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev); +extern void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, + struct net_device *netdev); +extern void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, + struct net_device *netdev); #else static inline int nl80211_init(void) { @@ -19,6 +23,10 @@ static inline void nl80211_notify_dev_rename( struct cfg80211_registered_device *rdev) { } +static inline void +nl80211_send_scan_done(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{} #endif /* CONFIG_NL80211 */ #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/scan.c b/net/wireless/scan.c new file mode 100644 index 000000000000..009d12810c55 --- /dev/null +++ b/net/wireless/scan.c @@ -0,0 +1,807 @@ +/* + * cfg80211 scan result handling + * + * Copyright 2008 Johannes Berg + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "core.h" +#include "nl80211.h" + +#define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ) + +void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted) +{ + struct net_device *dev; +#ifdef CONFIG_WIRELESS_EXT + union iwreq_data wrqu; +#endif + + dev = dev_get_by_index(&init_net, request->ifidx); + if (!dev) + goto out; + + WARN_ON(request != wiphy_to_dev(request->wiphy)->scan_req); + wiphy_to_dev(request->wiphy)->scan_req = NULL; + + if (aborted) + nl80211_send_scan_aborted(wiphy_to_dev(request->wiphy), dev); + else + nl80211_send_scan_done(wiphy_to_dev(request->wiphy), dev); + +#ifdef CONFIG_WIRELESS_EXT + if (!aborted) { + memset(&wrqu, 0, sizeof(wrqu)); + + wireless_send_event(dev, SIOCGIWSCAN, &wrqu, NULL); + } +#endif + + dev_put(dev); + + out: + kfree(request); +} +EXPORT_SYMBOL(cfg80211_scan_done); + +static void bss_release(struct kref *ref) +{ + struct cfg80211_internal_bss *bss; + + bss = container_of(ref, struct cfg80211_internal_bss, ref); + kfree(bss); +} + +/* must hold dev->bss_lock! 
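 *
 * A typical caller therefore looks like this (sketch; nl80211_dump_scan
 * and the wext result dump later in this patch follow the same pattern):
 *
 *	spin_lock_bh(&dev->bss_lock);
 *	cfg80211_bss_expire(dev);
 *	list_for_each_entry(bss, &dev->bss_list, list)
 *		... emit or format bss->pub ...
 *	spin_unlock_bh(&dev->bss_lock);
 *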
*/ +void cfg80211_bss_expire(struct cfg80211_registered_device *dev) +{ + struct cfg80211_internal_bss *bss, *tmp; + bool expired = false; + + list_for_each_entry_safe(bss, tmp, &dev->bss_list, list) { + if (!time_after(jiffies, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE)) + continue; + list_del(&bss->list); + rb_erase(&bss->rbn, &dev->bss_tree); + kref_put(&bss->ref, bss_release); + expired = true; + } + + if (expired) + dev->bss_generation++; +} + +static u8 *find_ie(u8 num, u8 *ies, size_t len) +{ + while (len > 2 && ies[0] != num) { + len -= ies[1] + 2; + ies += ies[1] + 2; + } + if (len < 2) + return NULL; + if (len < 2 + ies[1]) + return NULL; + return ies; +} + +static int cmp_ies(u8 num, u8 *ies1, size_t len1, u8 *ies2, size_t len2) +{ + const u8 *ie1 = find_ie(num, ies1, len1); + const u8 *ie2 = find_ie(num, ies2, len2); + int r; + + if (!ie1 && !ie2) + return 0; + if (!ie1) + return -1; + + r = memcmp(ie1 + 2, ie2 + 2, min(ie1[1], ie2[1])); + if (r == 0 && ie1[1] != ie2[1]) + return ie2[1] - ie1[1]; + return r; +} + +static bool is_bss(struct cfg80211_bss *a, + const u8 *bssid, + const u8 *ssid, size_t ssid_len) +{ + const u8 *ssidie; + + if (compare_ether_addr(a->bssid, bssid)) + return false; + + ssidie = find_ie(WLAN_EID_SSID, + a->information_elements, + a->len_information_elements); + if (!ssidie) + return false; + if (ssidie[1] != ssid_len) + return false; + return memcmp(ssidie + 2, ssid, ssid_len) == 0; +} + +static bool is_mesh(struct cfg80211_bss *a, + const u8 *meshid, size_t meshidlen, + const u8 *meshcfg) +{ + const u8 *ie; + + if (!is_zero_ether_addr(a->bssid)) + return false; + + ie = find_ie(WLAN_EID_MESH_ID, + a->information_elements, + a->len_information_elements); + if (!ie) + return false; + if (ie[1] != meshidlen) + return false; + if (memcmp(ie + 2, meshid, meshidlen)) + return false; + + ie = find_ie(WLAN_EID_MESH_CONFIG, + a->information_elements, + a->len_information_elements); + if (ie[1] != IEEE80211_MESH_CONFIG_LEN) + return false; + + /* + * Ignore mesh capability (last two bytes of the IE) when + * comparing since that may differ between stations taking + * part in the same mesh. 
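 *
 * (For reference, the wext formatting code later in this patch decodes
 * this IE as cfg[0] version, cfg[1..4] path selection protocol ID,
 * cfg[5..8] path selection metric ID, cfg[9..12] congestion control
 * mode ID and cfg[13..16] channel precedence, with the trailing two
 * bytes being the capability field that the comparison below skips.)
 *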
+ */ + return memcmp(ie + 2, meshcfg, IEEE80211_MESH_CONFIG_LEN - 2) == 0; +} + +static int cmp_bss(struct cfg80211_bss *a, + struct cfg80211_bss *b) +{ + int r; + + if (a->channel != b->channel) + return b->channel->center_freq - a->channel->center_freq; + + r = memcmp(a->bssid, b->bssid, ETH_ALEN); + if (r) + return r; + + if (is_zero_ether_addr(a->bssid)) { + r = cmp_ies(WLAN_EID_MESH_ID, + a->information_elements, + a->len_information_elements, + b->information_elements, + b->len_information_elements); + if (r) + return r; + return cmp_ies(WLAN_EID_MESH_CONFIG, + a->information_elements, + a->len_information_elements, + b->information_elements, + b->len_information_elements); + } + + return cmp_ies(WLAN_EID_SSID, + a->information_elements, + a->len_information_elements, + b->information_elements, + b->len_information_elements); +} + +struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len) +{ + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_internal_bss *bss, *res = NULL; + + spin_lock_bh(&dev->bss_lock); + + list_for_each_entry(bss, &dev->bss_list, list) { + if (channel && bss->pub.channel != channel) + continue; + if (is_bss(&bss->pub, bssid, ssid, ssid_len)) { + res = bss; + kref_get(&res->ref); + break; + } + } + + spin_unlock_bh(&dev->bss_lock); + if (!res) + return NULL; + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_get_bss); + +struct cfg80211_bss *cfg80211_get_mesh(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *meshid, size_t meshidlen, + const u8 *meshcfg) +{ + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_internal_bss *bss, *res = NULL; + + spin_lock_bh(&dev->bss_lock); + + list_for_each_entry(bss, &dev->bss_list, list) { + if (channel && bss->pub.channel != channel) + continue; + if (is_mesh(&bss->pub, meshid, meshidlen, meshcfg)) { + res = bss; + kref_get(&res->ref); + break; + } + } + + spin_unlock_bh(&dev->bss_lock); + if (!res) + return NULL; + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_get_mesh); + + +static void rb_insert_bss(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *bss) +{ + struct rb_node **p = &dev->bss_tree.rb_node; + struct rb_node *parent = NULL; + struct cfg80211_internal_bss *tbss; + int cmp; + + while (*p) { + parent = *p; + tbss = rb_entry(parent, struct cfg80211_internal_bss, rbn); + + cmp = cmp_bss(&bss->pub, &tbss->pub); + + if (WARN_ON(!cmp)) { + /* will sort of leak this BSS */ + return; + } + + if (cmp < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&bss->rbn, parent, p); + rb_insert_color(&bss->rbn, &dev->bss_tree); +} + +static struct cfg80211_internal_bss * +rb_find_bss(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *res) +{ + struct rb_node *n = dev->bss_tree.rb_node; + struct cfg80211_internal_bss *bss; + int r; + + while (n) { + bss = rb_entry(n, struct cfg80211_internal_bss, rbn); + r = cmp_bss(&res->pub, &bss->pub); + + if (r == 0) + return bss; + else if (r < 0) + n = n->rb_left; + else + n = n->rb_right; + } + + return NULL; +} + +static struct cfg80211_internal_bss * +cfg80211_bss_update(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *res, + bool overwrite) +{ + struct cfg80211_internal_bss *found = NULL; + const u8 *meshid, *meshcfg; + + /* + * The reference to "res" is donated to this function. 
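 *
 * In kref terms the lifecycle is roughly (sketch):
 *
 *	res = kzalloc(...);
 *	kref_init(&res->ref);			ref = 1, held by the caller
 *	found = cfg80211_bss_update(dev, res, ...);
 *						res's reference is either handed
 *						to the BSS list or dropped;
 *						found gets an extra kref_get()
 *						for the caller
 *	...
 *	cfg80211_put_bss(&found->pub);		caller drops its reference
 *
 * which is why cfg80211_inform_bss_frame() below can simply return the
 * result and let its caller do a single cfg80211_put_bss().
 *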
+ */ + + if (WARN_ON(!res->pub.channel)) { + kref_put(&res->ref, bss_release); + return NULL; + } + + res->ts = jiffies; + + if (is_zero_ether_addr(res->pub.bssid)) { + /* must be mesh, verify */ + meshid = find_ie(WLAN_EID_MESH_ID, res->pub.information_elements, + res->pub.len_information_elements); + meshcfg = find_ie(WLAN_EID_MESH_CONFIG, + res->pub.information_elements, + res->pub.len_information_elements); + if (!meshid || !meshcfg || + meshcfg[1] != IEEE80211_MESH_CONFIG_LEN) { + /* bogus mesh */ + kref_put(&res->ref, bss_release); + return NULL; + } + } + + spin_lock_bh(&dev->bss_lock); + + found = rb_find_bss(dev, res); + + if (found && overwrite) { + list_replace(&found->list, &res->list); + rb_replace_node(&found->rbn, &res->rbn, + &dev->bss_tree); + kref_put(&found->ref, bss_release); + found = res; + } else if (found) { + kref_get(&found->ref); + found->pub.beacon_interval = res->pub.beacon_interval; + found->pub.tsf = res->pub.tsf; + found->pub.signal = res->pub.signal; + found->pub.signal_type = res->pub.signal_type; + found->pub.capability = res->pub.capability; + found->ts = res->ts; + kref_put(&res->ref, bss_release); + } else { + /* this "consumes" the reference */ + list_add_tail(&res->list, &dev->bss_list); + rb_insert_bss(dev, res); + found = res; + } + + dev->bss_generation++; + spin_unlock_bh(&dev->bss_lock); + + kref_get(&found->ref); + return found; +} + +struct cfg80211_bss * +cfg80211_inform_bss_frame(struct wiphy *wiphy, + struct ieee80211_channel *channel, + struct ieee80211_mgmt *mgmt, size_t len, + s32 signal, enum cfg80211_signal_type sigtype, + gfp_t gfp) +{ + struct cfg80211_internal_bss *res; + size_t ielen = len - offsetof(struct ieee80211_mgmt, + u.probe_resp.variable); + bool overwrite; + size_t privsz = wiphy->bss_priv_size; + + if (WARN_ON(sigtype == NL80211_BSS_SIGNAL_UNSPEC && + (signal < 0 || signal > 100))) + return NULL; + + if (WARN_ON(!mgmt || !wiphy || + len < offsetof(struct ieee80211_mgmt, u.probe_resp.variable))) + return NULL; + + res = kzalloc(sizeof(*res) + privsz + ielen, gfp); + if (!res) + return NULL; + + memcpy(res->pub.bssid, mgmt->bssid, ETH_ALEN); + res->pub.channel = channel; + res->pub.signal_type = sigtype; + res->pub.signal = signal; + res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); + res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); + res->pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); + /* point to after the private area */ + res->pub.information_elements = (u8 *)res + sizeof(*res) + privsz; + memcpy(res->pub.information_elements, mgmt->u.probe_resp.variable, ielen); + res->pub.len_information_elements = ielen; + + kref_init(&res->ref); + + overwrite = ieee80211_is_probe_resp(mgmt->frame_control); + + res = cfg80211_bss_update(wiphy_to_dev(wiphy), res, overwrite); + if (!res) + return NULL; + + /* cfg80211_bss_update gives us a referenced result */ + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_inform_bss_frame); + +void cfg80211_put_bss(struct cfg80211_bss *pub) +{ + struct cfg80211_internal_bss *bss; + + if (!pub) + return; + + bss = container_of(pub, struct cfg80211_internal_bss, pub); + kref_put(&bss->ref, bss_release); +} +EXPORT_SYMBOL(cfg80211_put_bss); + +#ifdef CONFIG_WIRELESS_EXT +int cfg80211_wext_siwscan(struct net_device *dev, + struct iw_request_info *info, + union iwreq_data *wrqu, char *extra) +{ + struct cfg80211_registered_device *rdev; + struct wiphy *wiphy; + struct iw_scan_req *wreq = NULL; + struct cfg80211_scan_request *creq; + int i, err, n_channels = 0; 
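	/*
	 * Note on the kzalloc() a few lines below: the scan request, its
	 * single SSID and the channel pointer array are carved out of one
	 * allocation so that kfree(creq) releases everything at once.
	 * The layout is roughly:
	 *
	 *	creq -> | struct cfg80211_scan_request | struct cfg80211_ssid | n_channels pointers |
	 *
	 *	creq->ssids    = (void *)(creq + 1);
	 *	creq->channels = (void *)(creq->ssids + 1);
	 */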
+ enum ieee80211_band band; + + if (!netif_running(dev)) + return -ENETDOWN; + + rdev = cfg80211_get_dev_from_ifindex(dev->ifindex); + + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + + if (rdev->scan_req) { + err = -EBUSY; + goto out; + } + + wiphy = &rdev->wiphy; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + if (wiphy->bands[band]) + n_channels += wiphy->bands[band]->n_channels; + + creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) + + n_channels * sizeof(void *), + GFP_ATOMIC); + if (!creq) { + err = -ENOMEM; + goto out; + } + + creq->wiphy = wiphy; + creq->ifidx = dev->ifindex; + creq->ssids = (void *)(creq + 1); + creq->channels = (void *)(creq->ssids + 1); + creq->n_channels = n_channels; + creq->n_ssids = 1; + + /* all channels */ + i = 0; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + int j; + if (!wiphy->bands[band]) + continue; + for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + creq->channels[i] = &wiphy->bands[band]->channels[j]; + i++; + } + } + + /* translate scan request */ + if (wrqu->data.length == sizeof(struct iw_scan_req)) { + wreq = (struct iw_scan_req *)extra; + + if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { + if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); + creq->ssids[0].ssid_len = wreq->essid_len; + } + if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) + creq->n_ssids = 0; + } + + rdev->scan_req = creq; + err = rdev->ops->scan(wiphy, dev, creq); + if (err) { + rdev->scan_req = NULL; + kfree(creq); + } + out: + cfg80211_put_dev(rdev); + return err; +} +EXPORT_SYMBOL(cfg80211_wext_siwscan); + +static void ieee80211_scan_add_ies(struct iw_request_info *info, + struct cfg80211_bss *bss, + char **current_ev, char *end_buf) +{ + u8 *pos, *end, *next; + struct iw_event iwe; + + if (!bss->information_elements || + !bss->len_information_elements) + return; + + /* + * If needed, fragment the IEs buffer (at IE boundaries) into short + * enough fragments to fit into IW_GENERIC_IE_MAX octet messages. 
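 *
 * As a worked example, suppose (purely for illustration) IW_GENERIC_IE_MAX
 * were 100 and the buffer held three IEs of 30, 40 and 50 bytes including
 * their two-byte headers.  The loop below would then walk:
 *
 *	end - pos = 120 (> 100), next = pos + 30
 *	next + 40 - pos = 70  (< 100)  -> next = pos + 70
 *	next + 50 - pos = 120 (>= 100) -> emit one IWEVGENIE of 70 bytes
 *	remaining 50 bytes (<= 100)    -> emit a final IWEVGENIE of 50 bytes
 *
 * so an individual IE is never split across two events.
 *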
+ */ + pos = bss->information_elements; + end = pos + bss->len_information_elements; + + while (end - pos > IW_GENERIC_IE_MAX) { + next = pos + 2 + pos[1]; + while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX) + next = next + 2 + next[1]; + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVGENIE; + iwe.u.data.length = next - pos; + *current_ev = iwe_stream_add_point(info, *current_ev, + end_buf, &iwe, pos); + + pos = next; + } + + if (end > pos) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVGENIE; + iwe.u.data.length = end - pos; + *current_ev = iwe_stream_add_point(info, *current_ev, + end_buf, &iwe, pos); + } +} + + +static char * +ieee80211_bss(struct iw_request_info *info, + struct cfg80211_internal_bss *bss, + char *current_ev, char *end_buf) +{ + struct iw_event iwe; + u8 *buf, *cfg, *p; + u8 *ie = bss->pub.information_elements; + int rem = bss->pub.len_information_elements, i; + bool ismesh = false; + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWAP; + iwe.u.ap_addr.sa_family = ARPHRD_ETHER; + memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN); + current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, + IW_EV_ADDR_LEN); + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWFREQ; + iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq); + iwe.u.freq.e = 0; + current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, + IW_EV_FREQ_LEN); + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWFREQ; + iwe.u.freq.m = bss->pub.channel->center_freq; + iwe.u.freq.e = 6; + current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, + IW_EV_FREQ_LEN); + + if (bss->pub.signal_type != CFG80211_SIGNAL_TYPE_NONE) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVQUAL; + iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED | + IW_QUAL_NOISE_INVALID | + IW_QUAL_QUAL_INVALID; + switch (bss->pub.signal_type) { + case CFG80211_SIGNAL_TYPE_MBM: + iwe.u.qual.level = bss->pub.signal / 100; + iwe.u.qual.updated |= IW_QUAL_DBM; + break; + case CFG80211_SIGNAL_TYPE_UNSPEC: + iwe.u.qual.level = bss->pub.signal; + break; + default: + /* not reached */ + break; + } + current_ev = iwe_stream_add_event(info, current_ev, end_buf, + &iwe, IW_EV_QUAL_LEN); + } + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWENCODE; + if (bss->pub.capability & WLAN_CAPABILITY_PRIVACY) + iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY; + else + iwe.u.data.flags = IW_ENCODE_DISABLED; + iwe.u.data.length = 0; + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, ""); + + while (rem >= 2) { + /* invalid data */ + if (ie[1] > rem - 2) + break; + + switch (ie[0]) { + case WLAN_EID_SSID: + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWESSID; + iwe.u.data.length = ie[1]; + iwe.u.data.flags = 1; + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, ie + 2); + break; + case WLAN_EID_MESH_ID: + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWESSID; + iwe.u.data.length = ie[1]; + iwe.u.data.flags = 1; + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, ie + 2); + break; + case WLAN_EID_MESH_CONFIG: + ismesh = true; + if (ie[1] != IEEE80211_MESH_CONFIG_LEN) + break; + buf = kmalloc(50, GFP_ATOMIC); + if (!buf) + break; + cfg = ie + 2; + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, "Mesh network (version %d)", cfg[0]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Path Selection Protocol ID: " + "0x%02X%02X%02X%02X", cfg[1], 
cfg[2], cfg[3], + cfg[4]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Path Selection Metric ID: " + "0x%02X%02X%02X%02X", cfg[5], cfg[6], cfg[7], + cfg[8]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Congestion Control Mode ID: " + "0x%02X%02X%02X%02X", cfg[9], cfg[10], + cfg[11], cfg[12]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Channel Precedence: " + "0x%02X%02X%02X%02X", cfg[13], cfg[14], + cfg[15], cfg[16]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + kfree(buf); + break; + case WLAN_EID_SUPP_RATES: + case WLAN_EID_EXT_SUPP_RATES: + /* display all supported rates in readable format */ + p = current_ev + iwe_stream_lcp_len(info); + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWRATE; + /* Those two flags are ignored... */ + iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0; + + for (i = 0; i < ie[1]; i++) { + iwe.u.bitrate.value = + ((ie[i + 2] & 0x7f) * 500000); + p = iwe_stream_add_value(info, current_ev, p, + end_buf, &iwe, IW_EV_PARAM_LEN); + } + current_ev = p; + break; + } + rem -= ie[1] + 2; + ie += ie[1] + 2; + } + + if (bss->pub.capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) + || ismesh) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWMODE; + if (ismesh) + iwe.u.mode = IW_MODE_MESH; + else if (bss->pub.capability & WLAN_CAPABILITY_ESS) + iwe.u.mode = IW_MODE_MASTER; + else + iwe.u.mode = IW_MODE_ADHOC; + current_ev = iwe_stream_add_event(info, current_ev, end_buf, + &iwe, IW_EV_UINT_LEN); + } + + buf = kmalloc(30, GFP_ATOMIC); + if (buf) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->pub.tsf)); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, buf); + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, " Last beacon: %dms ago", + jiffies_to_msecs(jiffies - bss->ts)); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, &iwe, buf); + kfree(buf); + } + + ieee80211_scan_add_ies(info, &bss->pub, ¤t_ev, end_buf); + + return current_ev; +} + + +static int ieee80211_scan_results(struct cfg80211_registered_device *dev, + struct iw_request_info *info, + char *buf, size_t len) +{ + char *current_ev = buf; + char *end_buf = buf + len; + struct cfg80211_internal_bss *bss; + + spin_lock_bh(&dev->bss_lock); + cfg80211_bss_expire(dev); + + list_for_each_entry(bss, &dev->bss_list, list) { + if (buf + len - current_ev <= IW_EV_ADDR_LEN) { + spin_unlock_bh(&dev->bss_lock); + return -E2BIG; + } + current_ev = ieee80211_bss(info, bss, + current_ev, end_buf); + } + spin_unlock_bh(&dev->bss_lock); + return current_ev - buf; +} + + +int cfg80211_wext_giwscan(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra) +{ + struct cfg80211_registered_device *rdev; + int res; + + if (!netif_running(dev)) + return -ENETDOWN; + + rdev = cfg80211_get_dev_from_ifindex(dev->ifindex); + + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + + if (rdev->scan_req) { + res = -EAGAIN; + goto out; + } + + res = ieee80211_scan_results(rdev, info, extra, data->length); + data->length = 0; + if (res >= 0) { + data->length = res; + res = 0; + } + + out: + 
cfg80211_put_dev(rdev); + return res; +} +EXPORT_SYMBOL(cfg80211_wext_giwscan); +#endif -- cgit v1.2.3-71-gd317 From 6f2b9b9a9d750a9175dc79c74bfed5add840983c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 29 Jan 2009 16:03:20 +0100 Subject: timer: implement lockdep deadlock detection This modifies the timer code in a way to allow lockdep to detect deadlocks resulting from a lock being taken in the timer function as well as around the del_timer_sync() call. Signed-off-by: Johannes Berg --- include/linux/timer.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/timer.c | 68 +++++++++++++++++++++++++++++++------ 2 files changed, 141 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index daf9685b861c..51774eb87cc6 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -5,6 +5,7 @@ #include #include #include +#include struct tvec_base; @@ -21,52 +22,126 @@ struct timer_list { char start_comm[16]; int start_pid; #endif +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif }; extern struct tvec_base boot_tvec_bases; +#ifdef CONFIG_LOCKDEP +/* + * NB: because we have to copy the lockdep_map, setting the lockdep_map key + * (second argument) here is required, otherwise it could be initialised to + * the copy of the lockdep_map later! We use the pointer to and the string + * ":" as the key resp. the name of the lockdep_map. + */ +#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) \ + .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn), +#else +#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) +#endif + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .entry = { .prev = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ .base = &boot_tvec_bases, \ + __TIMER_LOCKDEP_MAP_INITIALIZER( \ + __FILE__ ":" __stringify(__LINE__)) \ } #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ TIMER_INITIALIZER(_function, _expires, _data) -void init_timer(struct timer_list *timer); -void init_timer_deferrable(struct timer_list *timer); +void init_timer_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key); +void init_timer_deferrable_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key); + +#ifdef CONFIG_LOCKDEP +#define init_timer(timer) \ + do { \ + static struct lock_class_key __key; \ + init_timer_key((timer), #timer, &__key); \ + } while (0) + +#define init_timer_deferrable(timer) \ + do { \ + static struct lock_class_key __key; \ + init_timer_deferrable_key((timer), #timer, &__key); \ + } while (0) + +#define init_timer_on_stack(timer) \ + do { \ + static struct lock_class_key __key; \ + init_timer_on_stack_key((timer), #timer, &__key); \ + } while (0) + +#define setup_timer(timer, fn, data) \ + do { \ + static struct lock_class_key __key; \ + setup_timer_key((timer), #timer, &__key, (fn), (data));\ + } while (0) + +#define setup_timer_on_stack(timer, fn, data) \ + do { \ + static struct lock_class_key __key; \ + setup_timer_on_stack_key((timer), #timer, &__key, \ + (fn), (data)); \ + } while (0) +#else +#define init_timer(timer)\ + init_timer_key((timer), NULL, NULL) +#define init_timer_deferrable(timer)\ + init_timer_deferrable_key((timer), NULL, NULL) +#define init_timer_on_stack(timer)\ + init_timer_on_stack_key((timer), NULL, NULL) +#define setup_timer(timer, fn, data)\ + setup_timer_key((timer), NULL, NULL, (fn), (data)) +#define setup_timer_on_stack(timer, fn, data)\ + 
setup_timer_on_stack_key((timer), NULL, NULL, (fn), (data)) +#endif #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -extern void init_timer_on_stack(struct timer_list *timer); +extern void init_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key); extern void destroy_timer_on_stack(struct timer_list *timer); #else static inline void destroy_timer_on_stack(struct timer_list *timer) { } -static inline void init_timer_on_stack(struct timer_list *timer) +static inline void init_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { - init_timer(timer); + init_timer_key(timer, name, key); } #endif -static inline void setup_timer(struct timer_list * timer, +static inline void setup_timer_key(struct timer_list * timer, + const char *name, + struct lock_class_key *key, void (*function)(unsigned long), unsigned long data) { timer->function = function; timer->data = data; - init_timer(timer); + init_timer_key(timer, name, key); } -static inline void setup_timer_on_stack(struct timer_list *timer, +static inline void setup_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, void (*function)(unsigned long), unsigned long data) { timer->function = function; timer->data = data; - init_timer_on_stack(timer); + init_timer_on_stack_key(timer, name, key); } /** diff --git a/kernel/timer.c b/kernel/timer.c index 13dd64fe143d..ef1c385bc572 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -491,14 +491,18 @@ static inline void debug_timer_free(struct timer_list *timer) debug_object_free(timer, &timer_debug_descr); } -static void __init_timer(struct timer_list *timer); +static void __init_timer(struct timer_list *timer, + const char *name, + struct lock_class_key *key); -void init_timer_on_stack(struct timer_list *timer) +void init_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); - __init_timer(timer); + __init_timer(timer, name, key); } -EXPORT_SYMBOL_GPL(init_timer_on_stack); +EXPORT_SYMBOL_GPL(init_timer_on_stack_key); void destroy_timer_on_stack(struct timer_list *timer) { @@ -512,7 +516,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } #endif -static void __init_timer(struct timer_list *timer) +static void __init_timer(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); @@ -521,6 +527,7 @@ static void __init_timer(struct timer_list *timer) timer->start_pid = -1; memset(timer->start_comm, 0, TASK_COMM_LEN); #endif + lockdep_init_map(&timer->lockdep_map, name, key, 0); } /** @@ -530,19 +537,23 @@ static void __init_timer(struct timer_list *timer) * init_timer() must be done to a timer prior calling *any* of the * other timer functions. 
*/ -void init_timer(struct timer_list *timer) +void init_timer_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { debug_timer_init(timer); - __init_timer(timer); + __init_timer(timer, name, key); } -EXPORT_SYMBOL(init_timer); +EXPORT_SYMBOL(init_timer_key); -void init_timer_deferrable(struct timer_list *timer) +void init_timer_deferrable_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { - init_timer(timer); + init_timer_key(timer, name, key); timer_set_deferrable(timer); } -EXPORT_SYMBOL(init_timer_deferrable); +EXPORT_SYMBOL(init_timer_deferrable_key); static inline void detach_timer(struct timer_list *timer, int clear_pending) @@ -789,6 +800,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync); */ int del_timer_sync(struct timer_list *timer) { +#ifdef CONFIG_LOCKDEP + unsigned long flags; + + local_irq_save(flags); + lock_map_acquire(&timer->lockdep_map); + lock_map_release(&timer->lockdep_map); + local_irq_restore(flags); +#endif + for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -861,10 +881,36 @@ static inline void __run_timers(struct tvec_base *base) set_running_timer(base, timer); detach_timer(timer, 1); + spin_unlock_irq(&base->lock); { int preempt_count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from + * inside the function that is called from + * it, this we need to take into account for + * lockdep too. To avoid bogus "held lock + * freed" warnings as well as problems when + * looking into timer->lockdep_map, make a + * copy and use that here. + */ + struct lockdep_map lockdep_map = + timer->lockdep_map; +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map + * around the fn() call here and in + * del_timer_sync(). + */ + lock_map_acquire(&lockdep_map); + fn(data); + + lock_map_release(&lockdep_map); + if (preempt_count != preempt_count()) { printk(KERN_ERR "huh, entered %p " "with preempt_count %08x, exited" -- cgit v1.2.3-71-gd317 From cf40bd16fdad42c053040bcd3988f5fdedbb6c57 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 21 Jan 2009 08:12:39 +0100 Subject: lockdep: annotate reclaim context (__GFP_NOFS) Here is another version, with the incremental patch rolled up, and added reclaim context annotation to kswapd, and allocation tracing to slab allocators (which may only ever reach the page allocator in rare cases, so it is good to put annotations here too). Haven't tested this version as such, but it should be getting closer to merge worthy ;) -- After noticing some code in mm/filemap.c accidentally perform a __GFP_FS allocation when it should not have been, I thought it might be a good idea to try to catch this kind of thing with lockdep. I coded up a little idea that seems to work. Unfortunately the system has to actually be in __GFP_FS page reclaim, then take the lock, before it will mark it. But at least that might still be some orders of magnitude more common (and more debuggable) than an actual deadlock condition, so we have some improvement I hope (the concept is no less complete than discovery of a lock's interrupt contexts). I guess we could even do the same thing with __GFP_IO (normal reclaim), and even GFP_NOIO locks too... but filesystems will have the most locks and fiddly code paths, so let's start there and see how it goes. It *seems* to work. I did a quick test. 
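The pattern being flagged is roughly the following hypothetical sketch (invented names, not part of this patch): a lock that a shrinker takes during __GFP_FS page reclaim must never be held across a __GFP_FS allocation, otherwise reclaim can recurse into the shrinker and deadlock on that lock.

#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/gfp.h>

static DEFINE_MUTEX(cache_lock);	/* hypothetical lock protecting some cache */

/*
 * Called via a registered shrinker, i.e. while the task is inside
 * __GFP_FS page reclaim: with this patch lockdep marks cache_lock
 * as in-reclaim-W here.
 */
static int cache_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	mutex_lock(&cache_lock);
	/* ... drop some cached entries ... */
	mutex_unlock(&cache_lock);
	return 0;
}

/*
 * Elsewhere the same lock is held across a __GFP_FS allocation:
 * lockdep_trace_alloc() marks it ov-reclaim-W, and the
 * in-reclaim-W -> ov-reclaim-W inconsistency gets reported without
 * the deadlock ever having to happen.
 */
static void *cache_grow(void)
{
	void *obj;

	mutex_lock(&cache_lock);
	obj = kmalloc(128, GFP_KERNEL);	/* GFP_KERNEL includes __GFP_FS */
	mutex_unlock(&cache_lock);
	return obj;
}

The quick-test report looks like this: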
================================= [ INFO: inconsistent lock state ] 2.6.28-rc6-00007-ged31348-dirty #26 --------------------------------- inconsistent {in-reclaim-W} -> {ov-reclaim-W} usage. modprobe/8526 [HC0[0]:SC0[0]:HE1:SE1] takes: (testlock){--..}, at: [] brd_init+0x55/0x216 [brd] {in-reclaim-W} state was registered at: [] __lock_acquire+0x75b/0x1a60 [] lock_acquire+0x91/0xc0 [] mutex_lock_nested+0xb1/0x310 [] brd_init+0x2b/0x216 [brd] [] _stext+0x3b/0x170 [] sys_init_module+0xaf/0x1e0 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff irq event stamp: 3929 hardirqs last enabled at (3929): [] mutex_lock_nested+0x285/0x310 hardirqs last disabled at (3928): [] mutex_lock_nested+0x59/0x310 softirqs last enabled at (3732): [] sk_filter+0x83/0xe0 softirqs last disabled at (3730): [] sk_filter+0x16/0xe0 other info that might help us debug this: 1 lock held by modprobe/8526: #0: (testlock){--..}, at: [] brd_init+0x55/0x216 [brd] stack backtrace: Pid: 8526, comm: modprobe Not tainted 2.6.28-rc6-00007-ged31348-dirty #26 Call Trace: [] print_usage_bug+0x193/0x1d0 [] mark_lock+0xaf0/0xca0 [] mark_held_locks+0x55/0xc0 [] ? brd_init+0x0/0x216 [brd] [] trace_reclaim_fs+0x2a/0x60 [] __alloc_pages_internal+0x475/0x580 [] ? mutex_lock_nested+0x26e/0x310 [] ? brd_init+0x0/0x216 [brd] [] brd_init+0x6a/0x216 [brd] [] ? brd_init+0x0/0x216 [brd] [] _stext+0x3b/0x170 [] ? mutex_unlock+0x9/0x10 [] ? __mutex_unlock_slowpath+0x10d/0x180 [] ? trace_hardirqs_on_caller+0x12c/0x190 [] sys_init_module+0xaf/0x1e0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Nick Piggin Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 17 +++- include/linux/sched.h | 1 + kernel/lockdep.c | 229 ++++++++++++++++++++++++++++++++++++++++++--- kernel/lockdep_internals.h | 3 +- kernel/lockdep_proc.c | 6 +- mm/page_alloc.c | 5 + mm/slab.c | 4 + mm/slob.c | 2 + mm/slub.c | 1 + mm/vmscan.c | 3 + 10 files changed, 254 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 23bf02fb124f..cc97bdbc7969 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -27,12 +27,16 @@ enum lock_usage_bit LOCK_USED = 0, LOCK_USED_IN_HARDIRQ, LOCK_USED_IN_SOFTIRQ, + LOCK_USED_IN_RECLAIM_FS, LOCK_ENABLED_SOFTIRQS, LOCK_ENABLED_HARDIRQS, + LOCK_HELD_OVER_RECLAIM_FS, LOCK_USED_IN_HARDIRQ_READ, LOCK_USED_IN_SOFTIRQ_READ, + LOCK_USED_IN_RECLAIM_FS_READ, LOCK_ENABLED_SOFTIRQS_READ, LOCK_ENABLED_HARDIRQS_READ, + LOCK_HELD_OVER_RECLAIM_FS_READ, LOCK_USAGE_STATES }; @@ -42,16 +46,20 @@ enum lock_usage_bit #define LOCKF_USED (1 << LOCK_USED) #define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ) #define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ) +#define LOCKF_USED_IN_RECLAIM_FS (1 << LOCK_USED_IN_RECLAIM_FS) #define LOCKF_ENABLED_HARDIRQS (1 << LOCK_ENABLED_HARDIRQS) #define LOCKF_ENABLED_SOFTIRQS (1 << LOCK_ENABLED_SOFTIRQS) +#define LOCKF_HELD_OVER_RECLAIM_FS (1 << LOCK_HELD_OVER_RECLAIM_FS) #define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS) #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) #define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ) #define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ) +#define LOCKF_USED_IN_RECLAIM_FS_READ (1 << LOCK_USED_IN_RECLAIM_FS_READ) #define LOCKF_ENABLED_HARDIRQS_READ (1 << LOCK_ENABLED_HARDIRQS_READ) #define LOCKF_ENABLED_SOFTIRQS_READ (1 << LOCK_ENABLED_SOFTIRQS_READ) +#define LOCKF_HELD_OVER_RECLAIM_FS_READ 
(1 << LOCK_HELD_OVER_RECLAIM_FS_READ) #define LOCKF_ENABLED_IRQS_READ \ (LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ) @@ -324,7 +332,11 @@ static inline void lock_set_subclass(struct lockdep_map *lock, lock_set_class(lock, lock->name, lock->key, subclass, ip); } -# define INIT_LOCKDEP .lockdep_recursion = 0, +extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); +extern void lockdep_clear_current_reclaim_state(void); +extern void lockdep_trace_alloc(gfp_t mask); + +# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) @@ -342,6 +354,9 @@ static inline void lockdep_on(void) # define lock_release(l, n, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) +# define lockdep_set_current_reclaim_state(g) do { } while (0) +# define lockdep_clear_current_reclaim_state() do { } while (0) +# define lockdep_trace_alloc(g) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 4efb552aca47..b00a77f4999e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1313,6 +1313,7 @@ struct task_struct { int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; + gfp_t lockdep_reclaim_gfp; #endif /* journalling filesystem info */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 06b0c3568f0b..977f940fd562 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -310,12 +310,14 @@ EXPORT_SYMBOL(lockdep_on); #if VERBOSE # define HARDIRQ_VERBOSE 1 # define SOFTIRQ_VERBOSE 1 +# define RECLAIM_VERBOSE 1 #else # define HARDIRQ_VERBOSE 0 # define SOFTIRQ_VERBOSE 0 +# define RECLAIM_VERBOSE 0 #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE /* * Quick filtering for interesting events: */ @@ -454,6 +456,10 @@ static const char *usage_str[] = [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", + [LOCK_USED_IN_RECLAIM_FS] = "in-reclaim-W", + [LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R", + [LOCK_HELD_OVER_RECLAIM_FS] = "ov-reclaim-W", + [LOCK_HELD_OVER_RECLAIM_FS_READ] = "ov-reclaim-R", }; const char * __get_key_name(struct lockdep_subclass_key *key, char *str) @@ -462,9 +468,10 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str) } void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) +get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, + char *c4, char *c5, char *c6) { - *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; + *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.', *c5 = '.', *c6 = '.'; if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) *c1 = '+'; @@ -493,14 +500,29 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) *c4 = '?'; } + + if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS) + *c5 = '+'; + else + if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS) + *c5 = '-'; + + if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ) + *c6 = '-'; + if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS_READ) { + *c6 = '+'; + if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ) + *c6 = '?'; + } + } static void 
print_lock_name(struct lock_class *class) { - char str[KSYM_NAME_LEN], c1, c2, c3, c4; + char str[KSYM_NAME_LEN], c1, c2, c3, c4, c5, c6; const char *name; - get_usage_chars(class, &c1, &c2, &c3, &c4); + get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6); name = class->name; if (!name) { @@ -513,7 +535,7 @@ static void print_lock_name(struct lock_class *class) if (class->subclass) printk("/%d", class->subclass); } - printk("){%c%c%c%c}", c1, c2, c3, c4); + printk("){%c%c%c%c%c%c}", c1, c2, c3, c4, c5, c6); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -1306,6 +1328,26 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, LOCK_ENABLED_SOFTIRQS, "soft")) return 0; + /* + * Prove that the new dependency does not connect a reclaim-fs-safe + * lock with a reclaim-fs-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS, + LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs")) + return 0; + + /* + * Prove that the new dependency does not connect a reclaim-fs-safe-read + * lock with a reclaim-fs-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS_READ, + LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs-read")) + return 0; + return 1; } @@ -1949,6 +1991,14 @@ static int softirq_verbose(struct lock_class *class) return 0; } +static int reclaim_verbose(struct lock_class *class) +{ +#if RECLAIM_VERBOSE + return class_filter(class); +#endif + return 0; +} + #define STRICT_READ_CHECKS 1 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, @@ -2007,6 +2057,31 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; + case LOCK_USED_IN_RECLAIM_FS: + if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_HELD_OVER_RECLAIM_FS_READ)) + return 0; + /* + * just marked it reclaim-fs-safe, check that this lock + * took no reclaim-fs-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it reclaim-fs-safe, check that this lock + * took no reclaim-fs-unsafe-read lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_HELD_OVER_RECLAIM_FS_READ, "reclaim-fs-read")) + return 0; +#endif + if (reclaim_verbose(hlock_class(this))) + ret = 2; + break; case LOCK_USED_IN_HARDIRQ_READ: if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) return 0; @@ -2033,6 +2108,19 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; + case LOCK_USED_IN_RECLAIM_FS_READ: + if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS)) + return 0; + /* + * just marked it reclaim-fs-read-safe, check that this lock + * took no reclaim-fs-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs")) + return 0; + if (reclaim_verbose(hlock_class(this))) + ret = 2; + break; case LOCK_ENABLED_HARDIRQS: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) return 0; @@ -2085,6 +2173,32 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; + 
case LOCK_HELD_OVER_RECLAIM_FS: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_USED_IN_RECLAIM_FS_READ)) + return 0; + /* + * just marked it reclaim-fs-unsafe, check that no reclaim-fs-safe + * lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_RECLAIM_FS, "reclaim-fs")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it softirq-unsafe, check that no + * softirq-safe-read lock in the system ever took + * it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_RECLAIM_FS_READ, "reclaim-fs-read")) + return 0; +#endif + if (reclaim_verbose(hlock_class(this))) + ret = 2; + break; case LOCK_ENABLED_HARDIRQS_READ: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) return 0; @@ -2115,6 +2229,21 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; + case LOCK_HELD_OVER_RECLAIM_FS_READ: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS)) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it reclaim-fs-read-unsafe, check that no + * reclaim-fs-safe lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_RECLAIM_FS, "reclaim-fs")) + return 0; +#endif + if (reclaim_verbose(hlock_class(this))) + ret = 2; + break; default: WARN_ON(1); break; @@ -2123,11 +2252,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, return ret; } +enum mark_type { + HARDIRQ, + SOFTIRQ, + RECLAIM_FS, +}; + /* * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, int hardirq) +mark_held_locks(struct task_struct *curr, enum mark_type mark) { enum lock_usage_bit usage_bit; struct held_lock *hlock; @@ -2136,17 +2271,32 @@ mark_held_locks(struct task_struct *curr, int hardirq) for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; - if (hardirq) { + switch (mark) { + case HARDIRQ: if (hlock->read) usage_bit = LOCK_ENABLED_HARDIRQS_READ; else usage_bit = LOCK_ENABLED_HARDIRQS; - } else { + break; + + case SOFTIRQ: if (hlock->read) usage_bit = LOCK_ENABLED_SOFTIRQS_READ; else usage_bit = LOCK_ENABLED_SOFTIRQS; + break; + + case RECLAIM_FS: + if (hlock->read) + usage_bit = LOCK_HELD_OVER_RECLAIM_FS_READ; + else + usage_bit = LOCK_HELD_OVER_RECLAIM_FS; + break; + + default: + BUG(); } + if (!mark_lock(curr, hlock, usage_bit)) return 0; } @@ -2200,7 +2350,7 @@ void trace_hardirqs_on_caller(unsigned long ip) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, 1)) + if (!mark_held_locks(curr, HARDIRQ)) return; /* * If we have softirqs enabled, then set the usage @@ -2208,7 +2358,7 @@ void trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, 0)) + if (!mark_held_locks(curr, SOFTIRQ)) return; curr->hardirq_enable_ip = ip; @@ -2288,7 +2438,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, 0); + mark_held_locks(curr, SOFTIRQ); } /* @@ -2317,6 +2467,31 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(&redundant_softirqs_off); } +void lockdep_trace_alloc(gfp_t gfp_mask) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + /* no reclaim without waiting on it */ + if (!(gfp_mask & 
__GFP_WAIT)) + return; + + /* this guy won't enter reclaim */ + if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return; + + if (DEBUG_LOCKS_WARN_ON(irqs_disabled())) + return; + + mark_held_locks(curr, RECLAIM_FS); +} + static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* @@ -2362,6 +2537,22 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } + /* + * We reuse the irq context infrastructure more broadly as a general + * context checking code. This tests GFP_FS recursion (a lock taken + * during reclaim for a GFP_FS allocation is held over a GFP_FS + * allocation). + */ + if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { + if (hlock->read) { + if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) + return 0; + } else { + if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) + return 0; + } + } + return 1; } @@ -2453,6 +2644,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, case LOCK_ENABLED_SOFTIRQS: case LOCK_ENABLED_HARDIRQS_READ: case LOCK_ENABLED_SOFTIRQS_READ: + case LOCK_USED_IN_RECLAIM_FS: + case LOCK_USED_IN_RECLAIM_FS_READ: + case LOCK_HELD_OVER_RECLAIM_FS: + case LOCK_HELD_OVER_RECLAIM_FS_READ: ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; @@ -2966,6 +3161,16 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); +void lockdep_set_current_reclaim_state(gfp_t gfp_mask) +{ + current->lockdep_reclaim_gfp = gfp_mask; +} + +void lockdep_clear_current_reclaim_state(void) +{ + current->lockdep_reclaim_gfp = 0; +} + #ifdef CONFIG_LOCK_STAT static int print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 56b196932c08..e887b783244f 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -32,7 +32,8 @@ extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; extern void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); +get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, + char *c4, char *c5, char *c6); extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 13716b813896..b84a1dfa9077 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v) { struct lock_class *class = v; struct lock_list *entry; - char c1, c2, c3, c4; + char c1, c2, c3, c4, c5, c6; if (v == SEQ_START_TOKEN) { seq_printf(m, "all lock classes:\n"); @@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v) seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); #endif - get_usage_chars(class, &c1, &c2, &c3, &c4); - seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); + get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6); + seq_printf(m, " %c%c%c%c%c%c", c1, c2, c3, c4, c5, c6); seq_printf(m, ": "); print_name(m, class); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5675b3073854..22b15a4cde8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1479,6 +1479,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; unsigned long pages_reclaimed = 0; + lockdep_trace_alloc(gfp_mask); + might_sleep_if(wait); if (should_fail_alloc_page(gfp_mask, 
order)) @@ -1578,12 +1580,15 @@ nofail_alloc: */ cpuset_update_task_memory_state(); p->flags |= PF_MEMALLOC; + + lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); p->flags &= ~PF_MEMALLOC; cond_resched(); diff --git a/mm/slab.c b/mm/slab.c index ddc41f337d58..6b61de8543ec 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3318,6 +3318,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; @@ -3394,6 +3396,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..1264799df5d1 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -464,6 +464,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + lockdep_trace_alloc(flags); + if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..214eb207c513 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1596,6 +1596,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; + lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); if (should_failslab(s->objsize, gfpflags)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a27c44aa327..303eb658b50b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1963,6 +1963,9 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; + + lockdep_set_current_reclaim_state(GFP_KERNEL); + node_to_cpumask_ptr(cpumask, pgdat->node_id); if (!cpumask_empty(cpumask)) -- cgit v1.2.3-71-gd317 From 4fc95e867f1e75351b89db3c68212dfcce7ea563 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Jan 2009 13:10:52 +0100 Subject: lockdep: sanitize bit names s/\(LOCKF\?_ENABLED_[^ ]*\)S\(_READ\)\?\>/\1\2/g So that the USED_IN and ENABLED have the same names. 
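As a worked illustration of the substitution (only ENABLED names are touched, for example):

	LOCK_ENABLED_HARDIRQS        -> LOCK_ENABLED_HARDIRQ
	LOCKF_ENABLED_SOFTIRQS_READ  -> LOCKF_ENABLED_SOFTIRQ_READ

The USED_IN_* bits already use the singular form and are left as they are.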
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 22 ++++++------- kernel/lockdep.c | 84 ++++++++++++++++++++++++------------------------- kernel/lockdep_proc.c | 12 +++---- 3 files changed, 59 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index cc97bdbc7969..da2e2b25b3b2 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -28,14 +28,14 @@ enum lock_usage_bit LOCK_USED_IN_HARDIRQ, LOCK_USED_IN_SOFTIRQ, LOCK_USED_IN_RECLAIM_FS, - LOCK_ENABLED_SOFTIRQS, - LOCK_ENABLED_HARDIRQS, + LOCK_ENABLED_SOFTIRQ, + LOCK_ENABLED_HARDIRQ, LOCK_HELD_OVER_RECLAIM_FS, LOCK_USED_IN_HARDIRQ_READ, LOCK_USED_IN_SOFTIRQ_READ, LOCK_USED_IN_RECLAIM_FS_READ, - LOCK_ENABLED_SOFTIRQS_READ, - LOCK_ENABLED_HARDIRQS_READ, + LOCK_ENABLED_SOFTIRQ_READ, + LOCK_ENABLED_HARDIRQ_READ, LOCK_HELD_OVER_RECLAIM_FS_READ, LOCK_USAGE_STATES }; @@ -47,22 +47,22 @@ enum lock_usage_bit #define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ) #define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ) #define LOCKF_USED_IN_RECLAIM_FS (1 << LOCK_USED_IN_RECLAIM_FS) -#define LOCKF_ENABLED_HARDIRQS (1 << LOCK_ENABLED_HARDIRQS) -#define LOCKF_ENABLED_SOFTIRQS (1 << LOCK_ENABLED_SOFTIRQS) +#define LOCKF_ENABLED_HARDIRQ (1 << LOCK_ENABLED_HARDIRQ) +#define LOCKF_ENABLED_SOFTIRQ (1 << LOCK_ENABLED_SOFTIRQ) #define LOCKF_HELD_OVER_RECLAIM_FS (1 << LOCK_HELD_OVER_RECLAIM_FS) -#define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS) +#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) #define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ) #define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ) #define LOCKF_USED_IN_RECLAIM_FS_READ (1 << LOCK_USED_IN_RECLAIM_FS_READ) -#define LOCKF_ENABLED_HARDIRQS_READ (1 << LOCK_ENABLED_HARDIRQS_READ) -#define LOCKF_ENABLED_SOFTIRQS_READ (1 << LOCK_ENABLED_SOFTIRQS_READ) +#define LOCKF_ENABLED_HARDIRQ_READ (1 << LOCK_ENABLED_HARDIRQ_READ) +#define LOCKF_ENABLED_SOFTIRQ_READ (1 << LOCK_ENABLED_SOFTIRQ_READ) #define LOCKF_HELD_OVER_RECLAIM_FS_READ (1 << LOCK_HELD_OVER_RECLAIM_FS_READ) -#define LOCKF_ENABLED_IRQS_READ \ - (LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ) +#define LOCKF_ENABLED_IRQ_READ \ + (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) #define LOCKF_USED_IN_IRQ_READ \ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 977f940fd562..32f944752b18 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -450,12 +450,12 @@ static const char *usage_str[] = [LOCK_USED] = "initial-use ", [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", - [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", - [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W", + [LOCK_ENABLED_SOFTIRQ] = "softirq-on-W", + [LOCK_ENABLED_HARDIRQ] = "hardirq-on-W", [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R", [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", - [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", - [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", + [LOCK_ENABLED_SOFTIRQ_READ] = "softirq-on-R", + [LOCK_ENABLED_HARDIRQ_READ] = "hardirq-on-R", [LOCK_USED_IN_RECLAIM_FS] = "in-reclaim-W", [LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R", [LOCK_HELD_OVER_RECLAIM_FS] = "ov-reclaim-W", @@ -476,28 +476,28 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, if 
(class->usage_mask & LOCKF_USED_IN_HARDIRQ) *c1 = '+'; else - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ) *c1 = '-'; if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) *c2 = '+'; else - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ) *c2 = '-'; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) *c3 = '-'; if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { *c3 = '+'; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) *c3 = '?'; } - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) *c4 = '-'; if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { *c4 = '+'; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) *c4 = '?'; } @@ -1296,7 +1296,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, * forwards-subgraph starting at : */ if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, - LOCK_ENABLED_HARDIRQS, "hard")) + LOCK_ENABLED_HARDIRQ, "hard")) return 0; /* @@ -1306,7 +1306,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, * forwards-subgraph starting at : */ if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, - LOCK_ENABLED_HARDIRQS, "hard-read")) + LOCK_ENABLED_HARDIRQ, "hard-read")) return 0; /* @@ -1316,7 +1316,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, * forwards-subgraph starting at : */ if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, - LOCK_ENABLED_SOFTIRQS, "soft")) + LOCK_ENABLED_SOFTIRQ, "soft")) return 0; /* * Prove that the new dependency does not connect a softirq-safe-read @@ -1325,7 +1325,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, * forwards-subgraph starting at : */ if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, - LOCK_ENABLED_SOFTIRQS, "soft")) + LOCK_ENABLED_SOFTIRQ, "soft")) return 0; /* @@ -2008,17 +2008,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, switch(new_bit) { case LOCK_USED_IN_HARDIRQ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ)) return 0; if (!valid_state(curr, this, new_bit, - LOCK_ENABLED_HARDIRQS_READ)) + LOCK_ENABLED_HARDIRQ_READ)) return 0; /* * just marked it hardirq-safe, check that this lock * took no hardirq-unsafe lock in the past: */ if (!check_usage_forwards(curr, this, - LOCK_ENABLED_HARDIRQS, "hard")) + LOCK_ENABLED_HARDIRQ, "hard")) return 0; #if STRICT_READ_CHECKS /* @@ -2026,24 +2026,24 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, * took no hardirq-unsafe-read lock in the past: */ if (!check_usage_forwards(curr, this, - LOCK_ENABLED_HARDIRQS_READ, "hard-read")) + LOCK_ENABLED_HARDIRQ_READ, "hard-read")) return 0; #endif if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_SOFTIRQ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ)) return 0; if (!valid_state(curr, this, new_bit, - LOCK_ENABLED_SOFTIRQS_READ)) + LOCK_ENABLED_SOFTIRQ_READ)) return 0; /* * just marked it softirq-safe, check that this lock * took no softirq-unsafe lock in the past: */ if (!check_usage_forwards(curr, this, - LOCK_ENABLED_SOFTIRQS, "soft")) + 
LOCK_ENABLED_SOFTIRQ, "soft")) return 0; #if STRICT_READ_CHECKS /* @@ -2051,7 +2051,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, * took no softirq-unsafe-read lock in the past: */ if (!check_usage_forwards(curr, this, - LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) + LOCK_ENABLED_SOFTIRQ_READ, "soft-read")) return 0; #endif if (softirq_verbose(hlock_class(this))) @@ -2083,27 +2083,27 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, ret = 2; break; case LOCK_USED_IN_HARDIRQ_READ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ)) return 0; /* * just marked it hardirq-read-safe, check that this lock * took no hardirq-unsafe lock in the past: */ if (!check_usage_forwards(curr, this, - LOCK_ENABLED_HARDIRQS, "hard")) + LOCK_ENABLED_HARDIRQ, "hard")) return 0; if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_SOFTIRQ_READ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ)) return 0; /* * just marked it softirq-read-safe, check that this lock * took no softirq-unsafe lock in the past: */ if (!check_usage_forwards(curr, this, - LOCK_ENABLED_SOFTIRQS, "soft")) + LOCK_ENABLED_SOFTIRQ, "soft")) return 0; if (softirq_verbose(hlock_class(this))) ret = 2; @@ -2121,7 +2121,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (reclaim_verbose(hlock_class(this))) ret = 2; break; - case LOCK_ENABLED_HARDIRQS: + case LOCK_ENABLED_HARDIRQ: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) return 0; if (!valid_state(curr, this, new_bit, @@ -2147,7 +2147,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (hardirq_verbose(hlock_class(this))) ret = 2; break; - case LOCK_ENABLED_SOFTIRQS: + case LOCK_ENABLED_SOFTIRQ: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) return 0; if (!valid_state(curr, this, new_bit, @@ -2199,7 +2199,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (reclaim_verbose(hlock_class(this))) ret = 2; break; - case LOCK_ENABLED_HARDIRQS_READ: + case LOCK_ENABLED_HARDIRQ_READ: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) return 0; #if STRICT_READ_CHECKS @@ -2214,7 +2214,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (hardirq_verbose(hlock_class(this))) ret = 2; break; - case LOCK_ENABLED_SOFTIRQS_READ: + case LOCK_ENABLED_SOFTIRQ_READ: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) return 0; #if STRICT_READ_CHECKS @@ -2274,16 +2274,16 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) switch (mark) { case HARDIRQ: if (hlock->read) - usage_bit = LOCK_ENABLED_HARDIRQS_READ; + usage_bit = LOCK_ENABLED_HARDIRQ_READ; else - usage_bit = LOCK_ENABLED_HARDIRQS; + usage_bit = LOCK_ENABLED_HARDIRQ; break; case SOFTIRQ: if (hlock->read) - usage_bit = LOCK_ENABLED_SOFTIRQS_READ; + usage_bit = LOCK_ENABLED_SOFTIRQ_READ; else - usage_bit = LOCK_ENABLED_SOFTIRQS; + usage_bit = LOCK_ENABLED_SOFTIRQ; break; case RECLAIM_FS: @@ -2520,19 +2520,19 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) if (!hlock->hardirqs_off) { if (hlock->read) { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ)) + LOCK_ENABLED_HARDIRQ_READ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ)) + LOCK_ENABLED_SOFTIRQ_READ)) 
return 0; } else { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS)) + LOCK_ENABLED_HARDIRQ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS)) + LOCK_ENABLED_SOFTIRQ)) return 0; } } @@ -2640,10 +2640,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, case LOCK_USED_IN_SOFTIRQ: case LOCK_USED_IN_HARDIRQ_READ: case LOCK_USED_IN_SOFTIRQ_READ: - case LOCK_ENABLED_HARDIRQS: - case LOCK_ENABLED_SOFTIRQS: - case LOCK_ENABLED_HARDIRQS_READ: - case LOCK_ENABLED_SOFTIRQS_READ: + case LOCK_ENABLED_HARDIRQ: + case LOCK_ENABLED_SOFTIRQ: + case LOCK_ENABLED_HARDIRQ_READ: + case LOCK_ENABLED_SOFTIRQ_READ: case LOCK_USED_IN_RECLAIM_FS: case LOCK_USED_IN_RECLAIM_FS_READ: case LOCK_HELD_OVER_RECLAIM_FS: diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index b84a1dfa9077..bd474fd9df9d 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_uncategorized++; if (class->usage_mask & LOCKF_USED_IN_IRQ) nr_irq_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQS) + if (class->usage_mask & LOCKF_ENABLED_IRQ) nr_irq_unsafe++; if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) nr_softirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ) nr_softirq_unsafe++; if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) nr_hardirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ) nr_hardirq_unsafe++; if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) nr_irq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_IRQ_READ) nr_irq_read_unsafe++; if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) nr_softirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) nr_softirq_read_unsafe++; if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) nr_hardirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) nr_hardirq_read_unsafe++; #ifdef CONFIG_PROVE_LOCKING -- cgit v1.2.3-71-gd317 From a652d7081bc96b3094e85ca30e47f50185d2f717 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Jan 2009 13:13:11 +0100 Subject: lockdep: sanitize reclaim bit names s/HELD_OVER/ENABLED/g so that its similar to the hard and soft-irq names. 
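Concretely, as the diff below shows, LOCK_HELD_OVER_RECLAIM_FS becomes LOCK_ENABLED_RECLAIM_FS and LOCKF_HELD_OVER_RECLAIM_FS_READ becomes LOCKF_ENABLED_RECLAIM_FS_READ, lining up with LOCK_ENABLED_HARDIRQ and LOCK_ENABLED_SOFTIRQ.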
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 8 ++++---- kernel/lockdep.c | 38 +++++++++++++++++++------------------- 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index da2e2b25b3b2..6d729c9d1d27 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -30,13 +30,13 @@ enum lock_usage_bit LOCK_USED_IN_RECLAIM_FS, LOCK_ENABLED_SOFTIRQ, LOCK_ENABLED_HARDIRQ, - LOCK_HELD_OVER_RECLAIM_FS, + LOCK_ENABLED_RECLAIM_FS, LOCK_USED_IN_HARDIRQ_READ, LOCK_USED_IN_SOFTIRQ_READ, LOCK_USED_IN_RECLAIM_FS_READ, LOCK_ENABLED_SOFTIRQ_READ, LOCK_ENABLED_HARDIRQ_READ, - LOCK_HELD_OVER_RECLAIM_FS_READ, + LOCK_ENABLED_RECLAIM_FS_READ, LOCK_USAGE_STATES }; @@ -49,7 +49,7 @@ enum lock_usage_bit #define LOCKF_USED_IN_RECLAIM_FS (1 << LOCK_USED_IN_RECLAIM_FS) #define LOCKF_ENABLED_HARDIRQ (1 << LOCK_ENABLED_HARDIRQ) #define LOCKF_ENABLED_SOFTIRQ (1 << LOCK_ENABLED_SOFTIRQ) -#define LOCKF_HELD_OVER_RECLAIM_FS (1 << LOCK_HELD_OVER_RECLAIM_FS) +#define LOCKF_ENABLED_RECLAIM_FS (1 << LOCK_ENABLED_RECLAIM_FS) #define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) @@ -59,7 +59,7 @@ enum lock_usage_bit #define LOCKF_USED_IN_RECLAIM_FS_READ (1 << LOCK_USED_IN_RECLAIM_FS_READ) #define LOCKF_ENABLED_HARDIRQ_READ (1 << LOCK_ENABLED_HARDIRQ_READ) #define LOCKF_ENABLED_SOFTIRQ_READ (1 << LOCK_ENABLED_SOFTIRQ_READ) -#define LOCKF_HELD_OVER_RECLAIM_FS_READ (1 << LOCK_HELD_OVER_RECLAIM_FS_READ) +#define LOCKF_ENABLED_RECLAIM_FS_READ (1 << LOCK_ENABLED_RECLAIM_FS_READ) #define LOCKF_ENABLED_IRQ_READ \ (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 32f944752b18..dd4716c08325 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -458,8 +458,8 @@ static const char *usage_str[] = [LOCK_ENABLED_HARDIRQ_READ] = "hardirq-on-R", [LOCK_USED_IN_RECLAIM_FS] = "in-reclaim-W", [LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R", - [LOCK_HELD_OVER_RECLAIM_FS] = "ov-reclaim-W", - [LOCK_HELD_OVER_RECLAIM_FS_READ] = "ov-reclaim-R", + [LOCK_ENABLED_RECLAIM_FS] = "ov-reclaim-W", + [LOCK_ENABLED_RECLAIM_FS_READ] = "ov-reclaim-R", }; const char * __get_key_name(struct lockdep_subclass_key *key, char *str) @@ -504,14 +504,14 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS) *c5 = '+'; else - if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS) + if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS) *c5 = '-'; - if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ) + if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ) *c6 = '-'; if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS_READ) { *c6 = '+'; - if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ) + if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ) *c6 = '?'; } @@ -1335,7 +1335,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, * forwards-subgraph starting at : */ if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS, - LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs")) + LOCK_ENABLED_RECLAIM_FS, "reclaim-fs")) return 0; /* @@ -1345,7 +1345,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, * forwards-subgraph starting at : */ if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS_READ, - LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs-read")) + LOCK_ENABLED_RECLAIM_FS, 
"reclaim-fs-read")) return 0; return 1; @@ -2058,17 +2058,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, ret = 2; break; case LOCK_USED_IN_RECLAIM_FS: - if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS)) + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS)) return 0; if (!valid_state(curr, this, new_bit, - LOCK_HELD_OVER_RECLAIM_FS_READ)) + LOCK_ENABLED_RECLAIM_FS_READ)) return 0; /* * just marked it reclaim-fs-safe, check that this lock * took no reclaim-fs-unsafe lock in the past: */ if (!check_usage_forwards(curr, this, - LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs")) + LOCK_ENABLED_RECLAIM_FS, "reclaim-fs")) return 0; #if STRICT_READ_CHECKS /* @@ -2076,7 +2076,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, * took no reclaim-fs-unsafe-read lock in the past: */ if (!check_usage_forwards(curr, this, - LOCK_HELD_OVER_RECLAIM_FS_READ, "reclaim-fs-read")) + LOCK_ENABLED_RECLAIM_FS_READ, "reclaim-fs-read")) return 0; #endif if (reclaim_verbose(hlock_class(this))) @@ -2109,14 +2109,14 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, ret = 2; break; case LOCK_USED_IN_RECLAIM_FS_READ: - if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS)) + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS)) return 0; /* * just marked it reclaim-fs-read-safe, check that this lock * took no reclaim-fs-unsafe lock in the past: */ if (!check_usage_forwards(curr, this, - LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs")) + LOCK_ENABLED_RECLAIM_FS, "reclaim-fs")) return 0; if (reclaim_verbose(hlock_class(this))) ret = 2; @@ -2173,7 +2173,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; - case LOCK_HELD_OVER_RECLAIM_FS: + case LOCK_ENABLED_RECLAIM_FS: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS)) return 0; if (!valid_state(curr, this, new_bit, @@ -2229,7 +2229,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(hlock_class(this))) ret = 2; break; - case LOCK_HELD_OVER_RECLAIM_FS_READ: + case LOCK_ENABLED_RECLAIM_FS_READ: if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS)) return 0; #if STRICT_READ_CHECKS @@ -2288,9 +2288,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) case RECLAIM_FS: if (hlock->read) - usage_bit = LOCK_HELD_OVER_RECLAIM_FS_READ; + usage_bit = LOCK_ENABLED_RECLAIM_FS_READ; else - usage_bit = LOCK_HELD_OVER_RECLAIM_FS; + usage_bit = LOCK_ENABLED_RECLAIM_FS; break; default: @@ -2646,8 +2646,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, case LOCK_ENABLED_SOFTIRQ_READ: case LOCK_USED_IN_RECLAIM_FS: case LOCK_USED_IN_RECLAIM_FS_READ: - case LOCK_HELD_OVER_RECLAIM_FS: - case LOCK_HELD_OVER_RECLAIM_FS_READ: + case LOCK_ENABLED_RECLAIM_FS: + case LOCK_ENABLED_RECLAIM_FS_READ: ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; -- cgit v1.2.3-71-gd317 From 9851673bc32bc9fcafbbaeffc858ead434bd6d58 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Jan 2009 14:18:40 +0100 Subject: lockdep: move state bit definitions around For convenience later. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 49 ++++------------------------------------------ kernel/lockdep_internals.h | 46 +++++++++++++++++++++++++++++++++++++++++++ kernel/lockdep_states.h | 6 ++++++ 3 files changed, 56 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 6d729c9d1d27..5a58ea3e91e9 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -20,51 +20,10 @@ struct lockdep_map; #include /* - * Lock-class usage-state bits: + * We'd rather not expose kernel/lockdep_states.h this wide, but we do need + * the total number of states... :-( */ -enum lock_usage_bit -{ - LOCK_USED = 0, - LOCK_USED_IN_HARDIRQ, - LOCK_USED_IN_SOFTIRQ, - LOCK_USED_IN_RECLAIM_FS, - LOCK_ENABLED_SOFTIRQ, - LOCK_ENABLED_HARDIRQ, - LOCK_ENABLED_RECLAIM_FS, - LOCK_USED_IN_HARDIRQ_READ, - LOCK_USED_IN_SOFTIRQ_READ, - LOCK_USED_IN_RECLAIM_FS_READ, - LOCK_ENABLED_SOFTIRQ_READ, - LOCK_ENABLED_HARDIRQ_READ, - LOCK_ENABLED_RECLAIM_FS_READ, - LOCK_USAGE_STATES -}; - -/* - * Usage-state bitmasks: - */ -#define LOCKF_USED (1 << LOCK_USED) -#define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ) -#define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ) -#define LOCKF_USED_IN_RECLAIM_FS (1 << LOCK_USED_IN_RECLAIM_FS) -#define LOCKF_ENABLED_HARDIRQ (1 << LOCK_ENABLED_HARDIRQ) -#define LOCKF_ENABLED_SOFTIRQ (1 << LOCK_ENABLED_SOFTIRQ) -#define LOCKF_ENABLED_RECLAIM_FS (1 << LOCK_ENABLED_RECLAIM_FS) - -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) - -#define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ) -#define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ) -#define LOCKF_USED_IN_RECLAIM_FS_READ (1 << LOCK_USED_IN_RECLAIM_FS_READ) -#define LOCKF_ENABLED_HARDIRQ_READ (1 << LOCK_ENABLED_HARDIRQ_READ) -#define LOCKF_ENABLED_SOFTIRQ_READ (1 << LOCK_ENABLED_SOFTIRQ_READ) -#define LOCKF_ENABLED_RECLAIM_FS_READ (1 << LOCK_ENABLED_RECLAIM_FS_READ) - -#define LOCKF_ENABLED_IRQ_READ \ - (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ - (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define XXX_LOCK_USAGE_STATES (1+3*4) #define MAX_LOCKDEP_SUBCLASSES 8UL @@ -105,7 +64,7 @@ struct lock_class { * IRQ/softirq usage tracking bits: */ unsigned long usage_mask; - struct stack_trace usage_traces[LOCK_USAGE_STATES]; + struct stack_trace usage_traces[XXX_LOCK_USAGE_STATES]; /* * These fields represent a directed graph of lock dependencies, diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index e887b783244f..1352409cfef1 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -6,6 +6,52 @@ * lockdep subsystem internal functions and variables. 
*/ +/* + * Lock-class usage-state bits: + */ +enum lock_usage_bit { + LOCK_USED = 0, + LOCK_USED_IN_HARDIRQ, + LOCK_USED_IN_SOFTIRQ, + LOCK_USED_IN_RECLAIM_FS, + LOCK_ENABLED_SOFTIRQ, + LOCK_ENABLED_HARDIRQ, + LOCK_ENABLED_RECLAIM_FS, + LOCK_USED_IN_HARDIRQ_READ, + LOCK_USED_IN_SOFTIRQ_READ, + LOCK_USED_IN_RECLAIM_FS_READ, + LOCK_ENABLED_SOFTIRQ_READ, + LOCK_ENABLED_HARDIRQ_READ, + LOCK_ENABLED_RECLAIM_FS_READ, + LOCK_USAGE_STATES +}; + +/* + * Usage-state bitmasks: + */ +#define LOCKF_USED (1 << LOCK_USED) +#define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ) +#define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ) +#define LOCKF_USED_IN_RECLAIM_FS (1 << LOCK_USED_IN_RECLAIM_FS) +#define LOCKF_ENABLED_HARDIRQ (1 << LOCK_ENABLED_HARDIRQ) +#define LOCKF_ENABLED_SOFTIRQ (1 << LOCK_ENABLED_SOFTIRQ) +#define LOCKF_ENABLED_RECLAIM_FS (1 << LOCK_ENABLED_RECLAIM_FS) + +#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) +#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) + +#define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ) +#define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ) +#define LOCKF_USED_IN_RECLAIM_FS_READ (1 << LOCK_USED_IN_RECLAIM_FS_READ) +#define LOCKF_ENABLED_HARDIRQ_READ (1 << LOCK_ENABLED_HARDIRQ_READ) +#define LOCKF_ENABLED_SOFTIRQ_READ (1 << LOCK_ENABLED_SOFTIRQ_READ) +#define LOCKF_ENABLED_RECLAIM_FS_READ (1 << LOCK_ENABLED_RECLAIM_FS_READ) + +#define LOCKF_ENABLED_IRQ_READ \ + (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) +#define LOCKF_USED_IN_IRQ_READ \ + (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) + /* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h index 937039ef2dd0..995b0cc2b84c 100644 --- a/kernel/lockdep_states.h +++ b/kernel/lockdep_states.h @@ -1,3 +1,9 @@ +/* + * Lockdep states, + * + * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever + * you add one, or come up with a nice dynamic solution. + */ LOCKDEP_STATE(HARDIRQ) LOCKDEP_STATE(SOFTIRQ) LOCKDEP_STATE(RECLAIM_FS) -- cgit v1.2.3-71-gd317 From 35c26c2cf6a6a2d1c48add732d8ba002bd90784c Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 14 Feb 2009 22:56:56 -0800 Subject: rndis: remove private wrapper of __constant_cpu_to_le32 Use cpu_to_le32 directly as it handles constant folding now, replace direct uses of __constant_cpu_to_{endian} as well. Signed-off-by: Harvey Harrison Acked-by: David Brownell Signed-off-by: David S. 
Miller --- drivers/net/usb/rndis_host.c | 25 +++++------ drivers/net/wireless/rndis_wlan.c | 90 +++++++++++++++++++------------------- drivers/usb/gadget/rndis.c | 92 +++++++++++++++++++-------------------- include/linux/usb/rndis_host.h | 85 +++++++++++++++++------------------- 4 files changed, 143 insertions(+), 149 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index bcd858c567e0..b7f763e1298c 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -169,7 +169,7 @@ int rndis_command(struct usbnet *dev, struct rndis_msg_hdr *buf, int buflen) struct rndis_keepalive_c *msg = (void *)buf; msg->msg_type = RNDIS_MSG_KEEPALIVE_C; - msg->msg_len = ccpu2(sizeof *msg); + msg->msg_len = cpu_to_le32(sizeof *msg); msg->status = RNDIS_STATUS_SUCCESS; retval = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), @@ -237,7 +237,7 @@ static int rndis_query(struct usbnet *dev, struct usb_interface *intf, u.get->msg_len = cpu_to_le32(sizeof *u.get + in_len); u.get->oid = oid; u.get->len = cpu_to_le32(in_len); - u.get->offset = ccpu2(20); + u.get->offset = cpu_to_le32(20); retval = rndis_command(dev, u.header, CONTROL_BUFFER_SIZE); if (unlikely(retval < 0)) { @@ -297,9 +297,9 @@ generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags) goto fail; u.init->msg_type = RNDIS_MSG_INIT; - u.init->msg_len = ccpu2(sizeof *u.init); - u.init->major_version = ccpu2(1); - u.init->minor_version = ccpu2(0); + u.init->msg_len = cpu_to_le32(sizeof *u.init); + u.init->major_version = cpu_to_le32(1); + u.init->minor_version = cpu_to_le32(0); /* max transfer (in spec) is 0x4000 at full speed, but for * TX we'll stick to one Ethernet packet plus RNDIS framing. @@ -403,10 +403,10 @@ generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags) /* set a nonzero filter to enable data transfers */ memset(u.set, 0, sizeof *u.set); u.set->msg_type = RNDIS_MSG_SET; - u.set->msg_len = ccpu2(4 + sizeof *u.set); + u.set->msg_len = cpu_to_le32(4 + sizeof *u.set); u.set->oid = OID_GEN_CURRENT_PACKET_FILTER; - u.set->len = ccpu2(4); - u.set->offset = ccpu2((sizeof *u.set) - 8); + u.set->len = cpu_to_le32(4); + u.set->offset = cpu_to_le32((sizeof *u.set) - 8); *(__le32 *)(u.buf + sizeof *u.set) = RNDIS_DEFAULT_FILTER; retval = rndis_command(dev, u.header, CONTROL_BUFFER_SIZE); @@ -423,7 +423,7 @@ generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags) halt_fail_and_release: memset(u.halt, 0, sizeof *u.halt); u.halt->msg_type = RNDIS_MSG_HALT; - u.halt->msg_len = ccpu2(sizeof *u.halt); + u.halt->msg_len = cpu_to_le32(sizeof *u.halt); (void) rndis_command(dev, (void *)u.halt, CONTROL_BUFFER_SIZE); fail_and_release: usb_set_intfdata(info->data, NULL); @@ -448,7 +448,7 @@ void rndis_unbind(struct usbnet *dev, struct usb_interface *intf) halt = kzalloc(CONTROL_BUFFER_SIZE, GFP_KERNEL); if (halt) { halt->msg_type = RNDIS_MSG_HALT; - halt->msg_len = ccpu2(sizeof *halt); + halt->msg_len = cpu_to_le32(sizeof *halt); (void) rndis_command(dev, (void *)halt, CONTROL_BUFFER_SIZE); kfree(halt); } @@ -543,7 +543,7 @@ fill: memset(hdr, 0, sizeof *hdr); hdr->msg_type = RNDIS_MSG_PACKET; hdr->msg_len = cpu_to_le32(skb->len); - hdr->data_offset = ccpu2(sizeof(*hdr) - 8); + hdr->data_offset = cpu_to_le32(sizeof(*hdr) - 8); hdr->data_len = cpu_to_le32(len); /* FIXME make the last packet always be short ... 
*/ @@ -562,9 +562,6 @@ static const struct driver_info rndis_info = { .tx_fixup = rndis_tx_fixup, }; -#undef ccpu2 - - /*-------------------------------------------------------------------------*/ static const struct usb_device_id products [] = { diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index 105f214e21f4..82af21eeb592 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -90,44 +90,44 @@ MODULE_PARM_DESC(workaround_interval, /* various RNDIS OID defs */ -#define OID_GEN_LINK_SPEED ccpu2(0x00010107) -#define OID_GEN_RNDIS_CONFIG_PARAMETER ccpu2(0x0001021b) - -#define OID_GEN_XMIT_OK ccpu2(0x00020101) -#define OID_GEN_RCV_OK ccpu2(0x00020102) -#define OID_GEN_XMIT_ERROR ccpu2(0x00020103) -#define OID_GEN_RCV_ERROR ccpu2(0x00020104) -#define OID_GEN_RCV_NO_BUFFER ccpu2(0x00020105) - -#define OID_802_3_PERMANENT_ADDRESS ccpu2(0x01010101) -#define OID_802_3_CURRENT_ADDRESS ccpu2(0x01010102) -#define OID_802_3_MULTICAST_LIST ccpu2(0x01010103) -#define OID_802_3_MAXIMUM_LIST_SIZE ccpu2(0x01010104) - -#define OID_802_11_BSSID ccpu2(0x0d010101) -#define OID_802_11_SSID ccpu2(0x0d010102) -#define OID_802_11_INFRASTRUCTURE_MODE ccpu2(0x0d010108) -#define OID_802_11_ADD_WEP ccpu2(0x0d010113) -#define OID_802_11_REMOVE_WEP ccpu2(0x0d010114) -#define OID_802_11_DISASSOCIATE ccpu2(0x0d010115) -#define OID_802_11_AUTHENTICATION_MODE ccpu2(0x0d010118) -#define OID_802_11_PRIVACY_FILTER ccpu2(0x0d010119) -#define OID_802_11_BSSID_LIST_SCAN ccpu2(0x0d01011a) -#define OID_802_11_ENCRYPTION_STATUS ccpu2(0x0d01011b) -#define OID_802_11_ADD_KEY ccpu2(0x0d01011d) -#define OID_802_11_REMOVE_KEY ccpu2(0x0d01011e) -#define OID_802_11_ASSOCIATION_INFORMATION ccpu2(0x0d01011f) -#define OID_802_11_PMKID ccpu2(0x0d010123) -#define OID_802_11_NETWORK_TYPES_SUPPORTED ccpu2(0x0d010203) -#define OID_802_11_NETWORK_TYPE_IN_USE ccpu2(0x0d010204) -#define OID_802_11_TX_POWER_LEVEL ccpu2(0x0d010205) -#define OID_802_11_RSSI ccpu2(0x0d010206) -#define OID_802_11_RSSI_TRIGGER ccpu2(0x0d010207) -#define OID_802_11_FRAGMENTATION_THRESHOLD ccpu2(0x0d010209) -#define OID_802_11_RTS_THRESHOLD ccpu2(0x0d01020a) -#define OID_802_11_SUPPORTED_RATES ccpu2(0x0d01020e) -#define OID_802_11_CONFIGURATION ccpu2(0x0d010211) -#define OID_802_11_BSSID_LIST ccpu2(0x0d010217) +#define OID_GEN_LINK_SPEED cpu_to_le32(0x00010107) +#define OID_GEN_RNDIS_CONFIG_PARAMETER cpu_to_le32(0x0001021b) + +#define OID_GEN_XMIT_OK cpu_to_le32(0x00020101) +#define OID_GEN_RCV_OK cpu_to_le32(0x00020102) +#define OID_GEN_XMIT_ERROR cpu_to_le32(0x00020103) +#define OID_GEN_RCV_ERROR cpu_to_le32(0x00020104) +#define OID_GEN_RCV_NO_BUFFER cpu_to_le32(0x00020105) + +#define OID_802_3_PERMANENT_ADDRESS cpu_to_le32(0x01010101) +#define OID_802_3_CURRENT_ADDRESS cpu_to_le32(0x01010102) +#define OID_802_3_MULTICAST_LIST cpu_to_le32(0x01010103) +#define OID_802_3_MAXIMUM_LIST_SIZE cpu_to_le32(0x01010104) + +#define OID_802_11_BSSID cpu_to_le32(0x0d010101) +#define OID_802_11_SSID cpu_to_le32(0x0d010102) +#define OID_802_11_INFRASTRUCTURE_MODE cpu_to_le32(0x0d010108) +#define OID_802_11_ADD_WEP cpu_to_le32(0x0d010113) +#define OID_802_11_REMOVE_WEP cpu_to_le32(0x0d010114) +#define OID_802_11_DISASSOCIATE cpu_to_le32(0x0d010115) +#define OID_802_11_AUTHENTICATION_MODE cpu_to_le32(0x0d010118) +#define OID_802_11_PRIVACY_FILTER cpu_to_le32(0x0d010119) +#define OID_802_11_BSSID_LIST_SCAN cpu_to_le32(0x0d01011a) +#define OID_802_11_ENCRYPTION_STATUS cpu_to_le32(0x0d01011b) +#define OID_802_11_ADD_KEY 
cpu_to_le32(0x0d01011d) +#define OID_802_11_REMOVE_KEY cpu_to_le32(0x0d01011e) +#define OID_802_11_ASSOCIATION_INFORMATION cpu_to_le32(0x0d01011f) +#define OID_802_11_PMKID cpu_to_le32(0x0d010123) +#define OID_802_11_NETWORK_TYPES_SUPPORTED cpu_to_le32(0x0d010203) +#define OID_802_11_NETWORK_TYPE_IN_USE cpu_to_le32(0x0d010204) +#define OID_802_11_TX_POWER_LEVEL cpu_to_le32(0x0d010205) +#define OID_802_11_RSSI cpu_to_le32(0x0d010206) +#define OID_802_11_RSSI_TRIGGER cpu_to_le32(0x0d010207) +#define OID_802_11_FRAGMENTATION_THRESHOLD cpu_to_le32(0x0d010209) +#define OID_802_11_RTS_THRESHOLD cpu_to_le32(0x0d01020a) +#define OID_802_11_SUPPORTED_RATES cpu_to_le32(0x0d01020e) +#define OID_802_11_CONFIGURATION cpu_to_le32(0x0d010211) +#define OID_802_11_BSSID_LIST cpu_to_le32(0x0d010217) /* Typical noise/maximum signal level values taken from ndiswrapper iw_ndis.h */ @@ -144,8 +144,8 @@ MODULE_PARM_DESC(workaround_interval, /* codes for "status" field of completion messages */ -#define RNDIS_STATUS_ADAPTER_NOT_READY ccpu2(0xc0010011) -#define RNDIS_STATUS_ADAPTER_NOT_OPEN ccpu2(0xc0010012) +#define RNDIS_STATUS_ADAPTER_NOT_READY cpu_to_le32(0xc0010011) +#define RNDIS_STATUS_ADAPTER_NOT_OPEN cpu_to_le32(0xc0010012) /* NDIS data structures. Taken from wpa_supplicant driver_ndis.c @@ -442,7 +442,7 @@ static int rndis_query_oid(struct usbnet *dev, __le32 oid, void *data, int *len) memset(u.get, 0, sizeof *u.get); u.get->msg_type = RNDIS_MSG_QUERY; - u.get->msg_len = ccpu2(sizeof *u.get); + u.get->msg_len = cpu_to_le32(sizeof *u.get); u.get->oid = oid; ret = rndis_command(dev, u.header, buflen); @@ -491,8 +491,8 @@ static int rndis_set_oid(struct usbnet *dev, __le32 oid, void *data, int len) u.set->msg_len = cpu_to_le32(sizeof(*u.set) + len); u.set->oid = oid; u.set->len = cpu_to_le32(len); - u.set->offset = ccpu2(sizeof(*u.set) - 8); - u.set->handle = ccpu2(0); + u.set->offset = cpu_to_le32(sizeof(*u.set) - 8); + u.set->handle = cpu_to_le32(0); memcpy(u.buf + sizeof(*u.set), data, len); ret = rndis_command(dev, u.header, buflen); @@ -1630,7 +1630,7 @@ static int rndis_iw_set_scan(struct net_device *dev, devdbg(usbdev, "SIOCSIWSCAN"); if (wrqu->data.flags == 0) { - tmp = ccpu2(1); + tmp = cpu_to_le32(1); ret = rndis_set_oid(usbdev, OID_802_11_BSSID_LIST_SCAN, &tmp, sizeof(tmp)); evt.data.flags = 0; @@ -2428,7 +2428,7 @@ static void rndis_update_wireless_stats(struct work_struct *work) /* Send scan OID. Use of both OIDs is required to get device * working. 
*/ - tmp = ccpu2(1); + tmp = cpu_to_le32(1); rndis_set_oid(usbdev, OID_802_11_BSSID_LIST_SCAN, &tmp, sizeof(tmp)); diff --git a/drivers/usb/gadget/rndis.c b/drivers/usb/gadget/rndis.c index 8c26f5ea2b83..d2860a823680 100644 --- a/drivers/usb/gadget/rndis.c +++ b/drivers/usb/gadget/rndis.c @@ -63,7 +63,7 @@ MODULE_PARM_DESC (rndis_debug, "enable debugging"); static rndis_params rndis_per_dev_params [RNDIS_MAX_CONFIGS]; /* Driver Version */ -static const __le32 rndis_driver_version = __constant_cpu_to_le32 (1); +static const __le32 rndis_driver_version = cpu_to_le32 (1); /* Function Prototypes */ static rndis_resp_t *rndis_add_response (int configNr, u32 length); @@ -190,7 +190,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len, /* response goes here, right after the header */ outbuf = (__le32 *) &resp[1]; - resp->InformationBufferOffset = __constant_cpu_to_le32 (16); + resp->InformationBufferOffset = cpu_to_le32 (16); net = rndis_per_dev_params[configNr].dev; if (net->get_stats) @@ -221,7 +221,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len, * reddite ergo quae sunt Caesaris Caesari * et quae sunt Dei Deo! */ - *outbuf = __constant_cpu_to_le32 (0); + *outbuf = cpu_to_le32 (0); retval = 0; break; @@ -256,7 +256,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len, pr_debug("%s: OID_GEN_LINK_SPEED\n", __func__); if (rndis_per_dev_params [configNr].media_state == NDIS_MEDIA_STATE_DISCONNECTED) - *outbuf = __constant_cpu_to_le32 (0); + *outbuf = cpu_to_le32 (0); else *outbuf = cpu_to_le32 ( rndis_per_dev_params [configNr].speed); @@ -317,7 +317,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len, /* mandatory */ case OID_GEN_MAXIMUM_TOTAL_SIZE: pr_debug("%s: OID_GEN_MAXIMUM_TOTAL_SIZE\n", __func__); - *outbuf = __constant_cpu_to_le32(RNDIS_MAX_TOTAL_SIZE); + *outbuf = cpu_to_le32(RNDIS_MAX_TOTAL_SIZE); retval = 0; break; @@ -332,7 +332,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len, case OID_GEN_PHYSICAL_MEDIUM: pr_debug("%s: OID_GEN_PHYSICAL_MEDIUM\n", __func__); - *outbuf = __constant_cpu_to_le32 (0); + *outbuf = cpu_to_le32 (0); retval = 0; break; @@ -342,7 +342,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len, */ case OID_GEN_MAC_OPTIONS: /* from WinME */ pr_debug("%s: OID_GEN_MAC_OPTIONS\n", __func__); - *outbuf = __constant_cpu_to_le32( + *outbuf = cpu_to_le32( NDIS_MAC_OPTION_RECEIVE_SERIALIZED | NDIS_MAC_OPTION_FULL_DUPLEX); retval = 0; @@ -431,7 +431,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len, case OID_802_3_MULTICAST_LIST: pr_debug("%s: OID_802_3_MULTICAST_LIST\n", __func__); /* Multicast base address only */ - *outbuf = __constant_cpu_to_le32 (0xE0000000); + *outbuf = cpu_to_le32 (0xE0000000); retval = 0; break; @@ -439,7 +439,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len, case OID_802_3_MAXIMUM_LIST_SIZE: pr_debug("%s: OID_802_3_MAXIMUM_LIST_SIZE\n", __func__); /* Multicast base address only */ - *outbuf = __constant_cpu_to_le32 (1); + *outbuf = cpu_to_le32 (1); retval = 0; break; @@ -461,14 +461,14 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len, /* mandatory */ case OID_802_3_XMIT_ONE_COLLISION: pr_debug("%s: OID_802_3_XMIT_ONE_COLLISION\n", __func__); - *outbuf = __constant_cpu_to_le32 (0); + *outbuf = cpu_to_le32 (0); retval = 0; break; /* mandatory */ case OID_802_3_XMIT_MORE_COLLISIONS: pr_debug("%s: OID_802_3_XMIT_MORE_COLLISIONS\n", __func__); 
- *outbuf = __constant_cpu_to_le32 (0); + *outbuf = cpu_to_le32 (0); retval = 0; break; @@ -572,24 +572,24 @@ static int rndis_init_response (int configNr, rndis_init_msg_type *buf) return -ENOMEM; resp = (rndis_init_cmplt_type *) r->buf; - resp->MessageType = __constant_cpu_to_le32 ( + resp->MessageType = cpu_to_le32 ( REMOTE_NDIS_INITIALIZE_CMPLT); - resp->MessageLength = __constant_cpu_to_le32 (52); + resp->MessageLength = cpu_to_le32 (52); resp->RequestID = buf->RequestID; /* Still LE in msg buffer */ - resp->Status = __constant_cpu_to_le32 (RNDIS_STATUS_SUCCESS); - resp->MajorVersion = __constant_cpu_to_le32 (RNDIS_MAJOR_VERSION); - resp->MinorVersion = __constant_cpu_to_le32 (RNDIS_MINOR_VERSION); - resp->DeviceFlags = __constant_cpu_to_le32 (RNDIS_DF_CONNECTIONLESS); - resp->Medium = __constant_cpu_to_le32 (RNDIS_MEDIUM_802_3); - resp->MaxPacketsPerTransfer = __constant_cpu_to_le32 (1); + resp->Status = cpu_to_le32 (RNDIS_STATUS_SUCCESS); + resp->MajorVersion = cpu_to_le32 (RNDIS_MAJOR_VERSION); + resp->MinorVersion = cpu_to_le32 (RNDIS_MINOR_VERSION); + resp->DeviceFlags = cpu_to_le32 (RNDIS_DF_CONNECTIONLESS); + resp->Medium = cpu_to_le32 (RNDIS_MEDIUM_802_3); + resp->MaxPacketsPerTransfer = cpu_to_le32 (1); resp->MaxTransferSize = cpu_to_le32 ( params->dev->mtu + sizeof (struct ethhdr) + sizeof (struct rndis_packet_msg_type) + 22); - resp->PacketAlignmentFactor = __constant_cpu_to_le32 (0); - resp->AFListOffset = __constant_cpu_to_le32 (0); - resp->AFListSize = __constant_cpu_to_le32 (0); + resp->PacketAlignmentFactor = cpu_to_le32 (0); + resp->AFListOffset = cpu_to_le32 (0); + resp->AFListSize = cpu_to_le32 (0); params->resp_avail(params->v); return 0; @@ -617,7 +617,7 @@ static int rndis_query_response (int configNr, rndis_query_msg_type *buf) return -ENOMEM; resp = (rndis_query_cmplt_type *) r->buf; - resp->MessageType = __constant_cpu_to_le32 (REMOTE_NDIS_QUERY_CMPLT); + resp->MessageType = cpu_to_le32 (REMOTE_NDIS_QUERY_CMPLT); resp->RequestID = buf->RequestID; /* Still LE in msg buffer */ if (gen_ndis_query_resp (configNr, le32_to_cpu (buf->OID), @@ -626,13 +626,13 @@ static int rndis_query_response (int configNr, rndis_query_msg_type *buf) le32_to_cpu(buf->InformationBufferLength), r)) { /* OID not supported */ - resp->Status = __constant_cpu_to_le32 ( + resp->Status = cpu_to_le32 ( RNDIS_STATUS_NOT_SUPPORTED); - resp->MessageLength = __constant_cpu_to_le32 (sizeof *resp); - resp->InformationBufferLength = __constant_cpu_to_le32 (0); - resp->InformationBufferOffset = __constant_cpu_to_le32 (0); + resp->MessageLength = cpu_to_le32 (sizeof *resp); + resp->InformationBufferLength = cpu_to_le32 (0); + resp->InformationBufferOffset = cpu_to_le32 (0); } else - resp->Status = __constant_cpu_to_le32 (RNDIS_STATUS_SUCCESS); + resp->Status = cpu_to_le32 (RNDIS_STATUS_SUCCESS); params->resp_avail(params->v); return 0; @@ -665,14 +665,14 @@ static int rndis_set_response (int configNr, rndis_set_msg_type *buf) pr_debug("\n"); #endif - resp->MessageType = __constant_cpu_to_le32 (REMOTE_NDIS_SET_CMPLT); - resp->MessageLength = __constant_cpu_to_le32 (16); + resp->MessageType = cpu_to_le32 (REMOTE_NDIS_SET_CMPLT); + resp->MessageLength = cpu_to_le32 (16); resp->RequestID = buf->RequestID; /* Still LE in msg buffer */ if (gen_ndis_set_resp (configNr, le32_to_cpu (buf->OID), ((u8 *) buf) + 8 + BufOffset, BufLength, r)) - resp->Status = __constant_cpu_to_le32 (RNDIS_STATUS_NOT_SUPPORTED); + resp->Status = cpu_to_le32 (RNDIS_STATUS_NOT_SUPPORTED); else - resp->Status = __constant_cpu_to_le32 
(RNDIS_STATUS_SUCCESS); + resp->Status = cpu_to_le32 (RNDIS_STATUS_SUCCESS); params->resp_avail(params->v); return 0; @@ -689,11 +689,11 @@ static int rndis_reset_response (int configNr, rndis_reset_msg_type *buf) return -ENOMEM; resp = (rndis_reset_cmplt_type *) r->buf; - resp->MessageType = __constant_cpu_to_le32 (REMOTE_NDIS_RESET_CMPLT); - resp->MessageLength = __constant_cpu_to_le32 (16); - resp->Status = __constant_cpu_to_le32 (RNDIS_STATUS_SUCCESS); + resp->MessageType = cpu_to_le32 (REMOTE_NDIS_RESET_CMPLT); + resp->MessageLength = cpu_to_le32 (16); + resp->Status = cpu_to_le32 (RNDIS_STATUS_SUCCESS); /* resent information */ - resp->AddressingReset = __constant_cpu_to_le32 (1); + resp->AddressingReset = cpu_to_le32 (1); params->resp_avail(params->v); return 0; @@ -713,11 +713,11 @@ static int rndis_keepalive_response (int configNr, return -ENOMEM; resp = (rndis_keepalive_cmplt_type *) r->buf; - resp->MessageType = __constant_cpu_to_le32 ( + resp->MessageType = cpu_to_le32 ( REMOTE_NDIS_KEEPALIVE_CMPLT); - resp->MessageLength = __constant_cpu_to_le32 (16); + resp->MessageLength = cpu_to_le32 (16); resp->RequestID = buf->RequestID; /* Still LE in msg buffer */ - resp->Status = __constant_cpu_to_le32 (RNDIS_STATUS_SUCCESS); + resp->Status = cpu_to_le32 (RNDIS_STATUS_SUCCESS); params->resp_avail(params->v); return 0; @@ -742,12 +742,12 @@ static int rndis_indicate_status_msg (int configNr, u32 status) return -ENOMEM; resp = (rndis_indicate_status_msg_type *) r->buf; - resp->MessageType = __constant_cpu_to_le32 ( + resp->MessageType = cpu_to_le32 ( REMOTE_NDIS_INDICATE_STATUS_MSG); - resp->MessageLength = __constant_cpu_to_le32 (20); + resp->MessageLength = cpu_to_le32 (20); resp->Status = cpu_to_le32 (status); - resp->StatusBufferLength = __constant_cpu_to_le32 (0); - resp->StatusBufferOffset = __constant_cpu_to_le32 (0); + resp->StatusBufferLength = cpu_to_le32 (0); + resp->StatusBufferOffset = cpu_to_le32 (0); params->resp_avail(params->v); return 0; @@ -963,9 +963,9 @@ void rndis_add_hdr (struct sk_buff *skb) return; header = (void *) skb_push (skb, sizeof *header); memset (header, 0, sizeof *header); - header->MessageType = __constant_cpu_to_le32(REMOTE_NDIS_PACKET_MSG); + header->MessageType = cpu_to_le32(REMOTE_NDIS_PACKET_MSG); header->MessageLength = cpu_to_le32(skb->len); - header->DataOffset = __constant_cpu_to_le32 (36); + header->DataOffset = cpu_to_le32 (36); header->DataLength = cpu_to_le32(skb->len - sizeof *header); } @@ -1029,7 +1029,7 @@ int rndis_rm_hdr(struct sk_buff *skb) __le32 *tmp = (void *) skb->data; /* MessageType, MessageLength */ - if (__constant_cpu_to_le32(REMOTE_NDIS_PACKET_MSG) + if (cpu_to_le32(REMOTE_NDIS_PACKET_MSG) != get_unaligned(tmp++)) return -EINVAL; tmp++; diff --git a/include/linux/usb/rndis_host.h b/include/linux/usb/rndis_host.h index 0a6e6d4b929a..37836b937d97 100644 --- a/include/linux/usb/rndis_host.h +++ b/include/linux/usb/rndis_host.h @@ -49,48 +49,45 @@ struct rndis_msg_hdr { */ #define RNDIS_CONTROL_TIMEOUT_MS (5 * 1000) - -#define ccpu2 __constant_cpu_to_le32 - -#define RNDIS_MSG_COMPLETION ccpu2(0x80000000) +#define RNDIS_MSG_COMPLETION cpu_to_le32(0x80000000) /* codes for "msg_type" field of rndis messages; * only the data channel uses packet messages (maybe batched); * everything else goes on the control channel. 
*/ -#define RNDIS_MSG_PACKET ccpu2(0x00000001) /* 1-N packets */ -#define RNDIS_MSG_INIT ccpu2(0x00000002) +#define RNDIS_MSG_PACKET cpu_to_le32(0x00000001) /* 1-N packets */ +#define RNDIS_MSG_INIT cpu_to_le32(0x00000002) #define RNDIS_MSG_INIT_C (RNDIS_MSG_INIT|RNDIS_MSG_COMPLETION) -#define RNDIS_MSG_HALT ccpu2(0x00000003) -#define RNDIS_MSG_QUERY ccpu2(0x00000004) +#define RNDIS_MSG_HALT cpu_to_le32(0x00000003) +#define RNDIS_MSG_QUERY cpu_to_le32(0x00000004) #define RNDIS_MSG_QUERY_C (RNDIS_MSG_QUERY|RNDIS_MSG_COMPLETION) -#define RNDIS_MSG_SET ccpu2(0x00000005) +#define RNDIS_MSG_SET cpu_to_le32(0x00000005) #define RNDIS_MSG_SET_C (RNDIS_MSG_SET|RNDIS_MSG_COMPLETION) -#define RNDIS_MSG_RESET ccpu2(0x00000006) +#define RNDIS_MSG_RESET cpu_to_le32(0x00000006) #define RNDIS_MSG_RESET_C (RNDIS_MSG_RESET|RNDIS_MSG_COMPLETION) -#define RNDIS_MSG_INDICATE ccpu2(0x00000007) -#define RNDIS_MSG_KEEPALIVE ccpu2(0x00000008) +#define RNDIS_MSG_INDICATE cpu_to_le32(0x00000007) +#define RNDIS_MSG_KEEPALIVE cpu_to_le32(0x00000008) #define RNDIS_MSG_KEEPALIVE_C (RNDIS_MSG_KEEPALIVE|RNDIS_MSG_COMPLETION) /* codes for "status" field of completion messages */ -#define RNDIS_STATUS_SUCCESS ccpu2(0x00000000) -#define RNDIS_STATUS_FAILURE ccpu2(0xc0000001) -#define RNDIS_STATUS_INVALID_DATA ccpu2(0xc0010015) -#define RNDIS_STATUS_NOT_SUPPORTED ccpu2(0xc00000bb) -#define RNDIS_STATUS_MEDIA_CONNECT ccpu2(0x4001000b) -#define RNDIS_STATUS_MEDIA_DISCONNECT ccpu2(0x4001000c) +#define RNDIS_STATUS_SUCCESS cpu_to_le32(0x00000000) +#define RNDIS_STATUS_FAILURE cpu_to_le32(0xc0000001) +#define RNDIS_STATUS_INVALID_DATA cpu_to_le32(0xc0010015) +#define RNDIS_STATUS_NOT_SUPPORTED cpu_to_le32(0xc00000bb) +#define RNDIS_STATUS_MEDIA_CONNECT cpu_to_le32(0x4001000b) +#define RNDIS_STATUS_MEDIA_DISCONNECT cpu_to_le32(0x4001000c) /* codes for OID_GEN_PHYSICAL_MEDIUM */ -#define RNDIS_PHYSICAL_MEDIUM_UNSPECIFIED ccpu2(0x00000000) -#define RNDIS_PHYSICAL_MEDIUM_WIRELESS_LAN ccpu2(0x00000001) -#define RNDIS_PHYSICAL_MEDIUM_CABLE_MODEM ccpu2(0x00000002) -#define RNDIS_PHYSICAL_MEDIUM_PHONE_LINE ccpu2(0x00000003) -#define RNDIS_PHYSICAL_MEDIUM_POWER_LINE ccpu2(0x00000004) -#define RNDIS_PHYSICAL_MEDIUM_DSL ccpu2(0x00000005) -#define RNDIS_PHYSICAL_MEDIUM_FIBRE_CHANNEL ccpu2(0x00000006) -#define RNDIS_PHYSICAL_MEDIUM_1394 ccpu2(0x00000007) -#define RNDIS_PHYSICAL_MEDIUM_WIRELESS_WAN ccpu2(0x00000008) -#define RNDIS_PHYSICAL_MEDIUM_MAX ccpu2(0x00000009) +#define RNDIS_PHYSICAL_MEDIUM_UNSPECIFIED cpu_to_le32(0x00000000) +#define RNDIS_PHYSICAL_MEDIUM_WIRELESS_LAN cpu_to_le32(0x00000001) +#define RNDIS_PHYSICAL_MEDIUM_CABLE_MODEM cpu_to_le32(0x00000002) +#define RNDIS_PHYSICAL_MEDIUM_PHONE_LINE cpu_to_le32(0x00000003) +#define RNDIS_PHYSICAL_MEDIUM_POWER_LINE cpu_to_le32(0x00000004) +#define RNDIS_PHYSICAL_MEDIUM_DSL cpu_to_le32(0x00000005) +#define RNDIS_PHYSICAL_MEDIUM_FIBRE_CHANNEL cpu_to_le32(0x00000006) +#define RNDIS_PHYSICAL_MEDIUM_1394 cpu_to_le32(0x00000007) +#define RNDIS_PHYSICAL_MEDIUM_WIRELESS_WAN cpu_to_le32(0x00000008) +#define RNDIS_PHYSICAL_MEDIUM_MAX cpu_to_le32(0x00000009) struct rndis_data_hdr { __le32 msg_type; /* RNDIS_MSG_PACKET */ @@ -228,24 +225,24 @@ struct rndis_keepalive_c { /* IN (optionally OUT) */ * there are gobs more that may optionally be supported. We'll avoid as much * of that mess as possible. 
*/ -#define OID_802_3_PERMANENT_ADDRESS ccpu2(0x01010101) -#define OID_GEN_MAXIMUM_FRAME_SIZE ccpu2(0x00010106) -#define OID_GEN_CURRENT_PACKET_FILTER ccpu2(0x0001010e) -#define OID_GEN_PHYSICAL_MEDIUM ccpu2(0x00010202) +#define OID_802_3_PERMANENT_ADDRESS cpu_to_le32(0x01010101) +#define OID_GEN_MAXIMUM_FRAME_SIZE cpu_to_le32(0x00010106) +#define OID_GEN_CURRENT_PACKET_FILTER cpu_to_le32(0x0001010e) +#define OID_GEN_PHYSICAL_MEDIUM cpu_to_le32(0x00010202) /* packet filter bits used by OID_GEN_CURRENT_PACKET_FILTER */ -#define RNDIS_PACKET_TYPE_DIRECTED ccpu2(0x00000001) -#define RNDIS_PACKET_TYPE_MULTICAST ccpu2(0x00000002) -#define RNDIS_PACKET_TYPE_ALL_MULTICAST ccpu2(0x00000004) -#define RNDIS_PACKET_TYPE_BROADCAST ccpu2(0x00000008) -#define RNDIS_PACKET_TYPE_SOURCE_ROUTING ccpu2(0x00000010) -#define RNDIS_PACKET_TYPE_PROMISCUOUS ccpu2(0x00000020) -#define RNDIS_PACKET_TYPE_SMT ccpu2(0x00000040) -#define RNDIS_PACKET_TYPE_ALL_LOCAL ccpu2(0x00000080) -#define RNDIS_PACKET_TYPE_GROUP ccpu2(0x00001000) -#define RNDIS_PACKET_TYPE_ALL_FUNCTIONAL ccpu2(0x00002000) -#define RNDIS_PACKET_TYPE_FUNCTIONAL ccpu2(0x00004000) -#define RNDIS_PACKET_TYPE_MAC_FRAME ccpu2(0x00008000) +#define RNDIS_PACKET_TYPE_DIRECTED cpu_to_le32(0x00000001) +#define RNDIS_PACKET_TYPE_MULTICAST cpu_to_le32(0x00000002) +#define RNDIS_PACKET_TYPE_ALL_MULTICAST cpu_to_le32(0x00000004) +#define RNDIS_PACKET_TYPE_BROADCAST cpu_to_le32(0x00000008) +#define RNDIS_PACKET_TYPE_SOURCE_ROUTING cpu_to_le32(0x00000010) +#define RNDIS_PACKET_TYPE_PROMISCUOUS cpu_to_le32(0x00000020) +#define RNDIS_PACKET_TYPE_SMT cpu_to_le32(0x00000040) +#define RNDIS_PACKET_TYPE_ALL_LOCAL cpu_to_le32(0x00000080) +#define RNDIS_PACKET_TYPE_GROUP cpu_to_le32(0x00001000) +#define RNDIS_PACKET_TYPE_ALL_FUNCTIONAL cpu_to_le32(0x00002000) +#define RNDIS_PACKET_TYPE_FUNCTIONAL cpu_to_le32(0x00004000) +#define RNDIS_PACKET_TYPE_MAC_FRAME cpu_to_le32(0x00008000) /* default filter used with RNDIS devices */ #define RNDIS_DEFAULT_FILTER ( \ -- cgit v1.2.3-71-gd317 From f3a7c66b5ce0b75a9774a50b5dcce93e5ba28370 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 14 Feb 2009 22:58:35 -0800 Subject: net: replace __constant_{endian} uses in net headers Base versions handle constant folding now. For headers exposed to userspace, we must only expose the __ prefixed versions. Signed-off-by: Harvey Harrison Signed-off-by: David S. 
Miller --- include/linux/if_pppox.h | 20 ++++----- include/linux/if_tunnel.h | 16 +++---- include/linux/ncp_no.h | 26 ++++++------ include/linux/netdevice.h | 4 +- include/linux/netfilter_bridge.h | 4 +- include/linux/pim.h | 4 +- include/linux/sctp.h | 90 ++++++++++++++++++++-------------------- include/linux/tcp.h | 20 ++++----- include/net/inet_ecn.h | 4 +- include/net/ip_vs.h | 4 +- include/net/ipv6.h | 4 +- include/net/ipx.h | 2 +- include/net/transp_v6.h | 2 +- include/rdma/ib_verbs.h | 2 +- 14 files changed, 101 insertions(+), 101 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 30c88b2245ff..90b5fae5d714 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -95,16 +95,16 @@ struct pppoe_tag { } __attribute ((packed)); /* Tag identifiers */ -#define PTT_EOL __constant_htons(0x0000) -#define PTT_SRV_NAME __constant_htons(0x0101) -#define PTT_AC_NAME __constant_htons(0x0102) -#define PTT_HOST_UNIQ __constant_htons(0x0103) -#define PTT_AC_COOKIE __constant_htons(0x0104) -#define PTT_VENDOR __constant_htons(0x0105) -#define PTT_RELAY_SID __constant_htons(0x0110) -#define PTT_SRV_ERR __constant_htons(0x0201) -#define PTT_SYS_ERR __constant_htons(0x0202) -#define PTT_GEN_ERR __constant_htons(0x0203) +#define PTT_EOL __cpu_to_be16(0x0000) +#define PTT_SRV_NAME __cpu_to_be16(0x0101) +#define PTT_AC_NAME __cpu_to_be16(0x0102) +#define PTT_HOST_UNIQ __cpu_to_be16(0x0103) +#define PTT_AC_COOKIE __cpu_to_be16(0x0104) +#define PTT_VENDOR __cpu_to_be16(0x0105) +#define PTT_RELAY_SID __cpu_to_be16(0x0110) +#define PTT_SRV_ERR __cpu_to_be16(0x0201) +#define PTT_SYS_ERR __cpu_to_be16(0x0202) +#define PTT_GEN_ERR __cpu_to_be16(0x0203) struct pppoe_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h index 82c43624c067..5a9aae4adb44 100644 --- a/include/linux/if_tunnel.h +++ b/include/linux/if_tunnel.h @@ -16,14 +16,14 @@ #define SIOCDELPRL (SIOCDEVPRIVATE + 6) #define SIOCCHGPRL (SIOCDEVPRIVATE + 7) -#define GRE_CSUM __constant_htons(0x8000) -#define GRE_ROUTING __constant_htons(0x4000) -#define GRE_KEY __constant_htons(0x2000) -#define GRE_SEQ __constant_htons(0x1000) -#define GRE_STRICT __constant_htons(0x0800) -#define GRE_REC __constant_htons(0x0700) -#define GRE_FLAGS __constant_htons(0x00F8) -#define GRE_VERSION __constant_htons(0x0007) +#define GRE_CSUM __cpu_to_be16(0x8000) +#define GRE_ROUTING __cpu_to_be16(0x4000) +#define GRE_KEY __cpu_to_be16(0x2000) +#define GRE_SEQ __cpu_to_be16(0x1000) +#define GRE_STRICT __cpu_to_be16(0x0800) +#define GRE_REC __cpu_to_be16(0x0700) +#define GRE_FLAGS __cpu_to_be16(0x00F8) +#define GRE_VERSION __cpu_to_be16(0x0007) struct ip_tunnel_parm { diff --git a/include/linux/ncp_no.h b/include/linux/ncp_no.h index f56a696a7cc6..cddaa48fb182 100644 --- a/include/linux/ncp_no.h +++ b/include/linux/ncp_no.h @@ -2,18 +2,18 @@ #define _NCP_NO /* these define the attribute byte as seen by NCP */ -#define aRONLY (__constant_cpu_to_le32(1)) -#define aHIDDEN (__constant_cpu_to_le32(2)) -#define aSYSTEM (__constant_cpu_to_le32(4)) -#define aEXECUTE (__constant_cpu_to_le32(8)) -#define aDIR (__constant_cpu_to_le32(0x10)) -#define aARCH (__constant_cpu_to_le32(0x20)) -#define aSHARED (__constant_cpu_to_le32(0x80)) -#define aDONTSUBALLOCATE (__constant_cpu_to_le32(1L<<11)) -#define aTRANSACTIONAL (__constant_cpu_to_le32(1L<<12)) -#define aPURGE (__constant_cpu_to_le32(1L<<16)) -#define aRENAMEINHIBIT (__constant_cpu_to_le32(1L<<17)) 
-#define aDELETEINHIBIT (__constant_cpu_to_le32(1L<<18)) -#define aDONTCOMPRESS (__constant_cpu_to_le32(1L<<27)) +#define aRONLY (__cpu_to_le32(1)) +#define aHIDDEN (__cpu_to_le32(2)) +#define aSYSTEM (__cpu_to_le32(4)) +#define aEXECUTE (__cpu_to_le32(8)) +#define aDIR (__cpu_to_le32(0x10)) +#define aARCH (__cpu_to_le32(0x20)) +#define aSHARED (__cpu_to_le32(0x80)) +#define aDONTSUBALLOCATE (__cpu_to_le32(1L<<11)) +#define aTRANSACTIONAL (__cpu_to_le32(1L<<12)) +#define aPURGE (__cpu_to_le32(1L<<16)) +#define aRENAMEINHIBIT (__cpu_to_le32(1L<<17)) +#define aDELETEINHIBIT (__cpu_to_le32(1L<<18)) +#define aDONTCOMPRESS (__cpu_to_le32(1L<<27)) #endif /* _NCP_NO */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 355662aac940..bd8b4ca85a2a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1863,7 +1863,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) if (dev->priv_flags & IFF_SLAVE_INACTIVE) { if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && - skb->protocol == __constant_htons(ETH_P_ARP)) + skb->protocol == __cpu_to_be16(ETH_P_ARP)) return 0; if (master->priv_flags & IFF_MASTER_ALB) { @@ -1872,7 +1872,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) return 0; } if (master->priv_flags & IFF_MASTER_8023AD && - skb->protocol == __constant_htons(ETH_P_SLOW)) + skb->protocol == __cpu_to_be16(ETH_P_SLOW)) return 0; return 1; diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 499aa9375901..f8105e54716a 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -59,9 +59,9 @@ static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_8021Q): + case __cpu_to_be16(ETH_P_8021Q): return VLAN_HLEN; - case __constant_htons(ETH_P_PPP_SES): + case __cpu_to_be16(ETH_P_PPP_SES): return PPPOE_SES_HLEN; default: return 0; diff --git a/include/linux/pim.h b/include/linux/pim.h index 1ba0661561a4..252bf6644c51 100644 --- a/include/linux/pim.h +++ b/include/linux/pim.h @@ -4,14 +4,14 @@ #include /* Message types - V1 */ -#define PIM_V1_VERSION __constant_htonl(0x10000000) +#define PIM_V1_VERSION cpu_to_be32(0x10000000) #define PIM_V1_REGISTER 1 /* Message types - V2 */ #define PIM_VERSION 2 #define PIM_REGISTER 1 -#define PIM_NULL_REGISTER __constant_htonl(0x40000000) +#define PIM_NULL_REGISTER cpu_to_be32(0x40000000) /* PIMv2 register message header layout (ietf-draft-idmr-pimvsm-v2-00.ps */ struct pimreghdr diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 8ba1c320f975..bd50b371ffaa 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -172,35 +172,35 @@ typedef struct sctp_paramhdr { typedef enum { /* RFC 2960 Section 3.3.5 */ - SCTP_PARAM_HEARTBEAT_INFO = __constant_htons(1), + SCTP_PARAM_HEARTBEAT_INFO = cpu_to_be16(1), /* RFC 2960 Section 3.3.2.1 */ - SCTP_PARAM_IPV4_ADDRESS = __constant_htons(5), - SCTP_PARAM_IPV6_ADDRESS = __constant_htons(6), - SCTP_PARAM_STATE_COOKIE = __constant_htons(7), - SCTP_PARAM_UNRECOGNIZED_PARAMETERS = __constant_htons(8), - SCTP_PARAM_COOKIE_PRESERVATIVE = __constant_htons(9), - SCTP_PARAM_HOST_NAME_ADDRESS = __constant_htons(11), - SCTP_PARAM_SUPPORTED_ADDRESS_TYPES = __constant_htons(12), - SCTP_PARAM_ECN_CAPABLE = __constant_htons(0x8000), + SCTP_PARAM_IPV4_ADDRESS = cpu_to_be16(5), + SCTP_PARAM_IPV6_ADDRESS = cpu_to_be16(6), + SCTP_PARAM_STATE_COOKIE = cpu_to_be16(7), + 
SCTP_PARAM_UNRECOGNIZED_PARAMETERS = cpu_to_be16(8), + SCTP_PARAM_COOKIE_PRESERVATIVE = cpu_to_be16(9), + SCTP_PARAM_HOST_NAME_ADDRESS = cpu_to_be16(11), + SCTP_PARAM_SUPPORTED_ADDRESS_TYPES = cpu_to_be16(12), + SCTP_PARAM_ECN_CAPABLE = cpu_to_be16(0x8000), /* AUTH Extension Section 3 */ - SCTP_PARAM_RANDOM = __constant_htons(0x8002), - SCTP_PARAM_CHUNKS = __constant_htons(0x8003), - SCTP_PARAM_HMAC_ALGO = __constant_htons(0x8004), + SCTP_PARAM_RANDOM = cpu_to_be16(0x8002), + SCTP_PARAM_CHUNKS = cpu_to_be16(0x8003), + SCTP_PARAM_HMAC_ALGO = cpu_to_be16(0x8004), /* Add-IP: Supported Extensions, Section 4.2 */ - SCTP_PARAM_SUPPORTED_EXT = __constant_htons(0x8008), + SCTP_PARAM_SUPPORTED_EXT = cpu_to_be16(0x8008), /* PR-SCTP Sec 3.1 */ - SCTP_PARAM_FWD_TSN_SUPPORT = __constant_htons(0xc000), + SCTP_PARAM_FWD_TSN_SUPPORT = cpu_to_be16(0xc000), /* Add-IP Extension. Section 3.2 */ - SCTP_PARAM_ADD_IP = __constant_htons(0xc001), - SCTP_PARAM_DEL_IP = __constant_htons(0xc002), - SCTP_PARAM_ERR_CAUSE = __constant_htons(0xc003), - SCTP_PARAM_SET_PRIMARY = __constant_htons(0xc004), - SCTP_PARAM_SUCCESS_REPORT = __constant_htons(0xc005), - SCTP_PARAM_ADAPTATION_LAYER_IND = __constant_htons(0xc006), + SCTP_PARAM_ADD_IP = cpu_to_be16(0xc001), + SCTP_PARAM_DEL_IP = cpu_to_be16(0xc002), + SCTP_PARAM_ERR_CAUSE = cpu_to_be16(0xc003), + SCTP_PARAM_SET_PRIMARY = cpu_to_be16(0xc004), + SCTP_PARAM_SUCCESS_REPORT = cpu_to_be16(0xc005), + SCTP_PARAM_ADAPTATION_LAYER_IND = cpu_to_be16(0xc006), } sctp_param_t; /* enum */ @@ -212,13 +212,13 @@ typedef enum { * */ typedef enum { - SCTP_PARAM_ACTION_DISCARD = __constant_htons(0x0000), - SCTP_PARAM_ACTION_DISCARD_ERR = __constant_htons(0x4000), - SCTP_PARAM_ACTION_SKIP = __constant_htons(0x8000), - SCTP_PARAM_ACTION_SKIP_ERR = __constant_htons(0xc000), + SCTP_PARAM_ACTION_DISCARD = cpu_to_be16(0x0000), + SCTP_PARAM_ACTION_DISCARD_ERR = cpu_to_be16(0x4000), + SCTP_PARAM_ACTION_SKIP = cpu_to_be16(0x8000), + SCTP_PARAM_ACTION_SKIP_ERR = cpu_to_be16(0xc000), } sctp_param_action_t; -enum { SCTP_PARAM_ACTION_MASK = __constant_htons(0xc000), }; +enum { SCTP_PARAM_ACTION_MASK = cpu_to_be16(0xc000), }; /* RFC 2960 Section 3.3.1 Payload Data (DATA) (0) */ @@ -457,17 +457,17 @@ typedef struct sctp_operr_chunk { */ typedef enum { - SCTP_ERROR_NO_ERROR = __constant_htons(0x00), - SCTP_ERROR_INV_STRM = __constant_htons(0x01), - SCTP_ERROR_MISS_PARAM = __constant_htons(0x02), - SCTP_ERROR_STALE_COOKIE = __constant_htons(0x03), - SCTP_ERROR_NO_RESOURCE = __constant_htons(0x04), - SCTP_ERROR_DNS_FAILED = __constant_htons(0x05), - SCTP_ERROR_UNKNOWN_CHUNK = __constant_htons(0x06), - SCTP_ERROR_INV_PARAM = __constant_htons(0x07), - SCTP_ERROR_UNKNOWN_PARAM = __constant_htons(0x08), - SCTP_ERROR_NO_DATA = __constant_htons(0x09), - SCTP_ERROR_COOKIE_IN_SHUTDOWN = __constant_htons(0x0a), + SCTP_ERROR_NO_ERROR = cpu_to_be16(0x00), + SCTP_ERROR_INV_STRM = cpu_to_be16(0x01), + SCTP_ERROR_MISS_PARAM = cpu_to_be16(0x02), + SCTP_ERROR_STALE_COOKIE = cpu_to_be16(0x03), + SCTP_ERROR_NO_RESOURCE = cpu_to_be16(0x04), + SCTP_ERROR_DNS_FAILED = cpu_to_be16(0x05), + SCTP_ERROR_UNKNOWN_CHUNK = cpu_to_be16(0x06), + SCTP_ERROR_INV_PARAM = cpu_to_be16(0x07), + SCTP_ERROR_UNKNOWN_PARAM = cpu_to_be16(0x08), + SCTP_ERROR_NO_DATA = cpu_to_be16(0x09), + SCTP_ERROR_COOKIE_IN_SHUTDOWN = cpu_to_be16(0x0a), /* SCTP Implementation Guide: @@ -476,9 +476,9 @@ typedef enum { * 13 Protocol Violation */ - SCTP_ERROR_RESTART = __constant_htons(0x0b), - SCTP_ERROR_USER_ABORT = __constant_htons(0x0c), - 
SCTP_ERROR_PROTO_VIOLATION = __constant_htons(0x0d), + SCTP_ERROR_RESTART = cpu_to_be16(0x0b), + SCTP_ERROR_USER_ABORT = cpu_to_be16(0x0c), + SCTP_ERROR_PROTO_VIOLATION = cpu_to_be16(0x0d), /* ADDIP Section 3.3 New Error Causes * @@ -493,11 +493,11 @@ typedef enum { * 0x0103 Association Aborted due to illegal ASCONF-ACK * 0x0104 Request refused - no authorization. */ - SCTP_ERROR_DEL_LAST_IP = __constant_htons(0x0100), - SCTP_ERROR_RSRC_LOW = __constant_htons(0x0101), - SCTP_ERROR_DEL_SRC_IP = __constant_htons(0x0102), - SCTP_ERROR_ASCONF_ACK = __constant_htons(0x0103), - SCTP_ERROR_REQ_REFUSED = __constant_htons(0x0104), + SCTP_ERROR_DEL_LAST_IP = cpu_to_be16(0x0100), + SCTP_ERROR_RSRC_LOW = cpu_to_be16(0x0101), + SCTP_ERROR_DEL_SRC_IP = cpu_to_be16(0x0102), + SCTP_ERROR_ASCONF_ACK = cpu_to_be16(0x0103), + SCTP_ERROR_REQ_REFUSED = cpu_to_be16(0x0104), /* AUTH Section 4. New Error Cause * @@ -509,7 +509,7 @@ typedef enum { * -------------------------------------------------------------- * 0x0105 Unsupported HMAC Identifier */ - SCTP_ERROR_UNSUP_HMAC = __constant_htons(0x0105) + SCTP_ERROR_UNSUP_HMAC = cpu_to_be16(0x0105) } sctp_error_t; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fe77e1499ab7..0cd99e6baca5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -69,16 +69,16 @@ union tcp_word_hdr { #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) enum { - TCP_FLAG_CWR = __constant_htonl(0x00800000), - TCP_FLAG_ECE = __constant_htonl(0x00400000), - TCP_FLAG_URG = __constant_htonl(0x00200000), - TCP_FLAG_ACK = __constant_htonl(0x00100000), - TCP_FLAG_PSH = __constant_htonl(0x00080000), - TCP_FLAG_RST = __constant_htonl(0x00040000), - TCP_FLAG_SYN = __constant_htonl(0x00020000), - TCP_FLAG_FIN = __constant_htonl(0x00010000), - TCP_RESERVED_BITS = __constant_htonl(0x0F000000), - TCP_DATA_OFFSET = __constant_htonl(0xF0000000) + TCP_FLAG_CWR = __cpu_to_be32(0x00800000), + TCP_FLAG_ECE = __cpu_to_be32(0x00400000), + TCP_FLAG_URG = __cpu_to_be32(0x00200000), + TCP_FLAG_ACK = __cpu_to_be32(0x00100000), + TCP_FLAG_PSH = __cpu_to_be32(0x00080000), + TCP_FLAG_RST = __cpu_to_be32(0x00040000), + TCP_FLAG_SYN = __cpu_to_be32(0x00020000), + TCP_FLAG_FIN = __cpu_to_be32(0x00010000), + TCP_RESERVED_BITS = __cpu_to_be32(0x0F000000), + TCP_DATA_OFFSET = __cpu_to_be32(0xF0000000) }; /* TCP socket options */ diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 7040a782c656..9b5d08f4f6e8 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -113,12 +113,12 @@ static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) static inline int INET_ECN_set_ce(struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case cpu_to_be16(ETH_P_IP): if (skb->network_header + sizeof(struct iphdr) <= skb->tail) return IP_ECN_set_ce(ip_hdr(skb)); break; - case __constant_htons(ETH_P_IPV6): + case cpu_to_be16(ETH_P_IPV6): if (skb->network_header + sizeof(struct ipv6hdr) <= skb->tail) return IP6_ECN_set_ce(ipv6_hdr(skb)); break; diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ab9b003ab671..bbae1e87efcd 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -184,8 +184,8 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, /* * The port number of FTP service (in network order). 
*/ -#define FTPPORT __constant_htons(21) -#define FTPDATA __constant_htons(20) +#define FTPPORT cpu_to_be16(21) +#define FTPDATA cpu_to_be16(20) /* * TCP State Values diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6d5b58a1c743..c1f16fc49ade 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -196,8 +196,8 @@ struct ip6_flowlabel struct net *fl_net; }; -#define IPV6_FLOWINFO_MASK __constant_htonl(0x0FFFFFFF) -#define IPV6_FLOWLABEL_MASK __constant_htonl(0x000FFFFF) +#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) +#define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF) struct ipv6_fl_socklist { diff --git a/include/net/ipx.h b/include/net/ipx.h index 4cc0b4eca948..a14121dd1932 100644 --- a/include/net/ipx.h +++ b/include/net/ipx.h @@ -27,7 +27,7 @@ struct ipx_address { struct ipxhdr { __be16 ipx_checksum __attribute__ ((packed)); -#define IPX_NO_CHECKSUM __constant_htons(0xFFFF) +#define IPX_NO_CHECKSUM cpu_to_be16(0xFFFF) __be16 ipx_pktsize __attribute__ ((packed)); __u8 ipx_tctrl; __u8 ipx_type; diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index 876b6f2bb4fd..bfb240c6cf79 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -46,7 +46,7 @@ extern int datagram_send_ctl(struct net *net, struct ipv6_txoptions *opt, int *hlimit, int *tclass); -#define LOOPBACK4_IPV6 __constant_htonl(0x7f000006) +#define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006) /* * address family specific functions diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 936e333e7ce5..c179318edd92 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -388,7 +388,7 @@ enum { IB_MULTICAST_QPN = 0xffffff }; -#define IB_LID_PERMISSIVE __constant_htons(0xFFFF) +#define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) enum ib_ah_flags { IB_AH_GRH = 1 -- cgit v1.2.3-71-gd317 From a038a353c3de4040d8445ec568acebdac144436f Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:34 +0000 Subject: clocksource: allow usage independent of timekeeping.c So far struct clocksource acted as the interface between time/timekeeping.c and hardware. This patch generalizes the concept so that a similar interface can also be used in other contexts. For that it introduces new structures and related functions *without* touching the existing struct clocksource. The reasons for adding these new structures to clocksource.[ch] are * the APIs are clearly related * struct clocksource could be cleaned up to use the new structs * avoids proliferation of files with similar names (timesource.h? timecounter.h?) As outlined in the discussion with John Stultz, this patch adds * struct cyclecounter: stateless API to hardware which counts clock cycles * struct timecounter: stateful utility code built on a cyclecounter which provides a nanosecond counter * only the function to read the nanosecond counter; deltas are used internally and not exposed to users of timecounter The code does no locking of the shared state. It must be called at least as often as the cycle counter wraps around to detect these wrap arounds. Both is the responsibility of the timecounter user. Acked-by: John Stultz Signed-off-by: Patrick Ohly Signed-off-by: David S. 
Miller --- include/linux/clocksource.h | 101 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/clocksource.c | 76 +++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f88d32f8ff7c..573819ef4cc0 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -21,9 +21,110 @@ typedef u64 cycle_t; struct clocksource; +/** + * struct cyclecounter - hardware abstraction for a free running counter + * Provides completely state-free accessors to the underlying hardware. + * Depending on which hardware it reads, the cycle counter may wrap + * around quickly. Locking rules (if necessary) have to be defined + * by the implementor and user of specific instances of this API. + * + * @read: returns the current cycle value + * @mask: bitmask for two's complement + * subtraction of non 64 bit counters, + * see CLOCKSOURCE_MASK() helper macro + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + */ +struct cyclecounter { + cycle_t (*read)(const struct cyclecounter *cc); + cycle_t mask; + u32 mult; + u32 shift; +}; + +/** + * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds + * Contains the state needed by timecounter_read() to detect + * cycle counter wrap around. Initialize with + * timecounter_init(). Also used to convert cycle counts into the + * corresponding nanosecond counts with timecounter_cyc2time(). Users + * of this code are responsible for initializing the underlying + * cycle counter hardware, locking issues and reading the time + * more often than the cycle counter wraps around. The nanosecond + * counter will only wrap around after ~585 years. + * + * @cc: the cycle counter used by this instance + * @cycle_last: most recent cycle counter value seen by + * timecounter_read() + * @nsec: continuously increasing count + */ +struct timecounter { + const struct cyclecounter *cc; + cycle_t cycle_last; + u64 nsec; +}; + +/** + * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds + * @tc: Pointer to cycle counter. + * @cycles: Cycles + * + * XXX - This could use some mult_lxl_ll() asm optimization. Same code + * as in cyc2ns, but with unsigned result. + */ +static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, + cycle_t cycles) +{ + u64 ret = (u64)cycles; + ret = (ret * cc->mult) >> cc->shift; + return ret; +} + +/** + * timecounter_init - initialize a time counter + * @tc: Pointer to time counter which is to be initialized/reset + * @cc: A cycle counter, ready to be used. + * @start_tstamp: Arbitrary initial time stamp. + * + * After this call the current cycle register (roughly) corresponds to + * the initial time stamp. Every call to timecounter_read() increments + * the time stamp counter by the number of elapsed nanoseconds. + */ +extern void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp); + +/** + * timecounter_read - return nanoseconds elapsed since timecounter_init() + * plus the initial time stamp + * @tc: Pointer to time counter. + * + * In other words, keeps track of time since the same epoch as + * the function which generated the initial time stamp. + */ +extern u64 timecounter_read(struct timecounter *tc); + +/** + * timecounter_cyc2time - convert a cycle counter to same + * time base as values returned by + * timecounter_read() + * @tc: Pointer to time counter. 
+ * @cycle: a value returned by tc->cc->read() + * + * Cycle counts that are converted correctly as long as they + * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], + * with "max cycle count" == cs->mask+1. + * + * This allows conversion of cycle counter values which were generated + * in the past. + */ +extern u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp); + /** * struct clocksource - hardware abstraction for a free running counter * Provides mostly state-free accessors to the underlying hardware. + * This is the structure used for system time. * * @name: ptr to clocksource name * @list: list head for registration diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ca89e1593f08..c46c931a7fe7 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,82 @@ #include /* for spin_unlock_irq() using preempt_count() m68k */ #include +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; +} +EXPORT_SYMBOL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. + */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + cycle_t cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL(timecounter_read); + +u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp) +{ + u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. + */ + if (cycle_delta > tc->cc->mask / 2) { + cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); + } else { + nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; + } + + return nsec; +} +EXPORT_SYMBOL(timecounter_cyc2time); + /* XXX - Would like a better way for initializing curr_clocksource */ extern struct clocksource clocksource_jiffies; -- cgit v1.2.3-71-gd317 From a75244c3d519fcb490ca2bf3f123c98017f1e8d0 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:35 +0000 Subject: timecompare: generic infrastructure to map between two time bases Mapping from a struct timecounter to a time returned by functions like ktime_get_real() is implemented. 
This is sufficient to use this code in a network device driver which wants to support hardware time stamping and transformation of hardware time stamps to system time. The interface could have been made more versatile by not depending on a time counter, but this wasn't done to avoid writing glue code elsewhere. The method implemented here is the one used and analyzed under the name "assisted PTP" in the LCI PTP paper: http://www.linuxclustersinstitute.org/conferences/archive/2008/PDF/Ohly_92221.pdf Acked-by: John Stultz Signed-off-by: Patrick Ohly Signed-off-by: David S. Miller --- include/linux/timecompare.h | 125 +++++++++++++++++++++++++++++ kernel/time/Makefile | 2 +- kernel/time/timecompare.c | 191 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 include/linux/timecompare.h create mode 100644 kernel/time/timecompare.c (limited to 'include/linux') diff --git a/include/linux/timecompare.h b/include/linux/timecompare.h new file mode 100644 index 000000000000..546e2234e4b3 --- /dev/null +++ b/include/linux/timecompare.h @@ -0,0 +1,125 @@ +/* + * Utility code which helps transforming between two different time + * bases, called "source" and "target" time in this code. + * + * Source time has to be provided via the timecounter API while target + * time is accessed via a function callback whose prototype + * intentionally matches ktime_get() and ktime_get_real(). These + * interfaces where chosen like this so that the code serves its + * initial purpose without additional glue code. + * + * This purpose is synchronizing a hardware clock in a NIC with system + * time, in order to implement the Precision Time Protocol (PTP, + * IEEE1588) with more accurate hardware assisted time stamping. In + * that context only synchronization against system time (= + * ktime_get_real()) is currently needed. But this utility code might + * become useful in other situations, which is why it was written as + * general purpose utility code. + * + * The source timecounter is assumed to return monotonically + * increasing time (but this code does its best to compensate if that + * is not the case) whereas target time may jump. + * + * The target time corresponding to a source time is determined by + * reading target time, reading source time, reading target time + * again, then assuming that average target time corresponds to source + * time. In other words, the assumption is that reading the source + * time is slow and involves equal time for sending the request and + * receiving the reply, whereas reading target time is assumed to be + * fast. + * + * Copyright (C) 2009 Intel Corporation. + * Author: Patrick Ohly + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ +#ifndef _LINUX_TIMECOMPARE_H +#define _LINUX_TIMECOMPARE_H + +#include +#include + +/** + * struct timecompare - stores state and configuration for the two clocks + * + * Initialize to zero, then set source/target/num_samples. + * + * Transformation between source time and target time is done with: + * target_time = source_time + offset + + * (source_time - last_update) * skew / + * TIMECOMPARE_SKEW_RESOLUTION + * + * @source: used to get source time stamps via timecounter_read() + * @target: function returning target time (for example, ktime_get + * for monotonic time, or ktime_get_real for wall clock) + * @num_samples: number of times that source time and target time are to + * be compared when determining their offset + * @offset: (target time - source time) at the time of the last update + * @skew: average (target time - source time) / delta source time * + * TIMECOMPARE_SKEW_RESOLUTION + * @last_update: last source time stamp when time offset was measured + */ +struct timecompare { + struct timecounter *source; + ktime_t (*target)(void); + int num_samples; + + s64 offset; + s64 skew; + u64 last_update; +}; + +/** + * timecompare_transform - transform source time stamp into target time base + * @sync: context for time sync + * @source_tstamp: the result of timecounter_read() or + * timecounter_cyc2time() + */ +extern ktime_t timecompare_transform(struct timecompare *sync, + u64 source_tstamp); + +/** + * timecompare_offset - measure current (target time - source time) offset + * @sync: context for time sync + * @offset: average offset during sample period returned here + * @source_tstamp: average source time during sample period returned here + * + * Returns number of samples used. Might be zero (= no result) in the + * unlikely case that target time was monotonically decreasing for all + * samples (= broken). + */ +extern int timecompare_offset(struct timecompare *sync, + s64 *offset, + u64 *source_tstamp); + +extern void __timecompare_update(struct timecompare *sync, + u64 source_tstamp); + +/** + * timecompare_update - update offset and skew by measuring current offset + * @sync: context for time sync + * @source_tstamp: the result of timecounter_read() or + * timecounter_cyc2time(), pass zero to force update + * + * Updates are only done at most once per second. + */ +static inline void timecompare_update(struct timecompare *sync, + u64 source_tstamp) +{ + if (!source_tstamp || + (s64)(source_tstamp - sync->last_update) >= NSEC_PER_SEC) + __timecompare_update(sync, source_tstamp); +} + +#endif /* _LINUX_TIMECOMPARE_H */ diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 905b0b50792d..0b0a6366c9d4 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c new file mode 100644 index 000000000000..71e7f1a19156 --- /dev/null +++ b/kernel/time/timecompare.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2009 Intel Corporation. + * Author: Patrick Ohly + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + +/* + * fixed point arithmetic scale factor for skew + * + * Usually one would measure skew in ppb (parts per billion, 1e9), but + * using a factor of 2 simplifies the math. + */ +#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) + +ktime_t timecompare_transform(struct timecompare *sync, + u64 source_tstamp) +{ + u64 nsec; + + nsec = source_tstamp + sync->offset; + nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / + TIMECOMPARE_SKEW_RESOLUTION; + + return ns_to_ktime(nsec); +} +EXPORT_SYMBOL(timecompare_transform); + +int timecompare_offset(struct timecompare *sync, + s64 *offset, + u64 *source_tstamp) +{ + u64 start_source = 0, end_source = 0; + struct { + s64 offset; + s64 duration_target; + } buffer[10], sample, *samples; + int counter = 0, i; + int used; + int index; + int num_samples = sync->num_samples; + + if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { + samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); + if (!samples) { + samples = buffer; + num_samples = sizeof(buffer)/sizeof(buffer[0]); + } + } else { + samples = buffer; + } + + /* run until we have enough valid samples, but do not try forever */ + i = 0; + counter = 0; + while (1) { + u64 ts; + ktime_t start, end; + + start = sync->target(); + ts = timecounter_read(sync->source); + end = sync->target(); + + if (!i) + start_source = ts; + + /* ignore negative durations */ + sample.duration_target = ktime_to_ns(ktime_sub(end, start)); + if (sample.duration_target >= 0) { + /* + * assume symetric delay to and from source: + * average target time corresponds to measured + * source time + */ + sample.offset = + ktime_to_ns(ktime_add(end, start)) / 2 - + ts; + + /* simple insertion sort based on duration */ + index = counter - 1; + while (index >= 0) { + if (samples[index].duration_target < + sample.duration_target) + break; + samples[index + 1] = samples[index]; + index--; + } + samples[index + 1] = sample; + counter++; + } + + i++; + if (counter >= num_samples || i >= 100000) { + end_source = ts; + break; + } + } + + *source_tstamp = (end_source + start_source) / 2; + + /* remove outliers by only using 75% of the samples */ + used = counter * 3 / 4; + if (!used) + used = counter; + if (used) { + /* calculate average */ + s64 off = 0; + for (index = 0; index < used; index++) + off += samples[index].offset; + *offset = div_s64(off, used); + } + + if (samples && samples != buffer) + kfree(samples); + + return used; +} +EXPORT_SYMBOL(timecompare_offset); + +void __timecompare_update(struct timecompare *sync, + u64 source_tstamp) +{ + s64 offset; + u64 average_time; + + if (!timecompare_offset(sync, &offset, &average_time)) + return; + + if (!sync->last_update) { + sync->last_update = average_time; + sync->offset = offset; + sync->skew = 0; + } else { + s64 delta_nsec = average_time - sync->last_update; + + /* avoid division by negative or small deltas */ + if (delta_nsec >= 10000) { + s64 delta_offset_nsec = offset - sync->offset; + s64 skew; /* delta_offset_nsec * + TIMECOMPARE_SKEW_RESOLUTION / + delta_nsec */ + u64 
divisor; + + /* div_s64() is limited to 32 bit divisor */ + skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; + divisor = delta_nsec; + while (unlikely(divisor >= ((s64)1) << 32)) { + /* divide both by 2; beware, right shift + of negative value has undefined + behavior and can only be used for + the positive divisor */ + skew = div_s64(skew, 2); + divisor >>= 1; + } + skew = div_s64(skew, divisor); + + /* + * Calculate new overall skew as 4/16 the + * old value and 12/16 the new one. This is + * a rather arbitrary tradeoff between + * only using the latest measurement (0/16 and + * 16/16) and even more weight on past measurements. + */ +#define TIMECOMPARE_NEW_SKEW_PER_16 12 + sync->skew = + div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * + sync->skew + + TIMECOMPARE_NEW_SKEW_PER_16 * skew, + 16); + sync->last_update = average_time; + sync->offset = offset; + } + } +} +EXPORT_SYMBOL(__timecompare_update); -- cgit v1.2.3-71-gd317 From cb9eff097831007afb30d64373f29d99825d0068 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:36 +0000 Subject: net: new user space API for time stamping of incoming and outgoing packets User space can request hardware and/or software time stamping. Reporting of the result(s) via a new control message is enabled separately for each field in the message because some of the fields may require additional computation and thus cause overhead. User space can tell the different kinds of time stamps apart and choose what suits its needs. When a TX timestamp operation is requested, the TX skb will be cloned and the clone will be time stamped (in hardware or software) and added to the socket error queue of the skb, if the skb has a socket associated with it. The actual TX timestamp will reach userspace as a RX timestamp on the cloned packet. If timestamping is requested and no timestamping is done in the device driver (potentially this may use hardware timestamping), it will be done in software after the device's start_hard_xmit routine. Signed-off-by: Patrick Ohly Signed-off-by: David S. 
Miller --- Documentation/networking/timestamping.txt | 178 +++++++ Documentation/networking/timestamping/.gitignore | 1 + Documentation/networking/timestamping/Makefile | 6 + .../networking/timestamping/timestamping.c | 533 +++++++++++++++++++++ arch/alpha/include/asm/socket.h | 3 + arch/arm/include/asm/socket.h | 3 + arch/avr32/include/asm/socket.h | 3 + arch/blackfin/include/asm/socket.h | 3 + arch/cris/include/asm/socket.h | 3 + arch/h8300/include/asm/socket.h | 3 + arch/ia64/include/asm/socket.h | 3 + arch/m68k/include/asm/socket.h | 3 + arch/mips/include/asm/socket.h | 3 + arch/parisc/include/asm/socket.h | 3 + arch/powerpc/include/asm/socket.h | 3 + arch/s390/include/asm/socket.h | 3 + arch/sh/include/asm/socket.h | 3 + arch/sparc/include/asm/socket.h | 3 + arch/x86/include/asm/socket.h | 3 + arch/xtensa/include/asm/socket.h | 3 + include/asm-frv/socket.h | 3 + include/asm-m32r/socket.h | 3 + include/asm-mn10300/socket.h | 3 + include/linux/errqueue.h | 1 + include/linux/net_tstamp.h | 104 ++++ include/linux/sockios.h | 3 + 26 files changed, 883 insertions(+) create mode 100644 Documentation/networking/timestamping.txt create mode 100644 Documentation/networking/timestamping/.gitignore create mode 100644 Documentation/networking/timestamping/Makefile create mode 100644 Documentation/networking/timestamping/timestamping.c create mode 100644 include/linux/net_tstamp.h (limited to 'include/linux') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt new file mode 100644 index 000000000000..a681a65b5bc7 --- /dev/null +++ b/Documentation/networking/timestamping.txt @@ -0,0 +1,178 @@ +The existing interfaces for getting network packages time stamped are: + +* SO_TIMESTAMP + Generate time stamp for each incoming packet using the (not necessarily + monotonous!) system time. Result is returned via recv_msg() in a + control message as timeval (usec resolution). + +* SO_TIMESTAMPNS + Same time stamping mechanism as SO_TIMESTAMP, but returns result as + timespec (nsec resolution). + +* IP_MULTICAST_LOOP + SO_TIMESTAMP[NS] + Only for multicasts: approximate send time stamp by receiving the looped + packet and using its receive time stamp. + +The following interface complements the existing ones: receive time +stamps can be generated and returned for arbitrary packets and much +closer to the point where the packet is really sent. Time stamps can +be generated in software (as before) or in hardware (if the hardware +has such a feature). + +SO_TIMESTAMPING: + +Instructs the socket layer which kind of information is wanted. The +parameter is an integer with some of the following bits set. Setting +other bits is an error and doesn't change the current state. + +SOF_TIMESTAMPING_TX_HARDWARE: try to obtain send time stamp in hardware +SOF_TIMESTAMPING_TX_SOFTWARE: if SOF_TIMESTAMPING_TX_HARDWARE is off or + fails, then do it in software +SOF_TIMESTAMPING_RX_HARDWARE: return the original, unmodified time stamp + as generated by the hardware +SOF_TIMESTAMPING_RX_SOFTWARE: if SOF_TIMESTAMPING_RX_HARDWARE is off or + fails, then do it in software +SOF_TIMESTAMPING_RAW_HARDWARE: return original raw hardware time stamp +SOF_TIMESTAMPING_SYS_HARDWARE: return hardware time stamp transformed to + the system time base +SOF_TIMESTAMPING_SOFTWARE: return system time stamp generated in + software + +SOF_TIMESTAMPING_TX/RX determine how time stamps are generated. 
+SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the +following control message: + struct scm_timestamping { + struct timespec systime; + struct timespec hwtimetrans; + struct timespec hwtimeraw; + }; + +recvmsg() can be used to get this control message for regular incoming +packets. For send time stamps the outgoing packet is looped back to +the socket's error queue with the send time stamp(s) attached. It can +be received with recvmsg(flags=MSG_ERRQUEUE). The call returns the +original outgoing packet data including all headers preprended down to +and including the link layer, the scm_timestamping control message and +a sock_extended_err control message with ee_errno==ENOMSG and +ee_origin==SO_EE_ORIGIN_TIMESTAMPING. A socket with such a pending +bounced packet is ready for reading as far as select() is concerned. + +All three values correspond to the same event in time, but were +generated in different ways. Each of these values may be empty (= all +zero), in which case no such value was available. If the application +is not interested in some of these values, they can be left blank to +avoid the potential overhead of calculating them. + +systime is the value of the system time at that moment. This +corresponds to the value also returned via SO_TIMESTAMP[NS]. If the +time stamp was generated by hardware, then this field is +empty. Otherwise it is filled in if SOF_TIMESTAMPING_SOFTWARE is +set. + +hwtimeraw is the original hardware time stamp. Filled in if +SOF_TIMESTAMPING_RAW_HARDWARE is set. No assumptions about its +relation to system time should be made. + +hwtimetrans is the hardware time stamp transformed so that it +corresponds as good as possible to system time. This correlation is +not perfect; as a consequence, sorting packets received via different +NICs by their hwtimetrans may differ from the order in which they were +received. hwtimetrans may be non-monotonic even for the same NIC. +Filled in if SOF_TIMESTAMPING_SYS_HARDWARE is set. Requires support +by the network device and will be empty without that support. + + +SIOCSHWTSTAMP: + +Hardware time stamping must also be initialized for each device driver +that is expected to do hardware time stamping. The parameter is: + +struct hwtstamp_config { + int flags; /* no flags defined right now, must be zero */ + int tx_type; /* HWTSTAMP_TX_* */ + int rx_filter; /* HWTSTAMP_FILTER_* */ +}; + +Desired behavior is passed into the kernel and to a specific device by +calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose +ifr_data points to a struct hwtstamp_config. The tx_type and +rx_filter are hints to the driver what it is expected to do. If +the requested fine-grained filtering for incoming packets is not +supported, the driver may time stamp more than just the requested types +of packets. + +A driver which supports hardware time stamping shall update the struct +with the actual, possibly more permissive configuration. If the +requested packets cannot be time stamped, then nothing should be +changed and ERANGE shall be returned (in contrast to EINVAL, which +indicates that SIOCSHWTSTAMP is not supported at all). + +Only a processes with admin rights may change the configuration. User +space is responsible to ensure that multiple processes don't interfere +with each other and that the settings are reset. 
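To make the flow above concrete, a minimal user-space sketch (the interface name "eth0" and the exact flag combination are assumptions for illustration, not requirements) might first configure the driver through SIOCSHWTSTAMP and then enable reporting with SO_TIMESTAMPING, using the HWTSTAMP_* values listed below:

    #include <string.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <sys/ioctl.h>
    #include <net/if.h>
    #include <linux/sockios.h>
    #include <linux/net_tstamp.h>

    #ifndef SO_TIMESTAMPING
    # define SO_TIMESTAMPING 37
    #endif

    static int enable_timestamping(int sock)
    {
            struct ifreq ifr;
            struct hwtstamp_config cfg;
            int flags = SOF_TIMESTAMPING_TX_HARDWARE |
                        SOF_TIMESTAMPING_RX_HARDWARE |
                        SOF_TIMESTAMPING_RAW_HARDWARE |
                        SOF_TIMESTAMPING_SOFTWARE;

            /* ask the driver to time stamp all packets in hardware */
            memset(&cfg, 0, sizeof(cfg));
            cfg.tx_type = HWTSTAMP_TX_ON;
            cfg.rx_filter = HWTSTAMP_FILTER_ALL;

            /* pass the configuration to the device via struct ifreq */
            memset(&ifr, 0, sizeof(ifr));
            strncpy(ifr.ifr_name, "eth0", sizeof(ifr.ifr_name));
            ifr.ifr_data = (void *)&cfg;
            if (ioctl(sock, SIOCSHWTSTAMP, &ifr) < 0)
                    perror("SIOCSHWTSTAMP"); /* driver may adjust cfg */

            /* ask the socket layer to report the stamps via cmsg */
            if (setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING,
                           &flags, sizeof(flags)) < 0) {
                    perror("setsockopt SO_TIMESTAMPING");
                    return -1;
            }
            return 0;
    }

After this, receive time stamps arrive as the scm_timestamping control message described above, and send time stamps are read back from the socket's error queue with recvmsg(flags=MSG_ERRQUEUE).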
+ +/* possible values for hwtstamp_config->tx_type */ +enum { + /* + * no outgoing packet will need hardware time stamping; + * should a packet arrive which asks for it, no hardware + * time stamping will be done + */ + HWTSTAMP_TX_OFF, + + /* + * enables hardware time stamping for outgoing packets; + * the sender of the packet decides which are to be + * time stamped by setting SOF_TIMESTAMPING_TX_SOFTWARE + * before sending the packet + */ + HWTSTAMP_TX_ON, +}; + +/* possible values for hwtstamp_config->rx_filter */ +enum { + /* time stamp no incoming packet at all */ + HWTSTAMP_FILTER_NONE, + + /* time stamp any incoming packet */ + HWTSTAMP_FILTER_ALL, + + /* return value: time stamp all packets requested plus some others */ + HWTSTAMP_FILTER_SOME, + + /* PTP v1, UDP, any kind of event packet */ + HWTSTAMP_FILTER_PTP_V1_L4_EVENT, + + ... +}; + + +DEVICE IMPLEMENTATION + +A driver which supports hardware time stamping must support the +SIOCSHWTSTAMP ioctl. Time stamps for received packets must be stored +in the skb with skb_hwtstamp_set(). + +Time stamps for outgoing packets are to be generated as follows: +- In hard_start_xmit(), check if skb_hwtstamp_check_tx_hardware() + returns non-zero. If yes, then the driver is expected + to do hardware time stamping. +- If this is possible for the skb and requested, then declare + that the driver is doing the time stamping by calling + skb_hwtstamp_tx_in_progress(). A driver not supporting + hardware time stamping doesn't do that. A driver must never + touch sk_buff::tstamp! It is used to store how time stamping + for an outgoing packets is to be done. +- As soon as the driver has sent the packet and/or obtained a + hardware time stamp for it, it passes the time stamp back by + calling skb_hwtstamp_tx() with the original skb, the raw + hardware time stamp and a handle to the device (necessary + to convert the hardware time stamp to system time). If obtaining + the hardware time stamp somehow fails, then the driver should + not fall back to software time stamping. The rationale is that + this would occur at a later time in the processing pipeline + than other software time stamping and therefore could lead + to unexpected deltas between time stamps. +- If the driver did not call skb_hwtstamp_tx_in_progress(), then + dev_hard_start_xmit() checks whether software time stamping + is wanted as fallback and potentially generates the time stamp. diff --git a/Documentation/networking/timestamping/.gitignore b/Documentation/networking/timestamping/.gitignore new file mode 100644 index 000000000000..71e81eb2e22f --- /dev/null +++ b/Documentation/networking/timestamping/.gitignore @@ -0,0 +1 @@ +timestamping diff --git a/Documentation/networking/timestamping/Makefile b/Documentation/networking/timestamping/Makefile new file mode 100644 index 000000000000..2a1489fdc036 --- /dev/null +++ b/Documentation/networking/timestamping/Makefile @@ -0,0 +1,6 @@ +CPPFLAGS = -I../../../include + +timestamping: timestamping.c + +clean: + rm -f timestamping diff --git a/Documentation/networking/timestamping/timestamping.c b/Documentation/networking/timestamping/timestamping.c new file mode 100644 index 000000000000..43d143104210 --- /dev/null +++ b/Documentation/networking/timestamping/timestamping.c @@ -0,0 +1,533 @@ +/* + * This program demonstrates how the various time stamping features in + * the Linux kernel work. It emulates the behavior of a PTP + * implementation in stand-alone master mode by sending PTPv1 Sync + * multicasts once every second. 
It looks for similar packets, but + * beyond that doesn't actually implement PTP. + * + * Outgoing packets are time stamped with SO_TIMESTAMPING with or + * without hardware support. + * + * Incoming packets are time stamped with SO_TIMESTAMPING with or + * without hardware support, SIOCGSTAMP[NS] (per-socket time stamp) and + * SO_TIMESTAMP[NS]. + * + * Copyright (C) 2009 Intel Corporation. + * Author: Patrick Ohly + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "asm/types.h" +#include "linux/net_tstamp.h" +#include "linux/errqueue.h" + +#ifndef SO_TIMESTAMPING +# define SO_TIMESTAMPING 37 +# define SCM_TIMESTAMPING SO_TIMESTAMPING +#endif + +#ifndef SO_TIMESTAMPNS +# define SO_TIMESTAMPNS 35 +#endif + +#ifndef SIOCGSTAMPNS +# define SIOCGSTAMPNS 0x8907 +#endif + +#ifndef SIOCSHWTSTAMP +# define SIOCSHWTSTAMP 0x89b0 +#endif + +static void usage(const char *error) +{ + if (error) + printf("invalid option: %s\n", error); + printf("timestamping interface option*\n\n" + "Options:\n" + " IP_MULTICAST_LOOP - looping outgoing multicasts\n" + " SO_TIMESTAMP - normal software time stamping, ms resolution\n" + " SO_TIMESTAMPNS - more accurate software time stamping\n" + " SOF_TIMESTAMPING_TX_HARDWARE - hardware time stamping of outgoing packets\n" + " SOF_TIMESTAMPING_TX_SOFTWARE - software fallback for outgoing packets\n" + " SOF_TIMESTAMPING_RX_HARDWARE - hardware time stamping of incoming packets\n" + " SOF_TIMESTAMPING_RX_SOFTWARE - software fallback for incoming packets\n" + " SOF_TIMESTAMPING_SOFTWARE - request reporting of software time stamps\n" + " SOF_TIMESTAMPING_SYS_HARDWARE - request reporting of transformed HW time stamps\n" + " SOF_TIMESTAMPING_RAW_HARDWARE - request reporting of raw HW time stamps\n" + " SIOCGSTAMP - check last socket time stamp\n" + " SIOCGSTAMPNS - more accurate socket time stamp\n"); + exit(1); +} + +static void bail(const char *error) +{ + printf("%s: %s\n", error, strerror(errno)); + exit(1); +} + +static const unsigned char sync[] = { + 0x00, 0x01, 0x00, 0x01, + 0x5f, 0x44, 0x46, 0x4c, + 0x54, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, + + /* fake uuid */ + 0x00, 0x01, + 0x02, 0x03, 0x04, 0x05, + + 0x00, 0x01, 0x00, 0x37, + 0x00, 0x00, 0x00, 0x08, + 0x00, 0x00, 0x00, 0x00, + 0x49, 0x05, 0xcd, 0x01, + 0x29, 0xb1, 0x8d, 0xb0, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, + + /* fake uuid */ + 0x00, 0x01, + 0x02, 0x03, 0x04, 0x05, + + 0x00, 0x00, 0x00, 0x37, + 0x00, 0x00, 0x00, 0x04, + 0x44, 0x46, 0x4c, 0x54, + 0x00, 0x00, 0xf0, 0x60, + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0xf0, 0x60, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x04, + 0x44, 0x46, 0x4c, 0x54, + 0x00, 0x01, + + /* fake uuid */ + 0x00, 0x01, + 0x02, 0x03, 0x04, 0x05, + + 0x00, 0x00, 0x00, 0x00, + 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 +}; + +static void sendpacket(int sock, struct sockaddr *addr, socklen_t addr_len) +{ + struct timeval now; + int res; + + res = sendto(sock, sync, sizeof(sync), 0, + addr, addr_len); + gettimeofday(&now, 0); + if (res < 0) + printf("%s: %s\n", "send", strerror(errno)); + else + printf("%ld.%06ld: sent %d bytes\n", + (long)now.tv_sec, (long)now.tv_usec, + res); +} + +static void printpacket(struct msghdr *msg, int res, + char *data, + int sock, int recvmsg_flags, + int siocgstamp, int siocgstampns) +{ + struct sockaddr_in *from_addr = (struct sockaddr_in *)msg->msg_name; + struct cmsghdr *cmsg; + struct timeval tv; + struct timespec ts; + struct timeval now; + + gettimeofday(&now, 0); + + printf("%ld.%06ld: received %s data, %d bytes from %s, %d bytes control messages\n", + (long)now.tv_sec, (long)now.tv_usec, + (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular", + res, + inet_ntoa(from_addr->sin_addr), + msg->msg_controllen); + for (cmsg = CMSG_FIRSTHDR(msg); + cmsg; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + printf(" cmsg len %d: ", cmsg->cmsg_len); + switch (cmsg->cmsg_level) { + case SOL_SOCKET: + printf("SOL_SOCKET "); + switch (cmsg->cmsg_type) { + case SO_TIMESTAMP: { + struct timeval *stamp = + (struct timeval *)CMSG_DATA(cmsg); + printf("SO_TIMESTAMP %ld.%06ld", + (long)stamp->tv_sec, + (long)stamp->tv_usec); + break; + } + case SO_TIMESTAMPNS: { + struct timespec *stamp = + (struct timespec *)CMSG_DATA(cmsg); + printf("SO_TIMESTAMPNS %ld.%09ld", + (long)stamp->tv_sec, + (long)stamp->tv_nsec); + break; + } + case SO_TIMESTAMPING: { + struct timespec *stamp = + (struct timespec *)CMSG_DATA(cmsg); + printf("SO_TIMESTAMPING "); + printf("SW %ld.%09ld ", + (long)stamp->tv_sec, + (long)stamp->tv_nsec); + stamp++; + printf("HW transformed %ld.%09ld ", + (long)stamp->tv_sec, + (long)stamp->tv_nsec); + stamp++; + printf("HW raw %ld.%09ld", + (long)stamp->tv_sec, + (long)stamp->tv_nsec); + break; + } + default: + printf("type %d", cmsg->cmsg_type); + break; + } + break; + case IPPROTO_IP: + printf("IPPROTO_IP "); + switch (cmsg->cmsg_type) { + case IP_RECVERR: { + struct sock_extended_err *err = + (struct sock_extended_err *)CMSG_DATA(cmsg); + printf("IP_RECVERR ee_errno '%s' ee_origin %d => %s", + strerror(err->ee_errno), + err->ee_origin, +#ifdef SO_EE_ORIGIN_TIMESTAMPING + err->ee_origin == SO_EE_ORIGIN_TIMESTAMPING ? 
+ "bounced packet" : "unexpected origin" +#else + "probably SO_EE_ORIGIN_TIMESTAMPING" +#endif + ); + if (res < sizeof(sync)) + printf(" => truncated data?!"); + else if (!memcmp(sync, data + res - sizeof(sync), + sizeof(sync))) + printf(" => GOT OUR DATA BACK (HURRAY!)"); + break; + } + case IP_PKTINFO: { + struct in_pktinfo *pktinfo = + (struct in_pktinfo *)CMSG_DATA(cmsg); + printf("IP_PKTINFO interface index %u", + pktinfo->ipi_ifindex); + break; + } + default: + printf("type %d", cmsg->cmsg_type); + break; + } + break; + default: + printf("level %d type %d", + cmsg->cmsg_level, + cmsg->cmsg_type); + break; + } + printf("\n"); + } + + if (siocgstamp) { + if (ioctl(sock, SIOCGSTAMP, &tv)) + printf(" %s: %s\n", "SIOCGSTAMP", strerror(errno)); + else + printf("SIOCGSTAMP %ld.%06ld\n", + (long)tv.tv_sec, + (long)tv.tv_usec); + } + if (siocgstampns) { + if (ioctl(sock, SIOCGSTAMPNS, &ts)) + printf(" %s: %s\n", "SIOCGSTAMPNS", strerror(errno)); + else + printf("SIOCGSTAMPNS %ld.%09ld\n", + (long)ts.tv_sec, + (long)ts.tv_nsec); + } +} + +static void recvpacket(int sock, int recvmsg_flags, + int siocgstamp, int siocgstampns) +{ + char data[256]; + struct msghdr msg; + struct iovec entry; + struct sockaddr_in from_addr; + struct { + struct cmsghdr cm; + char control[512]; + } control; + int res; + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &entry; + msg.msg_iovlen = 1; + entry.iov_base = data; + entry.iov_len = sizeof(data); + msg.msg_name = (caddr_t)&from_addr; + msg.msg_namelen = sizeof(from_addr); + msg.msg_control = &control; + msg.msg_controllen = sizeof(control); + + res = recvmsg(sock, &msg, recvmsg_flags|MSG_DONTWAIT); + if (res < 0) { + printf("%s %s: %s\n", + "recvmsg", + (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular", + strerror(errno)); + } else { + printpacket(&msg, res, data, + sock, recvmsg_flags, + siocgstamp, siocgstampns); + } +} + +int main(int argc, char **argv) +{ + int so_timestamping_flags = 0; + int so_timestamp = 0; + int so_timestampns = 0; + int siocgstamp = 0; + int siocgstampns = 0; + int ip_multicast_loop = 0; + char *interface; + int i; + int enabled = 1; + int sock; + struct ifreq device; + struct ifreq hwtstamp; + struct hwtstamp_config hwconfig, hwconfig_requested; + struct sockaddr_in addr; + struct ip_mreq imr; + struct in_addr iaddr; + int val; + socklen_t len; + struct timeval next; + + if (argc < 2) + usage(0); + interface = argv[1]; + + for (i = 2; i < argc; i++) { + if (!strcasecmp(argv[i], "SO_TIMESTAMP")) + so_timestamp = 1; + else if (!strcasecmp(argv[i], "SO_TIMESTAMPNS")) + so_timestampns = 1; + else if (!strcasecmp(argv[i], "SIOCGSTAMP")) + siocgstamp = 1; + else if (!strcasecmp(argv[i], "SIOCGSTAMPNS")) + siocgstampns = 1; + else if (!strcasecmp(argv[i], "IP_MULTICAST_LOOP")) + ip_multicast_loop = 1; + else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_HARDWARE")) + so_timestamping_flags |= SOF_TIMESTAMPING_TX_HARDWARE; + else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_SOFTWARE")) + so_timestamping_flags |= SOF_TIMESTAMPING_TX_SOFTWARE; + else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_HARDWARE")) + so_timestamping_flags |= SOF_TIMESTAMPING_RX_HARDWARE; + else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_SOFTWARE")) + so_timestamping_flags |= SOF_TIMESTAMPING_RX_SOFTWARE; + else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SOFTWARE")) + so_timestamping_flags |= SOF_TIMESTAMPING_SOFTWARE; + else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SYS_HARDWARE")) + so_timestamping_flags |= SOF_TIMESTAMPING_SYS_HARDWARE; + else if 
(!strcasecmp(argv[i], "SOF_TIMESTAMPING_RAW_HARDWARE")) + so_timestamping_flags |= SOF_TIMESTAMPING_RAW_HARDWARE; + else + usage(argv[i]); + } + + sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (socket < 0) + bail("socket"); + + memset(&device, 0, sizeof(device)); + strncpy(device.ifr_name, interface, sizeof(device.ifr_name)); + if (ioctl(sock, SIOCGIFADDR, &device) < 0) + bail("getting interface IP address"); + + memset(&hwtstamp, 0, sizeof(hwtstamp)); + strncpy(hwtstamp.ifr_name, interface, sizeof(hwtstamp.ifr_name)); + hwtstamp.ifr_data = (void *)&hwconfig; + memset(&hwconfig, 0, sizeof(&hwconfig)); + hwconfig.tx_type = + (so_timestamping_flags & SOF_TIMESTAMPING_TX_HARDWARE) ? + HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF; + hwconfig.rx_filter = + (so_timestamping_flags & SOF_TIMESTAMPING_RX_HARDWARE) ? + HWTSTAMP_FILTER_PTP_V1_L4_SYNC : HWTSTAMP_FILTER_NONE; + hwconfig_requested = hwconfig; + if (ioctl(sock, SIOCSHWTSTAMP, &hwtstamp) < 0) { + if ((errno == EINVAL || errno == ENOTSUP) && + hwconfig_requested.tx_type == HWTSTAMP_TX_OFF && + hwconfig_requested.rx_filter == HWTSTAMP_FILTER_NONE) + printf("SIOCSHWTSTAMP: disabling hardware time stamping not possible\n"); + else + bail("SIOCSHWTSTAMP"); + } + printf("SIOCSHWTSTAMP: tx_type %d requested, got %d; rx_filter %d requested, got %d\n", + hwconfig_requested.tx_type, hwconfig.tx_type, + hwconfig_requested.rx_filter, hwconfig.rx_filter); + + /* bind to PTP port */ + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = htons(319 /* PTP event port */); + if (bind(sock, + (struct sockaddr *)&addr, + sizeof(struct sockaddr_in)) < 0) + bail("bind"); + + /* set multicast group for outgoing packets */ + inet_aton("224.0.1.130", &iaddr); /* alternate PTP domain 1 */ + addr.sin_addr = iaddr; + imr.imr_multiaddr.s_addr = iaddr.s_addr; + imr.imr_interface.s_addr = + ((struct sockaddr_in *)&device.ifr_addr)->sin_addr.s_addr; + if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_IF, + &imr.imr_interface.s_addr, sizeof(struct in_addr)) < 0) + bail("set multicast"); + + /* join multicast group, loop our own packet */ + if (setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP, + &imr, sizeof(struct ip_mreq)) < 0) + bail("join multicast group"); + + if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_LOOP, + &ip_multicast_loop, sizeof(enabled)) < 0) { + bail("loop multicast"); + } + + /* set socket options for time stamping */ + if (so_timestamp && + setsockopt(sock, SOL_SOCKET, SO_TIMESTAMP, + &enabled, sizeof(enabled)) < 0) + bail("setsockopt SO_TIMESTAMP"); + + if (so_timestampns && + setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS, + &enabled, sizeof(enabled)) < 0) + bail("setsockopt SO_TIMESTAMPNS"); + + if (so_timestamping_flags && + setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, + &so_timestamping_flags, + sizeof(so_timestamping_flags)) < 0) + bail("setsockopt SO_TIMESTAMPING"); + + /* request IP_PKTINFO for debugging purposes */ + if (setsockopt(sock, SOL_IP, IP_PKTINFO, + &enabled, sizeof(enabled)) < 0) + printf("%s: %s\n", "setsockopt IP_PKTINFO", strerror(errno)); + + /* verify socket options */ + len = sizeof(val); + if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMP, &val, &len) < 0) + printf("%s: %s\n", "getsockopt SO_TIMESTAMP", strerror(errno)); + else + printf("SO_TIMESTAMP %d\n", val); + + if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS, &val, &len) < 0) + printf("%s: %s\n", "getsockopt SO_TIMESTAMPNS", + strerror(errno)); + else + printf("SO_TIMESTAMPNS %d\n", val); + + if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &val, 
&len) < 0) { + printf("%s: %s\n", "getsockopt SO_TIMESTAMPING", + strerror(errno)); + } else { + printf("SO_TIMESTAMPING %d\n", val); + if (val != so_timestamping_flags) + printf(" not the expected value %d\n", + so_timestamping_flags); + } + + /* send packets forever every five seconds */ + gettimeofday(&next, 0); + next.tv_sec = (next.tv_sec + 1) / 5 * 5; + next.tv_usec = 0; + while (1) { + struct timeval now; + struct timeval delta; + long delta_us; + int res; + fd_set readfs, errorfs; + + gettimeofday(&now, 0); + delta_us = (long)(next.tv_sec - now.tv_sec) * 1000000 + + (long)(next.tv_usec - now.tv_usec); + if (delta_us > 0) { + /* continue waiting for timeout or data */ + delta.tv_sec = delta_us / 1000000; + delta.tv_usec = delta_us % 1000000; + + FD_ZERO(&readfs); + FD_ZERO(&errorfs); + FD_SET(sock, &readfs); + FD_SET(sock, &errorfs); + printf("%ld.%06ld: select %ldus\n", + (long)now.tv_sec, (long)now.tv_usec, + delta_us); + res = select(sock + 1, &readfs, 0, &errorfs, &delta); + gettimeofday(&now, 0); + printf("%ld.%06ld: select returned: %d, %s\n", + (long)now.tv_sec, (long)now.tv_usec, + res, + res < 0 ? strerror(errno) : "success"); + if (res > 0) { + if (FD_ISSET(sock, &readfs)) + printf("ready for reading\n"); + if (FD_ISSET(sock, &errorfs)) + printf("has error\n"); + recvpacket(sock, 0, + siocgstamp, + siocgstampns); + recvpacket(sock, MSG_ERRQUEUE, + siocgstamp, + siocgstampns); + } + } else { + /* write one packet */ + sendpacket(sock, + (struct sockaddr *)&addr, + sizeof(addr)); + next.tv_sec += 5; + continue; + } + } + + return 0; +} diff --git a/arch/alpha/include/asm/socket.h b/arch/alpha/include/asm/socket.h index a1057c2d95e7..3641ec1452f4 100644 --- a/arch/alpha/include/asm/socket.h +++ b/arch/alpha/include/asm/socket.h @@ -62,6 +62,9 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + /* O_NONBLOCK clashes with the bits used for socket types. Therefore we * have to define SOCK_NONBLOCK to a different value here. 
*/ diff --git a/arch/arm/include/asm/socket.h b/arch/arm/include/asm/socket.h index 6817be9573a6..537de4e0ef50 100644 --- a/arch/arm/include/asm/socket.h +++ b/arch/arm/include/asm/socket.h @@ -54,4 +54,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_SOCKET_H */ diff --git a/arch/avr32/include/asm/socket.h b/arch/avr32/include/asm/socket.h index 35863f260929..04c860619700 100644 --- a/arch/avr32/include/asm/socket.h +++ b/arch/avr32/include/asm/socket.h @@ -54,4 +54,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* __ASM_AVR32_SOCKET_H */ diff --git a/arch/blackfin/include/asm/socket.h b/arch/blackfin/include/asm/socket.h index 2ca702e44d47..fac7fe9e1f8a 100644 --- a/arch/blackfin/include/asm/socket.h +++ b/arch/blackfin/include/asm/socket.h @@ -53,4 +53,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_SOCKET_H */ diff --git a/arch/cris/include/asm/socket.h b/arch/cris/include/asm/socket.h index 9df0ca82f5de..d5cf74005408 100644 --- a/arch/cris/include/asm/socket.h +++ b/arch/cris/include/asm/socket.h @@ -56,6 +56,9 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_SOCKET_H */ diff --git a/arch/h8300/include/asm/socket.h b/arch/h8300/include/asm/socket.h index da2520dbf254..602518a70a1a 100644 --- a/arch/h8300/include/asm/socket.h +++ b/arch/h8300/include/asm/socket.h @@ -54,4 +54,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/asm/socket.h b/arch/ia64/include/asm/socket.h index d5ef0aa3e312..745421225ec6 100644 --- a/arch/ia64/include/asm/socket.h +++ b/arch/ia64/include/asm/socket.h @@ -63,4 +63,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m68k/include/asm/socket.h b/arch/m68k/include/asm/socket.h index dbc64e92c41a..ca87f938b03f 100644 --- a/arch/m68k/include/asm/socket.h +++ b/arch/m68k/include/asm/socket.h @@ -54,4 +54,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_SOCKET_H */ diff --git a/arch/mips/include/asm/socket.h b/arch/mips/include/asm/socket.h index facc2d7a87ca..2abca1780169 100644 --- a/arch/mips/include/asm/socket.h +++ b/arch/mips/include/asm/socket.h @@ -75,6 +75,9 @@ To add: #define SO_REUSEPORT 0x0200 /* Allow local address and port reuse. */ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #ifdef __KERNEL__ /** sock_type - Socket types diff --git a/arch/parisc/include/asm/socket.h b/arch/parisc/include/asm/socket.h index fba402c95ac2..885472bf7b78 100644 --- a/arch/parisc/include/asm/socket.h +++ b/arch/parisc/include/asm/socket.h @@ -54,6 +54,9 @@ #define SO_MARK 0x401f +#define SO_TIMESTAMPING 0x4020 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + /* O_NONBLOCK clashes with the bits used for socket types. Therefore we * have to define SOCK_NONBLOCK to a different value here. 
*/ diff --git a/arch/powerpc/include/asm/socket.h b/arch/powerpc/include/asm/socket.h index f5a4e168e498..1e5cfad0e3f7 100644 --- a/arch/powerpc/include/asm/socket.h +++ b/arch/powerpc/include/asm/socket.h @@ -61,4 +61,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/asm/socket.h b/arch/s390/include/asm/socket.h index c786ab623b2d..02330c50241b 100644 --- a/arch/s390/include/asm/socket.h +++ b/arch/s390/include/asm/socket.h @@ -62,4 +62,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sh/include/asm/socket.h b/arch/sh/include/asm/socket.h index 6d4bf6512959..345653b96826 100644 --- a/arch/sh/include/asm/socket.h +++ b/arch/sh/include/asm/socket.h @@ -54,4 +54,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* __ASM_SH_SOCKET_H */ diff --git a/arch/sparc/include/asm/socket.h b/arch/sparc/include/asm/socket.h index bf50d0c2d583..982a12f959f4 100644 --- a/arch/sparc/include/asm/socket.h +++ b/arch/sparc/include/asm/socket.h @@ -50,6 +50,9 @@ #define SO_MARK 0x0022 +#define SO_TIMESTAMPING 0x0023 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h index 8ab9cc8b2ecc..ca8bf2cd0ba9 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/asm/socket.h @@ -54,4 +54,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_X86_SOCKET_H */ diff --git a/arch/xtensa/include/asm/socket.h b/arch/xtensa/include/asm/socket.h index 6100682b1da2..dd1a7a4a1cea 100644 --- a/arch/xtensa/include/asm/socket.h +++ b/arch/xtensa/include/asm/socket.h @@ -65,4 +65,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/asm-frv/socket.h b/include/asm-frv/socket.h index e51ca67b9356..57c3d4054e8b 100644 --- a/include/asm-frv/socket.h +++ b/include/asm-frv/socket.h @@ -54,5 +54,8 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_SOCKET_H */ diff --git a/include/asm-m32r/socket.h b/include/asm-m32r/socket.h index 9a0e20012224..be7ed589af5c 100644 --- a/include/asm-m32r/socket.h +++ b/include/asm-m32r/socket.h @@ -54,4 +54,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/include/asm-mn10300/socket.h b/include/asm-mn10300/socket.h index 80af9c4ccad7..fb5daf438ec9 100644 --- a/include/asm-mn10300/socket.h +++ b/include/asm-mn10300/socket.h @@ -54,4 +54,7 @@ #define SO_MARK 36 +#define SO_TIMESTAMPING 37 +#define SCM_TIMESTAMPING SO_TIMESTAMPING + #endif /* _ASM_SOCKET_H */ diff --git a/include/linux/errqueue.h b/include/linux/errqueue.h index ceb1454b6977..ec12cc74366f 100644 --- a/include/linux/errqueue.h +++ b/include/linux/errqueue.h @@ -18,6 +18,7 @@ struct sock_extended_err #define SO_EE_ORIGIN_LOCAL 1 #define SO_EE_ORIGIN_ICMP 2 #define SO_EE_ORIGIN_ICMP6 3 +#define SO_EE_ORIGIN_TIMESTAMPING 4 #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h new file mode 100644 index 
000000000000..a3b8546354ac --- /dev/null +++ b/include/linux/net_tstamp.h @@ -0,0 +1,104 @@ +/* + * Userspace API for hardware time stamping of network packets + * + * Copyright (C) 2008,2009 Intel Corporation + * Author: Patrick Ohly + * + */ + +#ifndef _NET_TIMESTAMPING_H +#define _NET_TIMESTAMPING_H + +#include /* for SO_TIMESTAMPING */ + +/* SO_TIMESTAMPING gets an integer bit field comprised of these values */ +enum { + SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), + SOF_TIMESTAMPING_TX_SOFTWARE = (1<<1), + SOF_TIMESTAMPING_RX_HARDWARE = (1<<2), + SOF_TIMESTAMPING_RX_SOFTWARE = (1<<3), + SOF_TIMESTAMPING_SOFTWARE = (1<<4), + SOF_TIMESTAMPING_SYS_HARDWARE = (1<<5), + SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6), + SOF_TIMESTAMPING_MASK = + (SOF_TIMESTAMPING_RAW_HARDWARE - 1) | + SOF_TIMESTAMPING_RAW_HARDWARE +}; + +/** + * struct hwtstamp_config - %SIOCSHWTSTAMP parameter + * + * @flags: no flags defined right now, must be zero + * @tx_type: one of HWTSTAMP_TX_* + * @rx_type: one of one of HWTSTAMP_FILTER_* + * + * %SIOCSHWTSTAMP expects a &struct ifreq with a ifr_data pointer to + * this structure. dev_ifsioc() in the kernel takes care of the + * translation between 32 bit userspace and 64 bit kernel. The + * structure is intentionally chosen so that it has the same layout on + * 32 and 64 bit systems, don't break this! + */ +struct hwtstamp_config { + int flags; + int tx_type; + int rx_filter; +}; + +/* possible values for hwtstamp_config->tx_type */ +enum { + /* + * No outgoing packet will need hardware time stamping; + * should a packet arrive which asks for it, no hardware + * time stamping will be done. + */ + HWTSTAMP_TX_OFF, + + /* + * Enables hardware time stamping for outgoing packets; + * the sender of the packet decides which are to be + * time stamped by setting %SOF_TIMESTAMPING_TX_SOFTWARE + * before sending the packet. 
+ */ + HWTSTAMP_TX_ON, +}; + +/* possible values for hwtstamp_config->rx_filter */ +enum { + /* time stamp no incoming packet at all */ + HWTSTAMP_FILTER_NONE, + + /* time stamp any incoming packet */ + HWTSTAMP_FILTER_ALL, + + /* return value: time stamp all packets requested plus some others */ + HWTSTAMP_FILTER_SOME, + + /* PTP v1, UDP, any kind of event packet */ + HWTSTAMP_FILTER_PTP_V1_L4_EVENT, + /* PTP v1, UDP, Sync packet */ + HWTSTAMP_FILTER_PTP_V1_L4_SYNC, + /* PTP v1, UDP, Delay_req packet */ + HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ, + /* PTP v2, UDP, any kind of event packet */ + HWTSTAMP_FILTER_PTP_V2_L4_EVENT, + /* PTP v2, UDP, Sync packet */ + HWTSTAMP_FILTER_PTP_V2_L4_SYNC, + /* PTP v2, UDP, Delay_req packet */ + HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ, + + /* 802.AS1, Ethernet, any kind of event packet */ + HWTSTAMP_FILTER_PTP_V2_L2_EVENT, + /* 802.AS1, Ethernet, Sync packet */ + HWTSTAMP_FILTER_PTP_V2_L2_SYNC, + /* 802.AS1, Ethernet, Delay_req packet */ + HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ, + + /* PTP v2/802.AS1, any layer, any kind of event packet */ + HWTSTAMP_FILTER_PTP_V2_EVENT, + /* PTP v2/802.AS1, any layer, Sync packet */ + HWTSTAMP_FILTER_PTP_V2_SYNC, + /* PTP v2/802.AS1, any layer, Delay_req packet */ + HWTSTAMP_FILTER_PTP_V2_DELAY_REQ, +}; + +#endif /* _NET_TIMESTAMPING_H */ diff --git a/include/linux/sockios.h b/include/linux/sockios.h index abef7596655a..241f179347d9 100644 --- a/include/linux/sockios.h +++ b/include/linux/sockios.h @@ -122,6 +122,9 @@ #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ #define SIOCBRDELIF 0x89a3 /* remove interface from bridge */ +/* hardware time stamping: parameters in linux/net_tstamp.h */ +#define SIOCSHWTSTAMP 0x89b0 + /* Device private ioctl calls */ /* -- cgit v1.2.3-71-gd317 From ac45f602ee3d1b6f326f68bc0c2591ceebf05ba4 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:37 +0000 Subject: net: infrastructure for hardware time stamping The additional per-packet information (16 bytes for time stamps, 1 byte for flags) is stored for all packets in the skb_shared_info struct. This implementation detail is hidden from users of that information via skb_* accessor functions. A separate struct resp. union is used for the additional information so that it can be stored/copied easily outside of skb_shared_info. Compared to previous implementations (reusing the tstamp field depending on the context, optional additional structures) this is the simplest solution. It does not extend sk_buff itself. TX time stamping is implemented in software if the device driver doesn't support hardware time stamping. The new semantic for hardware/software time stamping around ndo_start_xmit() is based on two assumptions about existing network device drivers which don't support hardware time stamping and know nothing about it: - they leave the new skb_shared_tx unmodified - the keep the connection to the originating socket in skb->sk alive, i.e., don't call skb_orphan() Given that skb_shared_tx is new, the first assumption is safe. The second is only true for some drivers. As a result, software TX time stamping currently works with the bnx2 driver, but not with the unmodified igb driver (the two drivers this patch series was tested with). Signed-off-by: Patrick Ohly Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++++- net/core/dev.c | 32 ++++++++++++++++-- net/core/skbuff.c | 41 +++++++++++++++++++++++ 3 files changed, 161 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 924700844580..f96bc91bf0a3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -132,6 +132,57 @@ struct skb_frag_struct { __u32 size; }; +#define HAVE_HW_TIME_STAMP + +/** + * skb_shared_hwtstamps - hardware time stamps + * + * @hwtstamp: hardware time stamp transformed into duration + * since arbitrary point in time + * @syststamp: hwtstamp transformed to system time base + * + * Software time stamps generated by ktime_get_real() are stored in + * skb->tstamp. The relation between the different kinds of time + * stamps is as follows: + * + * syststamp and tstamp can be compared against each other in + * arbitrary combinations. The accuracy of a + * syststamp/tstamp/"syststamp from other device" comparison is + * limited by the accuracy of the transformation into system time + * base. This depends on the device driver and its underlying + * hardware. + * + * hwtstamps can only be compared against other hwtstamps from + * the same device. + * + * This structure is attached to packets as part of the + * &skb_shared_info. Use skb_hwtstamps() to get a pointer. + */ +struct skb_shared_hwtstamps { + ktime_t hwtstamp; + ktime_t syststamp; +}; + +/** + * skb_shared_tx - instructions for time stamping of outgoing packets + * + * @hardware: generate hardware time stamp + * @software: generate software time stamp + * @in_progress: device driver is going to provide + * hardware time stamp + * + * These flags are attached to packets as part of the + * &skb_shared_info. Use skb_tx() to get a pointer. + */ +union skb_shared_tx { + struct { + __u8 hardware:1, + software:1, + in_progress:1; + }; + __u8 flags; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -143,10 +194,12 @@ struct skb_shared_info { unsigned short gso_segs; unsigned short gso_type; __be32 ip6_frag_id; + union skb_shared_tx tx_flags; #ifdef CONFIG_HAS_DMA unsigned int num_dma_maps; #endif struct sk_buff *frag_list; + struct skb_shared_hwtstamps hwtstamps; skb_frag_t frags[MAX_SKB_FRAGS]; #ifdef CONFIG_HAS_DMA dma_addr_t dma_maps[MAX_SKB_FRAGS + 1]; @@ -465,6 +518,16 @@ static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) +static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) +{ + return &skb_shinfo(skb)->hwtstamps; +} + +static inline union skb_shared_tx *skb_tx(struct sk_buff *skb) +{ + return &skb_shinfo(skb)->tx_flags; +} + /** * skb_queue_empty - check if a queue is empty * @list: queue head @@ -1730,6 +1793,11 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, extern void skb_init(void); +static inline ktime_t skb_get_ktime(const struct sk_buff *skb) +{ + return skb->tstamp; +} + /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from @@ -1739,11 +1807,18 @@ extern void skb_init(void); * This function converts the offset back to a struct timeval and stores * it in stamp. 
*/ -static inline void skb_get_timestamp(const struct sk_buff *skb, struct timeval *stamp) +static inline void skb_get_timestamp(const struct sk_buff *skb, + struct timeval *stamp) { *stamp = ktime_to_timeval(skb->tstamp); } +static inline void skb_get_timestampns(const struct sk_buff *skb, + struct timespec *stamp) +{ + *stamp = ktime_to_timespec(skb->tstamp); +} + static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); @@ -1759,6 +1834,20 @@ static inline ktime_t net_invalid_timestamp(void) return ktime_set(0, 0); } +/** + * skb_tstamp_tx - queue clone of skb with send time stamps + * @orig_skb: the original outgoing packet + * @hwtstamps: hardware time stamps, may be NULL if not available + * + * If the skb has a socket associated, then this function clones the + * skb (thus sharing the actual data and optional structures), stores + * the optional hardware time stamping information (if non NULL) or + * generates a software time stamp (otherwise), then queues the clone + * to the error queue of the socket. Errors are silently ignored. + */ +extern void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps); + extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1e27a67df242..d20c28e839d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1672,10 +1672,21 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } +static void tstamp_tx(struct sk_buff *skb) +{ + union skb_shared_tx *shtx = + skb_tx(skb); + if (unlikely(shtx->software && + !shtx->in_progress)) { + skb_tstamp_tx(skb, NULL); + } +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; + int rc; prefetch(&dev->netdev_ops->ndo_start_xmit); if (likely(!skb->next)) { @@ -1689,13 +1700,29 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } - return ops->ndo_start_xmit(skb, dev); + rc = ops->ndo_start_xmit(skb, dev); + /* + * TODO: if skb_orphan() was called by + * dev->hard_start_xmit() (for example, the unmodified + * igb driver does that; bnx2 doesn't), then + * skb_tx_software_timestamp() will be unable to send + * back the time stamp. + * + * How can this be prevented? Always create another + * reference to the socket before calling + * dev->hard_start_xmit()? Prevent that skb_orphan() + * does anything in dev->hard_start_xmit() by clearing + * the skb destructor before the call and restoring it + * afterwards, then doing the skb_orphan() ourselves? 
+ */ + if (likely(!rc)) + tstamp_tx(skb); + return rc; } gso: do { struct sk_buff *nskb = skb->next; - int rc; skb->next = nskb->next; nskb->next = NULL; @@ -1705,6 +1732,7 @@ gso: skb->next = nskb; return rc; } + tstamp_tx(skb); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ab7d2e9f02fa..e5a8351ff12d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -215,7 +216,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo->gso_segs = 0; shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; + shinfo->tx_flags.flags = 0; shinfo->frag_list = NULL; + memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); if (fclone) { struct sk_buff *child = skb + 1; @@ -2945,6 +2948,44 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) } EXPORT_SYMBOL_GPL(skb_cow_data); +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = orig_skb->sk; + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int err; + + if (!sk) + return; + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + if (hwtstamps) { + *skb_hwtstamps(skb) = + *hwtstamps; + } else { + /* + * no hardware time stamps available, + * so keep the skb_shared_tx and only + * store software time stamp + */ + skb->tstamp = ktime_get_real(); + } + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_tstamp_tx); + + /** * skb_partial_csum_set - set up and verify partial csum values for packet * @skb: the skb to set -- cgit v1.2.3-71-gd317 From 4458f04c02a46c679a90ef71f866a415c192deb4 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 13 Feb 2009 08:33:42 +0000 Subject: sctp: Clean up sctp checksumming code The sctp crc32c checksum is always generated in little endian. So, we clean up the code to treat it as little endian and remove all the __force casts. Suggested by Herbert Xu. Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 2 +- include/net/sctp/checksum.h | 14 +++++++------- net/sctp/input.c | 11 ++++++----- net/sctp/output.c | 14 ++++++-------- 4 files changed, 20 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index bd50b371ffaa..c2731bfe04d8 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -60,7 +60,7 @@ typedef struct sctphdr { __be16 source; __be16 dest; __be32 vtag; - __be32 checksum; + __le32 checksum; } __attribute__((packed)) sctp_sctphdr_t; #ifdef __KERNEL__ diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 2fec3c366e81..befc8d2a1b9f 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -46,14 +46,14 @@ #include #include -static inline __be32 sctp_crc32c(__be32 crc, u8 *buffer, u16 length) +static inline __u32 sctp_crc32c(__u32 crc, u8 *buffer, u16 length) { - return (__force __be32)crc32c((__force u32)crc, buffer, length); + return crc32c(crc, buffer, length); } -static inline __be32 sctp_start_cksum(__u8 *buffer, __u16 length) +static inline __u32 sctp_start_cksum(__u8 *buffer, __u16 length) { - __be32 crc = ~cpu_to_be32(0); + __u32 crc = ~(__u32)0; __u8 zero[sizeof(__u32)] = {0}; /* Optimize this routine to be SCTP specific, knowing how @@ -72,12 +72,12 @@ static inline __be32 sctp_start_cksum(__u8 *buffer, __u16 length) return crc; } -static inline __be32 sctp_update_cksum(__u8 *buffer, __u16 length, __be32 crc32) +static inline __u32 sctp_update_cksum(__u8 *buffer, __u16 length, __u32 crc32) { return sctp_crc32c(crc32, buffer, length); } -static inline __be32 sctp_end_cksum(__be32 crc32) +static inline __le32 sctp_end_cksum(__be32 crc32) { - return (__force __be32)~cpu_to_le32((__force u32)crc32); + return cpu_to_le32(~crc32); } diff --git a/net/sctp/input.c b/net/sctp/input.c index 693fd0804810..d2e98803ffe3 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -83,14 +83,15 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb) { struct sk_buff *list = skb_shinfo(skb)->frag_list; struct sctphdr *sh = sctp_hdr(skb); - __be32 cmp = sh->checksum; - __be32 val = sctp_start_cksum((__u8 *)sh, skb_headlen(skb)); + __le32 cmp = sh->checksum; + __le32 val; + __u32 tmp = sctp_start_cksum((__u8 *)sh, skb_headlen(skb)); for (; list; list = list->next) - val = sctp_update_cksum((__u8 *)list->data, skb_headlen(list), - val); + tmp = sctp_update_cksum((__u8 *)list->data, skb_headlen(list), + tmp); - val = sctp_end_cksum(val); + val = sctp_end_cksum(tmp); if (val != cmp) { /* CRC failure, dump it. */ diff --git a/net/sctp/output.c b/net/sctp/output.c index 2d65b7a7330b..07d58903a746 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -367,7 +367,6 @@ int sctp_packet_transmit(struct sctp_packet *packet) struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctphdr *sh; - __be32 crc32 = cpu_to_be32(0); struct sk_buff *nskb; struct sctp_chunk *chunk, *tmp; struct sock *sk; @@ -532,16 +531,15 @@ int sctp_packet_transmit(struct sctp_packet *packet) * by CRC32-C as described in . */ if (!sctp_checksum_disable && !(dst->dev->features & NETIF_F_NO_CSUM)) { - crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); - crc32 = sctp_end_cksum(crc32); + __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); + + /* 3) Put the resultant value into the checksum field in the + * common header, and leave the rest of the bits unchanged. 
+ */ + sh->checksum = sctp_end_cksum(crc32); } else nskb->ip_summed = CHECKSUM_UNNECESSARY; - /* 3) Put the resultant value into the checksum field in the - * common header, and leave the rest of the bits unchanged. - */ - sh->checksum = crc32; - /* IP layer ECN support * From RFC 2481 * "The ECN-Capable Transport (ECT) bit would be set by the -- cgit v1.2.3-71-gd317 From f6180773d90595650e11de0118bb112018290915 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 00:40:25 -0500 Subject: ftrace: add command interface for function selection Allow for other tracers to add their own commands for function selection. This interface gives a trace the ability to name a command for function selection. Right now it is pretty limited in what it offers, but this is a building step for more features. The :mod: command is converted to this interface and also serves as a template for other implementations. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 16 ++++++++ kernel/trace/ftrace.c | 106 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 111 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 106b7909d500..f0a0ecc63b5c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -95,6 +95,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, loff_t *ppos); #endif +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(char *func, char *cmd, + char *params, int enable); +}; + #ifdef CONFIG_DYNAMIC_FTRACE /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include @@ -119,6 +126,9 @@ struct dyn_ftrace { int ftrace_force_update(void); void ftrace_set_filter(unsigned char *buf, int len, int reset); +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); @@ -202,6 +212,12 @@ extern void ftrace_enable_daemon(void); # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) static inline void ftrace_release(void *start, unsigned long size) { } +static inline int register_ftrace_command(struct ftrace_func_command *cmd) +{ +} +static inline int unregister_ftrace_command(char *cmd_name) +{ +} #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 340f88b68d9e..45a44c402566 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1239,9 +1239,93 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) spin_unlock(&ftrace_lock); } +/* + * We register the module command as a template to show others how + * to register the a command as well. + */ + +static int +ftrace_mod_callback(char *func, char *cmd, char *param, int enable) +{ + char *mod; + + /* + * cmd == 'mod' because we only registered this func + * for the 'mod' ftrace_func_command. + * But if you register one func with multiple commands, + * you can tell which command was used by the cmd + * parameter. 
+ */ + + /* we must have a module name */ + if (!param) + return -EINVAL; + + mod = strsep(¶m, ":"); + if (!strlen(mod)) + return -EINVAL; + + ftrace_match_module_records(func, mod, enable); + return 0; +} + +static struct ftrace_func_command ftrace_mod_cmd = { + .name = "mod", + .func = ftrace_mod_callback, +}; + +static int __init ftrace_mod_cmd_init(void) +{ + return register_ftrace_command(&ftrace_mod_cmd); +} +device_initcall(ftrace_mod_cmd_init); + +static LIST_HEAD(ftrace_commands); +static DEFINE_MUTEX(ftrace_cmd_mutex); + +int register_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p; + int ret = 0; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &ftrace_commands); + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +int unregister_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + static int ftrace_process_regex(char *buff, int len, int enable) { - char *func, *mod, *command, *next = buff; + struct ftrace_func_command *p; + char *func, *command, *next = buff; + int ret = -EINVAL; func = strsep(&next, ":"); @@ -1250,21 +1334,21 @@ static int ftrace_process_regex(char *buff, int len, int enable) return 0; } - /* command fonud */ + /* command found */ command = strsep(&next, ":"); - if (strcmp(command, "mod") == 0) { - /* only match modules */ - if (!next) - return -EINVAL; - - mod = strsep(&next, ":"); - ftrace_match_module_records(func, mod, enable); - return 0; + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(func, command, next, enable); + goto out_unlock; + } } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); - return -EINVAL; + return ret; } static ssize_t -- cgit v1.2.3-71-gd317 From 59df055f1991c9fc0c71a9230663c39188f6972f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 15:29:06 -0500 Subject: ftrace: trace different functions with a different tracer Impact: new feature Currently, the function tracer only gives you an ability to hook a tracer to all functions being traced. The dynamic function trace allows you to pick and choose which of those functions will be traced, but all functions being traced will call all tracers that registered with the function tracer. This patch adds a new feature that allows a tracer to hook to specific functions, even when all functions are being traced. It allows for different functions to call different tracer hooks. The way this is accomplished is by a special function that will hook to the function tracer and will set up a hash table knowing which tracer hook to call with which function. This is the most general and easiest method to accomplish this. Later, an arch may choose to supply their own method in changing the mcount call of a function to call a different tracer. But that will be an exercise for the future. 
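As a quick illustration before the interface is spelled out below, a hypothetical in-kernel user (the "sched_*" glob and all names here are made up for this sketch, and the counting is not meant to be SMP-exact) could count hits on a set of functions roughly like this:

    #include <linux/ftrace.h>
    #include <linux/init.h>
    #include <linux/kernel.h>

    /* the per-entry data slot is reused as a simple hit counter */
    static void count_hook(unsigned long ip, unsigned long parent_ip,
                           void **data)
    {
            long *count = (long *)data;

            (*count)++;
    }

    static struct ftrace_hook_ops count_ops = {
            .func = count_hook,
    };

    static int __init count_hook_init(void)
    {
            int ret;

            /* hook every traced function whose name matches the glob;
             * NULL data means each entry's counter starts at zero */
            ret = register_ftrace_function_hook("sched_*", &count_ops, NULL);
            return ret < 0 ? ret : 0;
    }
    device_initcall(count_hook_init);

The hooks would later be removed again with unregister_ftrace_function_hook_func("sched_*", &count_ops).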
To register a function: struct ftrace_hook_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); int (*callback)(unsigned long ip, void **data); void (*free)(void **data); }; int register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, void *data); glob is a simple glob to search for the functions to hook. ops is a pointer to the operations (listed below) data is the default data to be passed to the hook functions when traced ops: func is the hook function to call when the functions are traced callback is a callback function that is called when setting up the hash. That is, if the tracer needs to do something special for each function, that is being traced, and wants to give each function its own data. The address of the entry data is passed to this callback, so that the callback may wish to update the entry to whatever it would like. free is a callback for when the entry is freed. In case the tracer allocated any data, it is give the chance to free it. To unregister we have three functions: void unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, void *data) This will unregister all hooks that match glob, point to ops, and have its data matching data. (note, if glob is NULL, blank or '*', all functions will be tested). void unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) This will unregister all functions matching glob that has an entry pointing to ops. void unregister_ftrace_function_hook_all(char *glob) This simply unregisters all funcs. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 18 ++++ kernel/trace/ftrace.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 265 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f0a0ecc63b5c..13918c4400ad 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,6 +106,24 @@ struct ftrace_func_command { /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +struct ftrace_hook_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + void **data); + int (*callback)(unsigned long ip, void **data); + void (*free)(void **data); +}; + +extern int +register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data); +extern void +unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data); +extern void +unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops); +extern void unregister_ftrace_function_hook_all(char *glob); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 157d4f68b0e0..0b80e325f296 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -1245,6 +1246,252 @@ static int __init ftrace_mod_cmd_init(void) } device_initcall(ftrace_mod_cmd_init); +#define FTRACE_HASH_BITS 7 +#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +struct ftrace_func_hook { + struct hlist_node node; + struct ftrace_hook_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + +static void +function_trace_hook_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_func_hook *entry; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + int resched; + + key = hash_long(ip, 
FTRACE_HASH_BITS); + + hhd = &ftrace_func_hash[key]; + + if (hlist_empty(hhd)) + return; + + /* + * Disable preemption for these calls to prevent a RCU grace + * period. This syncs the hash iteration and freeing of items + * on the hash. rcu_read_lock is too dangerous here. + */ + resched = ftrace_preempt_disable(); + hlist_for_each_entry_rcu(entry, n, hhd, node) { + if (entry->ip == ip) + entry->ops->func(ip, parent_ip, &entry->data); + } + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_hook_ops __read_mostly = +{ + .func = function_trace_hook_call, +}; + +static int ftrace_hook_registered; + +static void __enable_ftrace_function_hook(void) +{ + int i; + + if (ftrace_hook_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + break; + } + /* Nothing registered? */ + if (i == FTRACE_FUNC_HASHSIZE) + return; + + __register_ftrace_function(&trace_hook_ops); + ftrace_startup(0); + ftrace_hook_registered = 1; +} + +static void __disable_ftrace_function_hook(void) +{ + int i; + + if (!ftrace_hook_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + return; + } + + /* no more funcs left */ + __unregister_ftrace_function(&trace_hook_ops); + ftrace_shutdown(0); + ftrace_hook_registered = 0; +} + + +static void ftrace_free_entry_rcu(struct rcu_head *rhp) +{ + struct ftrace_func_hook *entry = + container_of(rhp, struct ftrace_func_hook, rcu); + + if (entry->ops->free) + entry->ops->free(&entry->data); + kfree(entry); +} + + +int +register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data) +{ + struct ftrace_func_hook *entry; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long key; + int type, len, not; + int count = 0; + char *search; + + type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + len = strlen(search); + + /* we do not support '!' for function hooks */ + if (WARN_ON(not)) + return -EINVAL; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_FAILED) + continue; + + if (!ftrace_match_record(rec, search, len, type)) + continue; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + /* If we did not hook to any, then return error */ + if (!count) + count = -ENOMEM; + goto out_unlock; + } + + count++; + + entry->data = data; + + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. 
+ */ + if (ops->callback) { + if (ops->callback(rec->ip, &entry->data) < 0) { + /* caller does not like this func */ + kfree(entry); + continue; + } + } + + entry->ops = ops; + entry->ip = rec->ip; + + key = hash_long(entry->ip, FTRACE_HASH_BITS); + hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); + + } while_for_each_ftrace_rec(); + __enable_ftrace_function_hook(); + + out_unlock: + mutex_unlock(&ftrace_lock); + + return count; +} + +enum { + HOOK_TEST_FUNC = 1, + HOOK_TEST_DATA = 2 +}; + +static void +__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data, int flags) +{ + struct ftrace_func_hook *entry; + struct hlist_node *n, *tmp; + char str[KSYM_SYMBOL_LEN]; + int type = MATCH_FULL; + int i, len = 0; + char *search; + + if (glob && (strcmp(glob, "*") || !strlen(glob))) + glob = NULL; + else { + int not; + + type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + len = strlen(search); + + /* we do not support '!' for function hooks */ + if (WARN_ON(not)) + return; + } + + mutex_lock(&ftrace_lock); + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + + hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { + + /* break up if statements for readability */ + if ((flags & HOOK_TEST_FUNC) && entry->ops != ops) + continue; + + if ((flags & HOOK_TEST_DATA) && entry->data != data) + continue; + + /* do this last, since it is the most expensive */ + if (glob) { + kallsyms_lookup(entry->ip, NULL, NULL, + NULL, str); + if (!ftrace_match(str, glob, len, type)) + continue; + } + + hlist_del(&entry->node); + call_rcu(&entry->rcu, ftrace_free_entry_rcu); + } + } + __disable_ftrace_function_hook(); + mutex_unlock(&ftrace_lock); +} + +void +unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data) +{ + __unregister_ftrace_function_hook(glob, ops, data, + HOOK_TEST_FUNC | HOOK_TEST_DATA); +} + +void +unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) +{ + __unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC); +} + +void unregister_ftrace_function_hook_all(char *glob) +{ + __unregister_ftrace_function_hook(glob, NULL, NULL, 0); +} + static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); -- cgit v1.2.3-71-gd317 From 988ae9d6b2bc3ebdc1a488490250a6812f85e9d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 19:17:02 -0500 Subject: ring-buffer: add tracing_is_on to test if ring buffer is enabled This patch adds the tracing_is_on() interface to tell if the ring buffer is turned on or not. 
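(Illustrative aside, not from the patch: one way a caller might use the new query. The my_handle_error() name is made up; tracing_on(), tracing_off() and tracing_off_permanent() already exist, tracing_is_on() merely reports the current state.)

#include <linux/kernel.h>
#include <linux/ring_buffer.h>

/*
 * Freeze the trace on the first error so the events leading up to it
 * are preserved; later errors find the buffers already stopped.
 */
static void my_handle_error(void)
{
	if (tracing_is_on()) {
		tracing_off();
		pr_info("trace frozen after first error\n");
	} else {
		pr_info("trace was already frozen by an earlier error\n");
	}
}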
Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8e6646a54acf..f5e793d69bd3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -128,10 +128,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +int tracing_is_on(void); #else static inline void tracing_on(void) { } static inline void tracing_off(void) { } static inline void tracing_off_permanent(void) { } +static inline int tracing_is_on(void) { return 0; } #endif void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2b4626ce95d6..8f19f1aa42b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -98,6 +98,15 @@ void tracing_off_permanent(void) set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + return ring_buffer_flags == RB_BUFFERS_ON; +} +EXPORT_SYMBOL_GPL(tracing_is_on); + #include "trace.h" /* Up this if you want to test the TIME_EXTENTS and normalization */ -- cgit v1.2.3-71-gd317 From 809dcf29ce4e1723709910878e050bd187617e0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 23:06:01 -0500 Subject: ftrace: add pretty print to selected fuction traces This patch adds a call back for the tracers that have hooks to selected functions. This allows the tracer to show better output in the set_ftrace_filter file. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 6 ++++++ kernel/trace/ftrace.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 13918c4400ad..b331e216d8a1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,12 +106,18 @@ struct ftrace_func_command { /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +struct seq_file; + struct ftrace_hook_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); int (*callback)(unsigned long ip, void **data); void (*free)(void **data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_hook_ops *ops, + void *data); }; extern int diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e058848cddb..6533c1d20155 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -834,6 +834,9 @@ static int t_hash_show(struct seq_file *m, void *v) rec = hlist_entry(hnd, struct ftrace_func_hook, node); + if (rec->ops->print) + return rec->ops->print(m, rec->ip, rec->ops, rec->data); + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, "%s:", str); -- cgit v1.2.3-71-gd317 From 97d0bb8dcd8c2812e1927cdb51d7b1f9c98352b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 11:47:39 +0100 Subject: ftrace: fix !CONFIG_FTRACE [un_]register_ftrace_command() prototypes Impact: build fix Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b331e216d8a1..63281228ce3e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -238,9 +238,11 @@ extern void ftrace_enable_daemon(void); static inline void ftrace_release(void *start, 
unsigned long size) { } static inline int register_ftrace_command(struct ftrace_func_command *cmd) { + return -EINVAL; } static inline int unregister_ftrace_command(char *cmd_name) { + return -EINVAL; } #endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v1.2.3-71-gd317 From b6887d7916e44c1d8913084fb6aa5004d9473f1a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 12:32:04 -0500 Subject: ftrace: rename _hook to _probe Impact: clean up Ingo Molnar did not like the _hook naming convention used by the select function tracer. Luis Claudio R. Goncalves suggested using the "_probe" extension. This patch implements the change of calling the functions and variables "_hook" and replacing them with "_probe". Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 12 +++---- kernel/trace/ftrace.c | 78 +++++++++++++++++++++--------------------- kernel/trace/trace_functions.c | 26 +++++++------- 3 files changed, 58 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 63281228ce3e..9d224c43e634 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -108,7 +108,7 @@ struct ftrace_func_command { struct seq_file; -struct ftrace_hook_ops { +struct ftrace_probe_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); @@ -116,19 +116,19 @@ struct ftrace_hook_ops { void (*free)(void **data); int (*print)(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, + struct ftrace_probe_ops *ops, void *data); }; extern int -register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void -unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void -unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops); -extern void unregister_ftrace_function_hook_all(char *glob); +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); +extern void unregister_ftrace_function_probe_all(char *glob); enum { FTRACE_FL_FREE = (1 << 0), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index af9d95c0e4de..330a059f6ed7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -255,9 +255,9 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; -struct ftrace_func_hook { +struct ftrace_func_probe { struct hlist_node node; - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; unsigned long flags; unsigned long ip; void *data; @@ -830,11 +830,11 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) static int t_hash_show(struct seq_file *m, void *v) { - struct ftrace_func_hook *rec; + struct ftrace_func_probe *rec; struct hlist_node *hnd = v; char str[KSYM_SYMBOL_LEN]; - rec = hlist_entry(hnd, struct ftrace_func_hook, node); + rec = hlist_entry(hnd, struct ftrace_func_probe, node); if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1351,9 +1351,9 @@ static int __init ftrace_mod_cmd_init(void) device_initcall(ftrace_mod_cmd_init); static void -function_trace_hook_call(unsigned long ip, unsigned long parent_ip) +function_trace_probe_call(unsigned long ip, unsigned long parent_ip) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct hlist_head *hhd; struct hlist_node *n; 
unsigned long key; @@ -1379,18 +1379,18 @@ function_trace_hook_call(unsigned long ip, unsigned long parent_ip) ftrace_preempt_enable(resched); } -static struct ftrace_ops trace_hook_ops __read_mostly = +static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_hook_call, + .func = function_trace_probe_call, }; -static int ftrace_hook_registered; +static int ftrace_probe_registered; -static void __enable_ftrace_function_hook(void) +static void __enable_ftrace_function_probe(void) { int i; - if (ftrace_hook_registered) + if (ftrace_probe_registered) return; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { @@ -1402,16 +1402,16 @@ static void __enable_ftrace_function_hook(void) if (i == FTRACE_FUNC_HASHSIZE) return; - __register_ftrace_function(&trace_hook_ops); + __register_ftrace_function(&trace_probe_ops); ftrace_startup(0); - ftrace_hook_registered = 1; + ftrace_probe_registered = 1; } -static void __disable_ftrace_function_hook(void) +static void __disable_ftrace_function_probe(void) { int i; - if (!ftrace_hook_registered) + if (!ftrace_probe_registered) return; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { @@ -1421,16 +1421,16 @@ static void __disable_ftrace_function_hook(void) } /* no more funcs left */ - __unregister_ftrace_function(&trace_hook_ops); + __unregister_ftrace_function(&trace_probe_ops); ftrace_shutdown(0); - ftrace_hook_registered = 0; + ftrace_probe_registered = 0; } static void ftrace_free_entry_rcu(struct rcu_head *rhp) { - struct ftrace_func_hook *entry = - container_of(rhp, struct ftrace_func_hook, rcu); + struct ftrace_func_probe *entry = + container_of(rhp, struct ftrace_func_probe, rcu); if (entry->ops->free) entry->ops->free(&entry->data); @@ -1439,10 +1439,10 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp) int -register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct ftrace_page *pg; struct dyn_ftrace *rec; int type, len, not; @@ -1453,7 +1453,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); len = strlen(search); - /* we do not support '!' for function hooks */ + /* we do not support '!' 
for function probes */ if (WARN_ON(not)) return -EINVAL; @@ -1468,7 +1468,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { - /* If we did not hook to any, then return error */ + /* If we did not process any, then return error */ if (!count) count = -ENOMEM; goto out_unlock; @@ -1498,7 +1498,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); } while_for_each_ftrace_rec(); - __enable_ftrace_function_hook(); + __enable_ftrace_function_probe(); out_unlock: mutex_unlock(&ftrace_lock); @@ -1507,15 +1507,15 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, } enum { - HOOK_TEST_FUNC = 1, - HOOK_TEST_DATA = 2 + PROBE_TEST_FUNC = 1, + PROBE_TEST_DATA = 2 }; static void -__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct hlist_node *n, *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; @@ -1530,7 +1530,7 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); len = strlen(search); - /* we do not support '!' for function hooks */ + /* we do not support '!' for function probes */ if (WARN_ON(not)) return; } @@ -1542,10 +1542,10 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { /* break up if statements for readability */ - if ((flags & HOOK_TEST_FUNC) && entry->ops != ops) + if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) continue; - if ((flags & HOOK_TEST_DATA) && entry->data != data) + if ((flags & PROBE_TEST_DATA) && entry->data != data) continue; /* do this last, since it is the most expensive */ @@ -1560,27 +1560,27 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, call_rcu(&entry->rcu, ftrace_free_entry_rcu); } } - __disable_ftrace_function_hook(); + __disable_ftrace_function_probe(); mutex_unlock(&ftrace_lock); } void -unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { - __unregister_ftrace_function_hook(glob, ops, data, - HOOK_TEST_FUNC | HOOK_TEST_DATA); + __unregister_ftrace_function_probe(glob, ops, data, + PROBE_TEST_FUNC | PROBE_TEST_DATA); } void -unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { - __unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC); + __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); } -void unregister_ftrace_function_hook_all(char *glob) +void unregister_ftrace_function_probe_all(char *glob) { - __unregister_ftrace_function_hook(glob, NULL, NULL, 0); + __unregister_ftrace_function_probe(glob, NULL, NULL, 0); } static LIST_HEAD(ftrace_commands); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 021a574c5988..6ea73ed03bfa 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -269,21 +269,21 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, void 
*data); + struct ftrace_probe_ops *ops, void *data); -static struct ftrace_hook_ops traceon_hook_ops = { +static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_trace_onoff_print, }; -static struct ftrace_hook_ops traceoff_hook_ops = { +static struct ftrace_probe_ops traceoff_probe_ops = { .func = ftrace_traceoff, .print = ftrace_trace_onoff_print, }; static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, void *data) + struct ftrace_probe_ops *ops, void *data) { char str[KSYM_SYMBOL_LEN]; long count = (long)data; @@ -291,7 +291,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, kallsyms_lookup(ip, NULL, NULL, NULL, str); seq_printf(m, "%s:", str); - if (ops == &traceon_hook_ops) + if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); else seq_printf(m, "traceoff"); @@ -306,15 +306,15 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, static int ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) { - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) - ops = &traceon_hook_ops; + ops = &traceon_probe_ops; else - ops = &traceoff_hook_ops; + ops = &traceoff_probe_ops; - unregister_ftrace_function_hook_func(glob, ops); + unregister_ftrace_function_probe_func(glob, ops); return 0; } @@ -322,7 +322,7 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) static int ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) { - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; @@ -336,9 +336,9 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) - ops = &traceon_hook_ops; + ops = &traceon_probe_ops; else - ops = &traceoff_hook_ops; + ops = &traceoff_probe_ops; if (!param) goto out_reg; @@ -357,7 +357,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) return ret; out_reg: - ret = register_ftrace_function_hook(glob, ops, count); + ret = register_ftrace_function_probe(glob, ops, count); return ret; } -- cgit v1.2.3-71-gd317 From 886a63e865eb346ab20572e802fc3118cb9aee14 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sat, 14 Feb 2009 11:36:20 +0000 Subject: drivers/net/hamradio: fix sparse warnings: fix signedness Fix this sparse warnings: drivers/net/hamradio/hdlcdrv.c:274:34: warning: incorrect type in argument 2 (different signedness) drivers/net/hamradio/hdlcdrv.c:279:47: warning: incorrect type in argument 2 (different signedness) drivers/net/hamradio/hdlcdrv.c:288:39: warning: incorrect type in argument 2 (different signedness) drivers/net/hamradio/hdlcdrv.c:300:47: warning: incorrect type in argument 2 (different signedness) Signed-off-by: Hannes Eder Signed-off-by: David S. 
Miller --- include/linux/hdlcdrv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hdlcdrv.h b/include/linux/hdlcdrv.h index 0821bac62b83..c010b4a785b8 100644 --- a/include/linux/hdlcdrv.h +++ b/include/linux/hdlcdrv.h @@ -215,7 +215,7 @@ struct hdlcdrv_state { struct hdlcdrv_hdlctx { struct hdlcdrv_hdlcbuffer hbuf; - long in_hdlc_tx; + unsigned long in_hdlc_tx; /* * 0 = send flags * 1 = send txtail (flags) -- cgit v1.2.3-71-gd317 From 3f683d6175748ef9daf4698d9ef5a488dd037063 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 18 Feb 2009 16:56:59 +0800 Subject: crypto: api - Fix crypto_alloc_tfm/create_create_tfm return convention This is based on a report and patch by Geert Uytterhoeven. The functions crypto_alloc_tfm and create_create_tfm return a pointer that needs to be adjusted by the caller when successful and otherwise an error value. This means that the caller has to check for the error and only perform the adjustment if the pointer returned is valid. Since all callers want to make the adjustment and we know how to adjust it ourselves, it's much easier to just return adjusted pointer directly. The only caveat is that we have to return a void * instead of struct crypto_tfm *. However, this isn't that bad because both of these functions are for internal use only (by types code like shash.c, not even algorithms code). This patch also moves crypto_alloc_tfm into crypto/internal.h (crypto_create_tfm is already there) to reflect this. Signed-off-by: Herbert Xu --- crypto/api.c | 15 +++++++-------- crypto/internal.h | 6 ++++-- crypto/shash.c | 18 +++++------------- include/linux/crypto.h | 3 --- 4 files changed, 16 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/crypto/api.c b/crypto/api.c index 56b6e0e66311..22385cac90bb 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -453,8 +453,8 @@ err: } EXPORT_SYMBOL_GPL(crypto_alloc_base); -struct crypto_tfm *crypto_create_tfm(struct crypto_alg *alg, - const struct crypto_type *frontend) +void *crypto_create_tfm(struct crypto_alg *alg, + const struct crypto_type *frontend) { char *mem; struct crypto_tfm *tfm = NULL; @@ -488,9 +488,9 @@ out_free_tfm: crypto_shoot_alg(alg); kfree(mem); out_err: - tfm = ERR_PTR(err); + mem = ERR_PTR(err); out: - return tfm; + return mem; } EXPORT_SYMBOL_GPL(crypto_create_tfm); @@ -514,12 +514,11 @@ EXPORT_SYMBOL_GPL(crypto_create_tfm); * * In case of error the return value is an error pointer. 
*/ -struct crypto_tfm *crypto_alloc_tfm(const char *alg_name, - const struct crypto_type *frontend, - u32 type, u32 mask) +void *crypto_alloc_tfm(const char *alg_name, + const struct crypto_type *frontend, u32 type, u32 mask) { struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask); - struct crypto_tfm *tfm; + void *tfm; int err; type &= frontend->maskclear; diff --git a/crypto/internal.h b/crypto/internal.h index 3c19a27a7563..fc76e1f37fc3 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -109,8 +109,10 @@ void crypto_alg_tested(const char *name, int err); void crypto_shoot_alg(struct crypto_alg *alg); struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, u32 mask); -struct crypto_tfm *crypto_create_tfm(struct crypto_alg *alg, - const struct crypto_type *frontend); +void *crypto_create_tfm(struct crypto_alg *alg, + const struct crypto_type *frontend); +void *crypto_alloc_tfm(const char *alg_name, + const struct crypto_type *frontend, u32 type, u32 mask); int crypto_register_instance(struct crypto_template *tmpl, struct crypto_instance *inst); diff --git a/crypto/shash.c b/crypto/shash.c index 13a0dc150a4d..7a659733f94a 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -18,15 +18,10 @@ #include #include -static const struct crypto_type crypto_shash_type; - -static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm) -{ - return container_of(tfm, struct crypto_shash, base); -} - #include "internal.h" +static const struct crypto_type crypto_shash_type; + static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { @@ -282,8 +277,7 @@ static int crypto_init_shash_ops_async(struct crypto_tfm *tfm) if (!crypto_mod_get(calg)) return -EAGAIN; - shash = __crypto_shash_cast(crypto_create_tfm( - calg, &crypto_shash_type)); + shash = crypto_create_tfm(calg, &crypto_shash_type); if (IS_ERR(shash)) { crypto_mod_put(calg); return PTR_ERR(shash); @@ -391,8 +385,7 @@ static int crypto_init_shash_ops_compat(struct crypto_tfm *tfm) if (!crypto_mod_get(calg)) return -EAGAIN; - shash = __crypto_shash_cast(crypto_create_tfm( - calg, &crypto_shash_type)); + shash = crypto_create_tfm(calg, &crypto_shash_type); if (IS_ERR(shash)) { crypto_mod_put(calg); return PTR_ERR(shash); @@ -480,8 +473,7 @@ static const struct crypto_type crypto_shash_type = { struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask) { - return __crypto_shash_cast( - crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask)); + return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_shash); diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 1f2e9020acc6..29729b834380 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -548,9 +548,6 @@ struct crypto_attr_u32 { * Transform user interface. */ -struct crypto_tfm *crypto_alloc_tfm(const char *alg_name, - const struct crypto_type *frontend, - u32 type, u32 mask); struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask); void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm); -- cgit v1.2.3-71-gd317 From 4a2f965ca5a4e2593744bf75425d85e0e8ff814a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 18 Feb 2009 16:29:44 +0100 Subject: netfilter: x_tables: change elements in x_tables Change to proper type on private pointer rather than anonymous void. Keep active elements on same cache line. 
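(Editor's sketch, not from the patch: the practical effect of the typed pointer for a hypothetical reader of the table. The my_table_size() helper is made up, and the read locking normally taken around ->private is omitted here.)

#include <linux/netfilter/x_tables.h>

/*
 * With ->private now a struct xt_table_info *, the old
 * "(struct xt_table_info *)t->private" cast goes away and the compiler
 * can type-check the access.
 */
static unsigned int my_table_size(const struct xt_table *t)
{
	const struct xt_table_info *info = t->private;

	return info->size;
}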
Signed-off-by: Stephen Hemminger Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index c7ee8744d26b..9fac88fc0e72 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -349,9 +349,6 @@ struct xt_table { struct list_head list; - /* A unique name... */ - const char name[XT_TABLE_MAXNAMELEN]; - /* What hooks you will enter on */ unsigned int valid_hooks; @@ -359,13 +356,15 @@ struct xt_table rwlock_t lock; /* Man behind the curtain... */ - //struct ip6t_table_info *private; - void *private; + struct xt_table_info *private; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; u_int8_t af; /* address/protocol family */ + + /* A unique name... */ + const char name[XT_TABLE_MAXNAMELEN]; }; #include -- cgit v1.2.3-71-gd317 From 74019224ac34b044b44a31dd89a54e3477db4896 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Feb 2009 12:23:29 +0100 Subject: timers: add mod_timer_pending() Impact: new timer API Based on an idea from Martin Josefsson with the help of Patrick McHardy and Stephen Hemminger: introduce the mod_timer_pending() API which is a mod_timer() offspring that is an invariant on already removed timers. (regular mod_timer() re-activates non-pending timers.) This is useful for the networking code in that it can allow unserialized mod_timer_pending() timer-forwarding calls, but a single del_timer*() will stop the timer from being reactivated again. Also while at it: - optimize the regular mod_timer() path some more, the timer-stat and a debug check was needlessly duplicated in __mod_timer(). - make the exports come straight after the function, as most other exports in timer.c already did. - eliminate __mod_timer() as an external API, change the users to mod_timer(). The regular mod_timer() code path is not impacted significantly, due to inlining optimizations and due to the simplifications. Based-on-patch-from: Stephen Hemminger Acked-by: Stephen Hemminger Cc: "David S. 
Miller" Cc: Patrick McHardy Cc: netdev@vger.kernel.org Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Ingo Molnar --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- drivers/infiniband/hw/ipath/ipath_driver.c | 6 +- include/linux/timer.h | 22 +----- kernel/relay.c | 2 +- kernel/timer.c | 110 +++++++++++++++++++---------- 5 files changed, 80 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 6a0ad196aeb3..f085369301b1 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -508,7 +508,7 @@ static void __spu_add_to_rq(struct spu_context *ctx) list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]); set_bit(ctx->prio, spu_prio->bitmap); if (!spu_prio->nr_waiting++) - __mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); + mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); } } diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 69c0ce321b4e..cb9daa6ac029 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -2715,7 +2715,7 @@ static void ipath_hol_signal_up(struct ipath_devdata *dd) * to prevent HoL blocking, then start the HoL timer that * periodically continues, then stop procs, so they can detect * link down if they want, and do something about it. - * Timer may already be running, so use __mod_timer, not add_timer. + * Timer may already be running, so use mod_timer, not add_timer. */ void ipath_hol_down(struct ipath_devdata *dd) { @@ -2724,7 +2724,7 @@ void ipath_hol_down(struct ipath_devdata *dd) dd->ipath_hol_next = IPATH_HOL_DOWNCONT; dd->ipath_hol_timer.expires = jiffies + msecs_to_jiffies(ipath_hol_timeout_ms); - __mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); } /* @@ -2763,7 +2763,7 @@ void ipath_hol_event(unsigned long opaque) else { dd->ipath_hol_timer.expires = jiffies + msecs_to_jiffies(ipath_hol_timeout_ms); - __mod_timer(&dd->ipath_hol_timer, + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); } } diff --git a/include/linux/timer.h b/include/linux/timer.h index daf9685b861c..e2d662e3416e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -86,8 +86,8 @@ static inline int timer_pending(const struct timer_list * timer) extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); -extern int __mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer(struct timer_list *timer, unsigned long expires); +extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); /* * The jiffies value which is added to now, when there is no timer @@ -146,25 +146,7 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) } #endif -/** - * add_timer - start a timer - * @timer: the timer to be added - * - * The kernel will do a ->function(->data) callback from the - * timer interrupt at the ->expires point in the future. The - * current time is 'jiffies'. - * - * The timer's ->expires, ->function (and if the handler uses it, ->data) - * fields must be set prior calling this function. - * - * Timers with an ->expires field in the past will be executed in the next - * timer tick. 
- */ -static inline void add_timer(struct timer_list *timer) -{ - BUG_ON(timer_pending(timer)); - __mod_timer(timer, timer->expires); -} +extern void add_timer(struct timer_list *timer); #ifdef CONFIG_SMP extern int try_to_del_timer_sync(struct timer_list *timer); diff --git a/kernel/relay.c b/kernel/relay.c index 9d79b7854fa6..8f2179c8056f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -750,7 +750,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) * from the scheduler (trying to re-grab * rq->lock), so defer it. */ - __mod_timer(&buf->timer, jiffies + 1); + mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/timer.c b/kernel/timer.c index 13dd64fe143d..9b77fc9a9ac8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -589,11 +589,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } } -int __mod_timer(struct timer_list *timer, unsigned long expires) +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0; + int ret; + + ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -603,6 +606,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer)) { detach_timer(timer, 0); ret = 1; + } else { + if (pending_only) + goto out_unlock; } debug_timer_activate(timer); @@ -629,42 +635,28 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) timer->expires = expires; internal_add_timer(base, timer); + +out_unlock: spin_unlock_irqrestore(&base->lock, flags); return ret; } -EXPORT_SYMBOL(__mod_timer); - /** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies * - * This is not very scalable on SMP. Double adds are not possible. + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers. */ -void add_timer_on(struct timer_list *timer, int cpu) +int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - struct tvec_base *base = per_cpu(tvec_bases, cpu); - unsigned long flags; - - timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); - debug_timer_activate(timer); - internal_add_timer(base, timer); - /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. 
- */ - wake_up_idle_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); + return __mod_timer(timer, expires, true); } +EXPORT_SYMBOL(mod_timer_pending); /** * mod_timer - modify a timer's timeout @@ -688,9 +680,6 @@ void add_timer_on(struct timer_list *timer, int cpu) */ int mod_timer(struct timer_list *timer, unsigned long expires) { - BUG_ON(!timer->function); - - timer_stats_timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -699,11 +688,61 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer->expires == expires && timer_pending(timer)) return 1; - return __mod_timer(timer, expires); + return __mod_timer(timer, expires, false); } - EXPORT_SYMBOL(mod_timer); +/** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ + BUG_ON(timer_pending(timer)); + mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct tvec_base *base = per_cpu(tvec_bases, cpu); + unsigned long flags; + + timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + spin_lock_irqsave(&base->lock, flags); + timer_set_base(timer, base); + debug_timer_activate(timer); + internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); +} + /** * del_timer - deactive a timer. * @timer: the timer to be deactivated @@ -733,7 +772,6 @@ int del_timer(struct timer_list *timer) return ret; } - EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP @@ -767,7 +805,6 @@ out: return ret; } - EXPORT_SYMBOL(try_to_del_timer_sync); /** @@ -796,7 +833,6 @@ int del_timer_sync(struct timer_list *timer) cpu_relax(); } } - EXPORT_SYMBOL(del_timer_sync); #endif @@ -1268,7 +1304,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire); + __mod_timer(&timer, expire, false); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.2.3-71-gd317 From 712406a6bf59ebf4a00358bb59a4a2a1b2953d90 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Feb 2009 10:54:03 -0800 Subject: tracing/function-graph-tracer: make arch generic push pop functions There is nothing really arch specific of the push and pop functions used by the function graph tracer. This patch moves them to generic code. 
Acked-by: Frederic Weisbecker Acked-by: Ingo Molnar Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 25 ------------ arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/ftrace.c | 75 +----------------------------------- include/linux/ftrace.h | 24 ++++++++++++ kernel/trace/trace_functions_graph.c | 75 ++++++++++++++++++++++++++++++++++++ 5 files changed, 101 insertions(+), 99 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b55b4a7fbefd..db24c2278be0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -55,29 +55,4 @@ struct dyn_arch_ftrace { #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -#ifndef __ASSEMBLY__ - -/* - * Stack of return addresses for functions - * of a thread. - * Used in struct thread_info - */ -struct ftrace_ret_stack { - unsigned long ret; - unsigned long func; - unsigned long long calltime; -}; - -/* - * Primary handler of a function return. - * It relays on ftrace_return_to_handler. - * Defined in entry_32/64.S - */ -extern void return_to_handler(void); - -#endif /* __ASSEMBLY__ */ -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6b1f6f6f8661..c0852291b623 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 231bdd3c5b1c..76f7141e0f91 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -389,79 +389,6 @@ void ftrace_nmi_exit(void) #endif /* !CONFIG_DYNAMIC_FTRACE */ -/* Add a function return address to the trace stack on thread info.*/ -static int push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth) -{ - int index; - - if (!current->ret_stack) - return -EBUSY; - - /* The return trace stack is full */ - if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { - atomic_inc(¤t->trace_overrun); - return -EBUSY; - } - - index = ++current->curr_ret_stack; - barrier(); - current->ret_stack[index].ret = ret; - current->ret_stack[index].func = func; - current->ret_stack[index].calltime = time; - *depth = index; - - return 0; -} - -/* Retrieve a function return address to the trace stack on thread info.*/ -static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) -{ - int index; - - index = current->curr_ret_stack; - - if (unlikely(index < 0)) { - ftrace_graph_stop(); - WARN_ON(1); - /* Might as well panic, otherwise we have no where to go */ - *ret = (unsigned long)panic; - return; - } - - *ret = current->ret_stack[index].ret; - trace->func = current->ret_stack[index].func; - trace->calltime = current->ret_stack[index].calltime; - trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = index; - barrier(); - current->curr_ret_stack--; - -} - -/* - * Send the trace to the ring-buffer. - * @return the original return address. - */ -unsigned long ftrace_return_to_handler(void) -{ - struct ftrace_graph_ret trace; - unsigned long ret; - - pop_return_trace(&trace, &ret); - trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_graph_return(&trace); - - if (unlikely(!ret)) { - ftrace_graph_stop(); - WARN_ON(1); - /* Might as well panic. What else to do? 
*/ - ret = (unsigned long)panic; - } - - return ret; -} - /* * Hook the return address and push it in the stack of return addrs * in current thread info. @@ -521,7 +448,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) calltime = cpu_clock(raw_smp_processor_id()); - if (push_return_trace(old, calltime, + if (ftrace_push_return_trace(old, calltime, self_addr, &trace.depth) == -EBUSY) { *parent = old; return; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7e..a7f8134c594e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -379,6 +379,30 @@ struct ftrace_graph_ret { #ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * Stack of return addresses for functions + * of a thread. + * Used in struct thread_info + */ +struct ftrace_ret_stack { + unsigned long ret; + unsigned long func; + unsigned long long calltime; +}; + +/* + * Primary handler of a function return. + * It relays on ftrace_return_to_handler. + * Defined in entry_32/64.S + */ +extern void return_to_handler(void); + +extern int +ftrace_push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func, int *depth); +extern void +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); + /* * Sometimes we don't want to trace a function with the function * graph tracer but we want them to keep traced by the usual function diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 930c08e5b38e..dce71a5b51bc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -42,6 +42,81 @@ static struct tracer_flags tracer_flags = { /* pid on the last trace processed */ static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; +/* Add a function return address to the trace stack on thread info.*/ +int +ftrace_push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func, int *depth) +{ + int index; + + if (!current->ret_stack) + return -EBUSY; + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); + return -EBUSY; + } + + index = ++current->curr_ret_stack; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = time; + *depth = index; + + return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +void +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +{ + int index; + + index = current->curr_ret_stack; + + if (unlikely(index < 0)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = index; + barrier(); + current->curr_ret_stack--; + +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + ftrace_pop_return_trace(&trace, &ret); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_graph_return(&trace); + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? 
*/ + ret = (unsigned long)panic; + } + + return ret; +} + static int graph_trace_init(struct trace_array *tr) { int cpu, ret; -- cgit v1.2.3-71-gd317 From b36128c830a8f5bd7d4981f5b0b69950f5928ee6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: alloc_percpu: change percpu_ptr to per_cpu_ptr Impact: cleanup There are two allocated per-cpu accessor macros with almost identical spelling. The original and far more popular is per_cpu_ptr (44 files), so change over the other 4 files. tj: kill percpu_ptr() and update UP too Signed-off-by: Rusty Russell Cc: mingo@redhat.com Cc: lenb@kernel.org Cc: cpufreq@vger.kernel.org Signed-off-by: Tejun Heo --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- drivers/acpi/processor_perflib.c | 4 ++-- include/linux/percpu.h | 23 +++++++++++------------ kernel/sched.c | 6 +++--- kernel/stop_machine.c | 2 +- 5 files changed, 18 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..22590cf688ae 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 9cc769b587ff..68fd3d292799 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -516,12 +516,12 @@ int acpi_processor_preregister_performance( continue; } - if (!performance || !percpu_ptr(performance, i)) { + if (!performance || !per_cpu_ptr(performance, i)) { retval = -EINVAL; continue; } - pr->performance = percpu_ptr(performance, i); + pr->performance = per_cpu_ptr(performance, i); cpumask_set_cpu(i, pr->performance->shared_cpu_map); if (acpi_processor_get_psd(pr)) { retval = -EINVAL; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 3577ffd90d45..c80cfe1260ec 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -81,23 +81,13 @@ struct percpu_data { }; #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) -/* - * Use this to get to a cpu's version of the per-cpu object dynamically - * allocated. Non-atomic access to the current CPU's version should - * probably be combined with get_cpu()/put_cpu(). - */ -#define percpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = __percpu_disguise(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ -}) extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); extern void percpu_free(void *__pdata); #else /* CONFIG_SMP */ -#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) { @@ -122,6 +112,15 @@ static inline void percpu_free(void *__pdata) cpu_possible_map) #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) #define free_percpu(ptr) percpu_free((ptr)) -#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) +/* + * Use this to get to a cpu's version of the per-cpu object dynamically + * allocated. Non-atomic access to the current CPU's version should + * probably be combined with get_cpu()/put_cpu(). 
+ */ +#define per_cpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ +}) #endif /* __LINUX_PERCPU_H */ diff --git a/kernel/sched.c b/kernel/sched.c index fc17fd91ab57..9d30ac956328 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9472,7 +9472,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9491,7 +9491,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9587,7 +9587,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0cd415ee62a2..74541ca49536 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) * doesn't hit this CPU until we're ready. */ get_cpu(); for_each_online_cpu(i) { - sm_work = percpu_ptr(stop_machine_work, i); + sm_work = per_cpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } -- cgit v1.2.3-71-gd317 From 313e458f81ec3852106c5a83830fe0d4f405a71a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: alloc_percpu: add align argument to __alloc_percpu. This prepares for a real __alloc_percpu, by adding an alignment argument. Only one place uses __alloc_percpu directly, and that's for a string. tj: af_inet also uses __alloc_percpu(), update it. 
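(Illustrative aside, not part of the patch: what a direct caller looks like with the new argument. struct my_stats and my_alloc_stats() are made up. At this stage __alloc_percpu() on SMP is still the legacy percpu_alloc_mask() wrapper, so the alignment is only recorded at the call sites; it is honoured once the real allocator lands.)

#include <linux/types.h>
#include <linux/percpu.h>

struct my_stats {
	u64	packets;
	u64	bytes;
};

static struct my_stats *my_alloc_stats(void)
{
	/* alloc_percpu(struct my_stats) is a thin wrapper around this call. */
	return __alloc_percpu(sizeof(struct my_stats),
			      __alignof__(struct my_stats));
}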
Signed-off-by: Rusty Russell Cc: Christoph Lameter Cc: Jens Axboe --- block/blktrace.c | 2 +- include/linux/percpu.h | 5 +++-- net/ipv4/af_inet.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index 39cc3bfe56e4..487766237d28 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto err; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index c80cfe1260ec..1fdaee93c04d 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -108,9 +108,10 @@ static inline void percpu_free(void *__pdata) /* (legacy) interface for use without CPU hotplug handling */ -#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \ +#define __alloc_percpu(size, align) percpu_alloc_mask((size), GFP_KERNEL, \ cpu_possible_map) -#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) +#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ + __alignof__(type)) #define free_percpu(ptr) percpu_free((ptr)) /* * Use this to get to a cpu's version of the per-cpu object dynamically diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 743f5542d65a..3a3dad801354 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field); int snmp_mib_init(void *ptr[2], size_t mibsize) { BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize); + ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); if (!ptr[0]) goto err0; - ptr[1] = __alloc_percpu(mibsize); + ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); if (!ptr[1]) goto err1; return 0; -- cgit v1.2.3-71-gd317 From f2a8205c4ef1af917d175c36a4097ae5587791c8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: percpu: kill percpu_alloc() and friends Impact: kill unused functions percpu_alloc() and its friends never saw much action. It was supposed to replace the cpu-mask unaware __alloc_percpu() but it never happened and in fact __percpu_alloc_mask() itself never really grew proper up/down handling interface either (no exported interface for populate/depopulate). percpu allocation is about to go through major reimplementation and there's no reason to carry this unused interface around. Replace it with __alloc_percpu() and free_percpu(). Signed-off-by: Tejun Heo --- include/linux/percpu.h | 47 ++++++++++++++++++++++------------------------- mm/allocpercpu.c | 32 +++++++++++++++++++------------- 2 files changed, 41 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1fdaee93c04d..d99e24ae1811 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -82,46 +82,43 @@ struct percpu_data { #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) -extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); -extern void percpu_free(void *__pdata); +/* + * Use this to get to a cpu's version of the per-cpu object + * dynamically allocated. Non-atomic access to the current CPU's + * version should probably be combined with get_cpu()/put_cpu(). 
+ */ +#define per_cpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ +}) + +extern void *__alloc_percpu(size_t size, size_t align); +extern void free_percpu(void *__pdata); #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +static inline void *__alloc_percpu(size_t size, size_t align) { + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. + */ + WARN_ON_ONCE(align > __alignof__(unsigned long long)); return kzalloc(size, gfp); } -static inline void percpu_free(void *__pdata) +static inline void free_percpu(void *p) { - kfree(__pdata); + kfree(p); } #endif /* CONFIG_SMP */ -#define percpu_alloc_mask(size, gfp, mask) \ - __percpu_alloc_mask((size), (gfp), &(mask)) - -#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map) - -/* (legacy) interface for use without CPU hotplug handling */ - -#define __alloc_percpu(size, align) percpu_alloc_mask((size), GFP_KERNEL, \ - cpu_possible_map) #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ __alignof__(type)) -#define free_percpu(ptr) percpu_free((ptr)) -/* - * Use this to get to a cpu's version of the per-cpu object dynamically - * allocated. Non-atomic access to the current CPU's version should - * probably be combined with get_cpu()/put_cpu(). - */ -#define per_cpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = __percpu_disguise(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ -}) #endif /* __LINUX_PERCPU_H */ diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 4297bc41bfd2..3653c570232b 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** - * percpu_alloc_mask - initial setup of per-cpu data + * alloc_percpu - initial setup of per-cpu data * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @mask: populate per-data for cpu's selected through mask bits + * @align: alignment * - * Populating per-cpu data for all online cpu's would be a typical use case, - * which is simplified by the percpu_alloc() wrapper. - * Per-cpu objects are populated with zeroed buffers. + * Allocate dynamic percpu area. Percpu objects are populated with + * zeroed buffers. */ -void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +void *__alloc_percpu(size_t size, size_t align) { /* * We allocate whole cache lines to avoid false sharing */ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, gfp); + void *pdata = kzalloc(sz, GFP_KERNEL); void *__pdata = __percpu_disguise(pdata); + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. 
+ */ + WARN_ON_ONCE(align > __alignof__(unsigned long long)); + if (unlikely(!pdata)) return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) + if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, + &cpu_possible_map))) return __pdata; kfree(pdata); return NULL; } -EXPORT_SYMBOL_GPL(__percpu_alloc_mask); +EXPORT_SYMBOL_GPL(__alloc_percpu); /** - * percpu_free - final cleanup of per-cpu data + * free_percpu - final cleanup of per-cpu data * @__pdata: object to clean up * * We simply clean up any per-cpu object left. No need for the client to * track and specify through a bis mask which per-cpu objects are to free. */ -void percpu_free(void *__pdata) +void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; __percpu_depopulate_mask(__pdata, &cpu_possible_map); kfree(__percpu_disguise(__pdata)); } -EXPORT_SYMBOL_GPL(percpu_free); +EXPORT_SYMBOL_GPL(free_percpu); -- cgit v1.2.3-71-gd317 From f0aa6617903648077dffe5cfcf7c4458f4610fa7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: vmalloc: implement vm_area_register_early() Impact: allow multiple early vm areas There are places where kernel VM area needs to be allocated before vmalloc is initialized. This is done by allocating static vm_struct, initializing several fields and linking it to vmlist and later vmalloc initialization picking up these from vmlist. This is currently done manually and if there's more than one such areas, there's no defined way to arbitrate who gets which address. This patch implements vm_area_register_early(), which takes vm_area struct with flags and size initialized, assigns address to it and puts it on the vmlist. This way, multiple early vm areas can determine which addresses they should use. The only current user - alpha mm init - is converted to use it. Signed-off-by: Tejun Heo --- arch/alpha/mm/init.c | 20 +++++++++++++------- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 5d7a16eab312..df6df025ded4 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -189,9 +189,21 @@ callback_init(void * kernel_end) if (alpha_using_srm) { static struct vm_struct console_remap_vm; - unsigned long vaddr = VMALLOC_START; + unsigned long nr_pages = 0; + unsigned long vaddr; unsigned long i, j; + /* calculate needed size */ + for (i = 0; i < crb->map_entries; ++i) + nr_pages += crb->map[i].count; + + /* register the vm area */ + console_remap_vm.flags = VM_ALLOC; + console_remap_vm.size = nr_pages << PAGE_SHIFT; + vm_area_register_early(&console_remap_vm); + + vaddr = (unsigned long)consle_remap_vm.addr; + /* Set up the third level PTEs and update the virtual addresses of the CRB entries. */ for (i = 0; i < crb->map_entries; ++i) { @@ -213,12 +225,6 @@ callback_init(void * kernel_end) vaddr += PAGE_SIZE; } } - - /* Let vmalloc know that we've allocated some space. 
*/ - console_remap_vm.flags = VM_ALLOC; - console_remap_vm.addr = (void *) VMALLOC_START; - console_remap_vm.size = vaddr - VMALLOC_START; - vmlist = &console_remap_vm; } callback_init_done = 1; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 506e7620a986..bbc051392298 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -106,5 +106,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; +extern __init void vm_area_register_early(struct vm_struct *vm); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c37924a2ee36..d206261ad9ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -982,6 +983,29 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +/** + * vm_area_register_early - register vmap area early during boot + * @vm: vm_struct to register + * @size: size of area to register + * + * This function is used to register kernel vm area before + * vmalloc_init() is called. @vm->size and @vm->flags should contain + * proper values on entry and other fields should be zero. On return, + * vm->addr contains the allocated address. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. + */ +void __init vm_area_register_early(struct vm_struct *vm) +{ + static size_t vm_init_off __initdata; + + vm->addr = (void *)VMALLOC_START + vm_init_off; + vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + + vm->next = vmlist; + vmlist = vm; +} + void __init vmalloc_init(void) { struct vmap_area *va; -- cgit v1.2.3-71-gd317 From 8fc48985006da4ceba24508db64ec77fc0dfe3bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: vmalloc: add un/map_kernel_range_noflush() Impact: two more public map/unmap functions Implement map_kernel_range_noflush() and unmap_kernel_range_noflush(). These functions respectively map and unmap address range in kernel VM area but doesn't do any vcache or tlb flushing. These will be used by new percpu allocator. Signed-off-by: Tejun Heo Cc: Nick Piggin --- include/linux/vmalloc.h | 3 +++ mm/vmalloc.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index bbc051392298..599ba7984310 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -91,6 +91,9 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); +extern int map_kernel_range_noflush(unsigned long start, unsigned long size, + pgprot_t prot, struct page **pages); +extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); /* Allocate/destroy a 'vmalloc' VM area. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d206261ad9ef..224eca9650a8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -153,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. 
pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +static int vmap_page_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; @@ -170,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap(start, end); if (unlikely(err)) return err; return nr; } +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + int ret; + + ret = vmap_page_range_noflush(start, end, prot, pages); + flush_cache_vmap(start, end); + return ret; +} + static inline int is_vmalloc_or_module_addr(const void *x) { /* @@ -1033,6 +1042,58 @@ void __init vmalloc_init(void) vmap_initialized = true; } +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vmap() on to-be-mapped areas + * before calling this function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_page_range_noflush(addr, addr + size, prot, pages); +} + +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vunmap() on to-be-mapped areas + * before calling this function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +{ + vunmap_page_range(addr, addr + size); +} + +/** + * unmap_kernel_range - unmap kernel VM area and flush cache and TLB + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Similar to unmap_kernel_range_noflush() but flushes vcache before + * the unmapping and tlb after. + */ void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; -- cgit v1.2.3-71-gd317 From fbf59bc9d74d1fb30b8e0630743aff2806eafcea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: percpu: implement new dynamic percpu allocator Impact: new scalable dynamic percpu allocator which allows dynamic percpu areas to be accessed the same way as static ones Implement scalable dynamic percpu allocator which can be used for both static and dynamic percpu areas. This will allow static and dynamic areas to share faster direct access methods. This feature is optional and enabled only when CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is defined by arch. Please read comment on top of mm/percpu.c for details. 
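As a rough sketch, not part of this patch, of how the allocator is consumed once an arch defines CONFIG_HAVE_DYNAMIC_PER_CPU_AREA: the hypothetical code below (example_counters, example_init, example_exit are invented names) allocates an object from a chunk, touches each CPU's copy through per_cpu_ptr(), and returns the area when done.

#include <linux/percpu.h>
#include <linux/errno.h>

/* Hypothetical per-cpu object served from a pcpu chunk. */
struct example_counters {
	unsigned long rx;
	unsigned long tx;
};

/* handle returned by __alloc_percpu(), translated by per_cpu_ptr() */
static struct example_counters *counters;

static int __init example_init(void)
{
	int cpu;

	/* the offset inside the unit is the same for every CPU */
	counters = __alloc_percpu(sizeof(*counters),
				  __alignof__(*counters));
	if (!counters)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct example_counters *c = per_cpu_ptr(counters, cpu);

		/* the area is returned zeroed; shown only for clarity */
		c->rx = 0;
		c->tx = 0;
	}
	return 0;
}

static void example_exit(void)
{
	free_percpu(counters);	/* frees the area back to its chunk */
}
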
Signed-off-by: Tejun Heo Cc: Andrew Morton --- include/linux/percpu.h | 22 +- kernel/module.c | 31 ++ mm/Makefile | 4 + mm/percpu.c | 890 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 943 insertions(+), 4 deletions(-) create mode 100644 mm/percpu.c (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d99e24ae1811..18080995ff3e 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -76,23 +76,37 @@ #ifdef CONFIG_SMP -struct percpu_data { - void *ptrs[1]; -}; +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA -#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) +extern void *pcpu_base_addr; +typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); + +extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, + struct page **pages, size_t cpu_size); /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's * version should probably be combined with get_cpu()/put_cpu(). */ +#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) + +#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + +struct percpu_data { + void *ptrs[1]; +}; + +#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) + #define per_cpu_ptr(ptr, cpu) \ ({ \ struct percpu_data *__p = __percpu_disguise(ptr); \ (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); diff --git a/kernel/module.c b/kernel/module.c index 52b3497b8748..1f0657ae555b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -51,6 +51,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -366,6 +367,34 @@ static struct module *find_module(const char *name) } #ifdef CONFIG_SMP + +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) +{ + void *ptr; + + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; + } + + ptr = __alloc_percpu(size, align); + if (!ptr) + printk(KERN_WARNING + "Could not allocate %lu bytes percpu data\n", size); + return ptr; +} + +static void percpu_modfree(void *freeme) +{ + free_percpu(freeme); +} + +#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; /* Size of each block. -ve means used. 
*/ @@ -499,6 +528,8 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) diff --git a/mm/Makefile b/mm/Makefile index 72255be57f89..818569b68f46 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o +ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +obj-$(CONFIG_SMP) += percpu.o +else obj-$(CONFIG_SMP) += allocpercpu.o +endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/percpu.c b/mm/percpu.c new file mode 100644 index 000000000000..4617d97e877c --- /dev/null +++ b/mm/percpu.c @@ -0,0 +1,890 @@ +/* + * linux/mm/percpu.c - percpu memory allocator + * + * Copyright (C) 2009 SUSE Linux Products GmbH + * Copyright (C) 2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * This is percpu allocator which can handle both static and dynamic + * areas. Percpu areas are allocated in chunks in vmalloc area. Each + * chunk is consisted of num_possible_cpus() units and the first chunk + * is used for static percpu variables in the kernel image (special + * boot time alloc/init handling necessary as these areas need to be + * brought up before allocation services are running). Unit grows as + * necessary and all units grow or shrink in unison. When a chunk is + * filled up, another chunk is allocated. ie. in vmalloc area + * + * c0 c1 c2 + * ------------------- ------------------- ------------ + * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u + * ------------------- ...... ------------------- .... ------------ + * + * Allocation is done in offset-size areas of single unit space. Ie, + * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, + * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring + * percpu base registers UNIT_SIZE apart. + * + * There are usually many small percpu allocations many of them as + * small as 4 bytes. The allocator organizes chunks into lists + * according to free size and tries to allocate from the fullest one. + * Each chunk keeps the maximum contiguous area size hint which is + * guaranteed to be eqaul to or larger than the maximum contiguous + * area in the chunk. This helps the allocator not to iterate the + * chunk maps unnecessarily. + * + * Allocation state in each chunk is kept using an array of integers + * on chunk->map. A positive value in the map represents a free + * region and negative allocated. Allocation inside a chunk is done + * by scanning this map sequentially and serving the first matching + * entry. This is mostly copied from the percpu_modalloc() allocator. + * Chunks are also linked into a rb tree to ease address to chunk + * mapping during free. + * + * To use this allocator, arch code should do the followings. 
+ * + * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * + * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate + * regular address to percpu pointer and back + * + * - use pcpu_setup_static() during percpu area initialization to + * setup kernel static percpu area + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PCPU_MIN_UNIT_PAGES_SHIFT 4 /* also max alloc size */ +#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ +#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ + +struct pcpu_chunk { + struct list_head list; /* linked to pcpu_slot lists */ + struct rb_node rb_node; /* key is chunk->vm->addr */ + int free_size; /* free bytes in the chunk */ + int contig_hint; /* max contiguous size hint */ + struct vm_struct *vm; /* mapped vmalloc region */ + int map_used; /* # of map entries used */ + int map_alloc; /* # of map entries allocated */ + int *map; /* allocation map */ + struct page *page[]; /* #cpus * UNIT_PAGES */ +}; + +static int pcpu_unit_pages_shift; +static int pcpu_unit_pages; +static int pcpu_unit_shift; +static int pcpu_unit_size; +static int pcpu_chunk_size; +static int pcpu_nr_slots; +static size_t pcpu_chunk_struct_size; + +/* the address of the first chunk which starts with the kernel static area */ +void *pcpu_base_addr; +EXPORT_SYMBOL_GPL(pcpu_base_addr); + +/* the size of kernel static area */ +static int pcpu_static_size; + +/* + * One mutex to rule them all. + * + * The following mutex is grabbed in the outermost public alloc/free + * interface functions and released only when the operation is + * complete. As such, every function in this file other than the + * outermost functions are called under pcpu_mutex. + * + * It can easily be switched to use spinlock such that only the area + * allocation and page population commit are protected with it doing + * actual [de]allocation without holding any lock. However, given + * what this allocator does, I think it's better to let them run + * sequentially. + */ +static DEFINE_MUTEX(pcpu_mutex); + +static struct list_head *pcpu_slot; /* chunk list slots */ +static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ + +static int pcpu_size_to_slot(int size) +{ + int highbit = fls(size); + return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); +} + +static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) +{ + if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + return 0; + + return pcpu_size_to_slot(chunk->free_size); +} + +static int pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return (cpu << pcpu_unit_pages_shift) + page_idx; +} + +static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return &chunk->page[pcpu_page_idx(cpu, page_idx)]; +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->vm->addr + + (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); +} + +static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, + int page_idx) +{ + return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; +} + +/** + * pcpu_realloc - versatile realloc + * @p: the current pointer (can be NULL for new allocations) + * @size: the current size (can be 0 for new allocations) + * @new_size: the wanted new size (can be 0 for free) + * + * More robust realloc which can be used to allocate, resize or free a + * memory area of arbitrary size. 
If the needed size goes over + * PAGE_SIZE, kernel VM is used. + * + * RETURNS: + * The new pointer on success, NULL on failure. + */ +static void *pcpu_realloc(void *p, size_t size, size_t new_size) +{ + void *new; + + if (new_size <= PAGE_SIZE) + new = kmalloc(new_size, GFP_KERNEL); + else + new = vmalloc(new_size); + if (new_size && !new) + return NULL; + + memcpy(new, p, min(size, new_size)); + if (new_size > size) + memset(new + size, 0, new_size - size); + + if (size <= PAGE_SIZE) + kfree(p); + else + vfree(p); + + return new; +} + +/** + * pcpu_chunk_relocate - put chunk in the appropriate chunk slot + * @chunk: chunk of interest + * @oslot: the previous slot it was on + * + * This function is called after an allocation or free changed @chunk. + * New slot according to the changed state is determined and @chunk is + * moved to the slot. + */ +static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) +{ + int nslot = pcpu_chunk_slot(chunk); + + if (oslot != nslot) { + if (oslot < nslot) + list_move(&chunk->list, &pcpu_slot[nslot]); + else + list_move_tail(&chunk->list, &pcpu_slot[nslot]); + } +} + +static struct rb_node **pcpu_chunk_rb_search(void *addr, + struct rb_node **parentp) +{ + struct rb_node **p = &pcpu_addr_root.rb_node; + struct rb_node *parent = NULL; + struct pcpu_chunk *chunk; + + while (*p) { + parent = *p; + chunk = rb_entry(parent, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) + p = &(*p)->rb_left; + else if (addr > chunk->vm->addr) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * pcpu_chunk_addr_search - search for chunk containing specified address + * @addr: address to search for + * + * Look for chunk which might contain @addr. More specifically, it + * searchs for the chunk with the highest start address which isn't + * beyond @addr. + * + * RETURNS: + * The address of the found chunk. + */ +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +{ + struct rb_node *n, *parent; + struct pcpu_chunk *chunk; + + n = *pcpu_chunk_rb_search(addr, &parent); + if (!n) { + /* no exactly matching chunk, the parent is the closest */ + n = parent; + BUG_ON(!n); + } + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) { + /* the parent was the next one, look for the previous one */ + n = rb_prev(n); + BUG_ON(!n); + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + } + + return chunk; +} + +/** + * pcpu_chunk_addr_insert - insert chunk into address rb tree + * @new: chunk to insert + * + * Insert @new into address rb tree. + */ +static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) +{ + struct rb_node **p, *parent; + + p = pcpu_chunk_rb_search(new->vm->addr, &parent); + BUG_ON(*p); + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, &pcpu_addr_root); +} + +/** + * pcpu_split_block - split a map block + * @chunk: chunk of interest + * @i: index of map block to split + * @head: head size (can be 0) + * @tail: tail size (can be 0) + * + * Split the @i'th map block into two or three blocks. If @head is + * non-zero, @head bytes block is inserted before block @i moving it + * to @i+1 and reducing its size by @head bytes. + * + * If @tail is non-zero, the target block, which can be @i or @i+1 + * depending on @head, is reduced by @tail bytes and @tail byte block + * is inserted after the target block. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) +{ + int nr_extra = !!head + !!tail; + int target = chunk->map_used + nr_extra; + + /* reallocation required? */ + if (chunk->map_alloc < target) { + int new_alloc = chunk->map_alloc; + int *new; + + while (new_alloc < target) + new_alloc *= 2; + + new = pcpu_realloc(chunk->map, + chunk->map_alloc * sizeof(new[0]), + new_alloc * sizeof(new[0])); + if (!new) + return -ENOMEM; + + chunk->map_alloc = new_alloc; + chunk->map = new; + } + + /* insert a new subblock */ + memmove(&chunk->map[i + nr_extra], &chunk->map[i], + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + + if (head) { + chunk->map[i + 1] = chunk->map[i] - head; + chunk->map[i++] = head; + } + if (tail) { + chunk->map[i++] -= tail; + chunk->map[i] = tail; + } + return 0; +} + +/** + * pcpu_alloc_area - allocate area from a pcpu_chunk + * @chunk: chunk of interest + * @size: wanted size + * @align: wanted align + * + * Try to allocate @size bytes area aligned at @align from @chunk. + * Note that this function only allocates the offset. It doesn't + * populate or map the area. + * + * RETURNS: + * Allocated offset in @chunk on success, -errno on failure. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +{ + int oslot = pcpu_chunk_slot(chunk); + int max_contig = 0; + int i, off; + + /* + * The static chunk initially doesn't have map attached + * because kmalloc wasn't available during init. Give it one. + */ + if (unlikely(!chunk->map)) { + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + if (!chunk->map) + return -ENOMEM; + + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = -pcpu_static_size; + if (chunk->free_size) + chunk->map[chunk->map_used++] = chunk->free_size; + } + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { + bool is_last = i + 1 == chunk->map_used; + int head, tail; + + /* extra for alignment requirement */ + head = ALIGN(off, align) - off; + BUG_ON(i == 0 && head != 0); + + if (chunk->map[i] < 0) + continue; + if (chunk->map[i] < head + size) { + max_contig = max(chunk->map[i], max_contig); + continue; + } + + /* + * If head is small or the previous block is free, + * merge'em. Note that 'small' is defined as smaller + * than sizeof(int), which is very small but isn't too + * uncommon for percpu allocations. 
+ */ + if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { + if (chunk->map[i - 1] > 0) + chunk->map[i - 1] += head; + else { + chunk->map[i - 1] -= head; + chunk->free_size -= head; + } + chunk->map[i] -= head; + off += head; + head = 0; + } + + /* if tail is small, just keep it around */ + tail = chunk->map[i] - head - size; + if (tail < sizeof(int)) + tail = 0; + + /* split if warranted */ + if (head || tail) { + if (pcpu_split_block(chunk, i, head, tail)) + return -ENOMEM; + if (head) { + i++; + off += head; + max_contig = max(chunk->map[i - 1], max_contig); + } + if (tail) + max_contig = max(chunk->map[i + 1], max_contig); + } + + /* update hint and mark allocated */ + if (is_last) + chunk->contig_hint = max_contig; /* fully scanned */ + else + chunk->contig_hint = max(chunk->contig_hint, + max_contig); + + chunk->free_size -= chunk->map[i]; + chunk->map[i] = -chunk->map[i]; + + pcpu_chunk_relocate(chunk, oslot); + return off; + } + + chunk->contig_hint = max_contig; /* fully scanned */ + pcpu_chunk_relocate(chunk, oslot); + + /* + * Tell the upper layer that this chunk has no area left. + * Note that this is not an error condition but a notification + * to upper layer that it needs to look at other chunks. + * -ENOSPC is chosen as it isn't used in memory subsystem and + * matches the meaning in a way. + */ + return -ENOSPC; +} + +/** + * pcpu_free_area - free area to a pcpu_chunk + * @chunk: chunk of interest + * @freeme: offset of area to free + * + * Free area starting from @freeme to @chunk. Note that this function + * only modifies the allocation map. It doesn't depopulate or unmap + * the area. + */ +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +{ + int oslot = pcpu_chunk_slot(chunk); + int i, off; + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) + if (off == freeme) + break; + BUG_ON(off != freeme); + BUG_ON(chunk->map[i] > 0); + + chunk->map[i] = -chunk->map[i]; + chunk->free_size += chunk->map[i]; + + /* merge with previous? */ + if (i > 0 && chunk->map[i - 1] >= 0) { + chunk->map[i - 1] += chunk->map[i]; + chunk->map_used--; + memmove(&chunk->map[i], &chunk->map[i + 1], + (chunk->map_used - i) * sizeof(chunk->map[0])); + i--; + } + /* merge with next? */ + if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { + chunk->map[i] += chunk->map[i + 1]; + chunk->map_used--; + memmove(&chunk->map[i + 1], &chunk->map[i + 2], + (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + } + + chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + pcpu_chunk_relocate(chunk, oslot); +} + +/** + * pcpu_unmap - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * @flush: whether to flush cache and tlb or not + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * If @flush is true, vcache is flushed before unmapping and tlb + * after. + */ +static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, + bool flush) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + + /* + * Each flushing trial can be very expensive, issue flush on + * the whole region at once rather than doing it for each cpu. + * This could be an overkill but is more scalable. 
+ */ + if (flush) + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + + for_each_possible_cpu(cpu) + unmap_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT); + + /* ditto as flush_cache_vunmap() */ + if (flush) + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off, + size_t size, bool flush) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int unmap_start = -1; + int uninitialized_var(unmap_end); + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + if (!*pagep) + continue; + + __free_page(*pagep); + + /* + * If it's partial depopulation, it might get + * populated or depopulated again. Mark the + * page gone. + */ + *pagep = NULL; + + unmap_start = unmap_start < 0 ? i : unmap_start; + unmap_end = i + 1; + } + } + + if (unmap_start >= 0) + pcpu_unmap(chunk, unmap_start, unmap_end, flush); +} + +/** + * pcpu_map - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. + * vcache is flushed afterwards. + */ +static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + int err; + + for_each_possible_cpu(cpu) { + err = map_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT, + PAGE_KERNEL, + pcpu_chunk_pagep(chunk, cpu, page_start)); + if (err < 0) + return err; + } + + /* flush at once, please read comments in pcpu_unmap() */ + flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + return 0; +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @off: offset to the area to populate + * @size: size of the area to populate + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. The area is cleared on return. + */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int map_start = -1; + int map_end; + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + if (pcpu_chunk_page_occupied(chunk, i)) { + if (map_start >= 0) { + if (pcpu_map(chunk, map_start, map_end)) + goto err; + map_start = -1; + } + continue; + } + + map_start = map_start < 0 ? 
i : map_start; + map_end = i + 1; + + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + *pagep = alloc_pages_node(cpu_to_node(cpu), + alloc_mask, 0); + if (!*pagep) + goto err; + } + } + + if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) + goto err; + + for_each_possible_cpu(cpu) + memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0, + size); + + return 0; +err: + /* likely under heavy memory pressure, give memory back */ + pcpu_depopulate_chunk(chunk, off, size, true); + return -ENOMEM; +} + +static void free_pcpu_chunk(struct pcpu_chunk *chunk) +{ + if (!chunk) + return; + if (chunk->vm) + free_vm_area(chunk->vm); + pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); + kfree(chunk); +} + +static struct pcpu_chunk *alloc_pcpu_chunk(void) +{ + struct pcpu_chunk *chunk; + + chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = pcpu_unit_size; + + chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + if (!chunk->vm) { + free_pcpu_chunk(chunk); + return NULL; + } + + INIT_LIST_HEAD(&chunk->list); + chunk->free_size = pcpu_unit_size; + chunk->contig_hint = pcpu_unit_size; + + return chunk; +} + +/** + * __alloc_percpu - allocate percpu area + * @size: size of area to allocate + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + void *ptr = NULL; + struct pcpu_chunk *chunk; + int slot, off; + + if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT || + align > PAGE_SIZE)) { + WARN(true, "illegal size (%zu) or align (%zu) for " + "percpu allocation\n", size, align); + return NULL; + } + + mutex_lock(&pcpu_mutex); + + /* allocate area */ + for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + if (size > chunk->contig_hint) + continue; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + if (off != -ENOSPC) + goto out_unlock; + } + } + + /* hmmm... no space left, create a new chunk */ + chunk = alloc_pcpu_chunk(); + if (!chunk) + goto out_unlock; + pcpu_chunk_relocate(chunk, -1); + pcpu_chunk_addr_insert(chunk); + + off = pcpu_alloc_area(chunk, size, align); + if (off < 0) + goto out_unlock; + +area_found: + /* populate, map and clear the area */ + if (pcpu_populate_chunk(chunk, off, size)) { + pcpu_free_area(chunk, off); + goto out_unlock; + } + + ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); +out_unlock: + mutex_unlock(&pcpu_mutex); + return ptr; +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +static void pcpu_kill_chunk(struct pcpu_chunk *chunk) +{ + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + list_del(&chunk->list); + rb_erase(&chunk->rb_node, &pcpu_addr_root); + free_pcpu_chunk(chunk); +} + +/** + * free_percpu - free percpu area + * @ptr: pointer to area to free + * + * Free percpu area @ptr. Might sleep. 
+ */ +void free_percpu(void *ptr) +{ + void *addr = __pcpu_ptr_to_addr(ptr); + struct pcpu_chunk *chunk; + int off; + + if (!ptr) + return; + + mutex_lock(&pcpu_mutex); + + chunk = pcpu_chunk_addr_search(addr); + off = addr - chunk->vm->addr; + + pcpu_free_area(chunk, off); + + /* the chunk became fully free, kill one if there are other free ones */ + if (chunk->free_size == pcpu_unit_size) { + struct pcpu_chunk *pos; + + list_for_each_entry(pos, + &pcpu_slot[pcpu_chunk_slot(chunk)], list) + if (pos != chunk) { + pcpu_kill_chunk(pos); + break; + } + } + + mutex_unlock(&pcpu_mutex); +} +EXPORT_SYMBOL_GPL(free_percpu); + +/** + * pcpu_setup_static - initialize kernel static percpu area + * @populate_pte_fn: callback to allocate pagetable + * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages + * + * Initialize kernel static percpu area. The caller should allocate + * all the necessary pages and pass them in @pages. + * @populate_pte_fn() is called on each page to be used for percpu + * mapping and is responsible for making sure all the necessary page + * tables for the page is allocated. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access. + */ +size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, + struct page **pages, size_t cpu_size) +{ + static struct vm_struct static_vm; + struct pcpu_chunk *static_chunk; + int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE); + unsigned int cpu; + int err, i; + + pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT, + order_base_2(cpu_size) - PAGE_SHIFT); + + pcpu_static_size = cpu_size; + pcpu_unit_pages = 1 << pcpu_unit_pages_shift; + pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift; + pcpu_unit_size = 1 << pcpu_unit_shift; + pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + (1 << pcpu_unit_pages_shift) * sizeof(struct page *); + + /* allocate chunk slots */ + pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + for (i = 0; i < pcpu_nr_slots; i++) + INIT_LIST_HEAD(&pcpu_slot[i]); + + /* init and register vm area */ + static_vm.flags = VM_ALLOC; + static_vm.size = pcpu_chunk_size; + vm_area_register_early(&static_vm); + + /* init static_chunk */ + static_chunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&static_chunk->list); + static_chunk->vm = &static_vm; + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + static_chunk->contig_hint = static_chunk->free_size; + + /* assign pages and map them */ + for_each_possible_cpu(cpu) { + for (i = 0; i < nr_cpu_pages; i++) { + *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++; + populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i)); + } + } + + err = pcpu_map(static_chunk, 0, nr_cpu_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", err); + + /* link static_chunk in */ + pcpu_chunk_relocate(static_chunk, -1); + pcpu_chunk_addr_insert(static_chunk); + + /* we're done */ + pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); + return pcpu_unit_size; +} -- cgit v1.2.3-71-gd317 From 59089d8d162ddcb5c434672e915331964d38a754 Mon Sep 17 00:00:00 2001 From: Santwona Behera Date: Fri, 20 Feb 2009 00:58:13 -0800 Subject: ethtool: Add RX pkt classification interface Signed-off-by: Santwona Behera Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 89 +++++++++++++++++++++++++++++++++++++++++++++---- net/core/ethtool.c | 58 ++++++++++++++++++++++++++------ 2 files changed, 131 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 27c67a542235..131b127b70f8 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -7,6 +7,7 @@ * Portions Copyright 2002 Intel (eli.kupermann@intel.com, * christopher.leech@intel.com, * scott.feldman@intel.com) + * Portions Copyright (C) Sun Microsystems 2008 */ #ifndef _LINUX_ETHTOOL_H @@ -287,10 +288,75 @@ enum ethtool_flags { ETH_FLAG_LRO = (1 << 15), /* LRO is enabled */ }; -struct ethtool_rxnfc { - __u32 cmd; +/* The following structures are for supporting RX network flow + * classification configuration. Note, all multibyte fields, e.g., + * ip4src, ip4dst, psrc, pdst, spi, etc. are expected to be in network + * byte order. + */ +struct ethtool_tcpip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be16 psrc; + __be16 pdst; + __u8 tos; +}; + +struct ethtool_ah_espip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 spi; + __u8 tos; +}; + +struct ethtool_rawip4_spec { + __be32 ip4src; + __be32 ip4dst; + __u8 hdata[64]; +}; + +struct ethtool_ether_spec { + __be16 ether_type; + __u8 frame_size; + __u8 eframe[16]; +}; + +#define ETH_RX_NFC_IP4 1 +#define ETH_RX_NFC_IP6 2 + +struct ethtool_usrip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 l4_4_bytes; + __u8 tos; + __u8 ip_ver; + __u8 proto; +}; + +struct ethtool_rx_flow_spec { __u32 flow_type; - __u64 data; + union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_rawip4_spec raw_ip4_spec; + struct ethtool_ether_spec ether_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + __u8 hdata[64]; + } h_u, m_u; /* entry, mask */ + __u64 ring_cookie; + __u32 location; +}; + +struct ethtool_rxnfc { + __u32 cmd; + __u32 flow_type; + /* The rx flow hash value or the rule DB size */ + __u64 data; + struct ethtool_rx_flow_spec fs; + __u32 rule_cnt; + __u32 rule_locs[0]; }; #ifdef __KERNEL__ @@ -417,8 +483,8 @@ struct ethtool_ops { /* the following hooks are obsolete */ int (*self_test_count)(struct net_device *);/* use get_sset_count */ int (*get_stats_count)(struct net_device *);/* use get_sset_count */ - int (*get_rxhash)(struct net_device *, struct ethtool_rxnfc *); - int (*set_rxhash)(struct net_device *, struct ethtool_rxnfc *); + int (*get_rxnfc)(struct net_device *, struct ethtool_rxnfc *, void *); + int (*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *); }; #endif /* __KERNEL__ */ @@ -469,6 +535,12 @@ struct ethtool_ops { #define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ #define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */ #define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */ +#define ETHTOOL_GRXRINGS 0x0000002d /* Get RX rings available for LB */ +#define ETHTOOL_GRXCLSRLCNT 0x0000002e /* Get RX class rule count */ +#define ETHTOOL_GRXCLSRULE 0x0000002f /* Get RX classification rule */ +#define ETHTOOL_GRXCLSRLALL 0x00000030 /* Get all RX classification rule */ +#define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */ +#define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET @@ -565,9 
+637,13 @@ struct ethtool_ops { #define UDP_V6_FLOW 0x06 #define SCTP_V6_FLOW 0x07 #define AH_ESP_V6_FLOW 0x08 +#define AH_V4_FLOW 0x09 +#define ESP_V4_FLOW 0x0a +#define AH_V6_FLOW 0x0b +#define ESP_V6_FLOW 0x0c +#define IP_USER_FLOW 0x0d /* L3-L4 network traffic flow hash options */ -#define RXH_DEV_PORT (1 << 0) #define RXH_L2DA (1 << 1) #define RXH_VLAN (1 << 2) #define RXH_L3_PROTO (1 << 3) @@ -577,5 +653,6 @@ struct ethtool_ops { #define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ #define RXH_DISCARD (1 << 31) +#define RX_CLS_FLOW_DISC 0xffffffffffffffffULL #endif /* _LINUX_ETHTOOL_H */ diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 947710a36ced..244ca56dffac 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -209,34 +209,62 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) return 0; } -static int ethtool_set_rxhash(struct net_device *dev, void __user *useraddr) +static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc cmd; - if (!dev->ethtool_ops->set_rxhash) + if (!dev->ethtool_ops->set_rxnfc) return -EOPNOTSUPP; if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; - return dev->ethtool_ops->set_rxhash(dev, &cmd); + return dev->ethtool_ops->set_rxnfc(dev, &cmd); } -static int ethtool_get_rxhash(struct net_device *dev, void __user *useraddr) +static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc info; + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; - if (!dev->ethtool_ops->get_rxhash) + if (!ops->get_rxnfc) return -EOPNOTSUPP; if (copy_from_user(&info, useraddr, sizeof(info))) return -EFAULT; - dev->ethtool_ops->get_rxhash(dev, &info); + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + rule_buf = kmalloc(info.rule_cnt * sizeof(u32), + GFP_USER); + if (!rule_buf) + return -ENOMEM; + } + } + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ret = -EFAULT; if (copy_to_user(useraddr, &info, sizeof(info))) - return -EFAULT; - return 0; + goto err_out; + + if (rule_buf) { + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + if (copy_to_user(useraddr, rule_buf, + info.rule_cnt * sizeof(u32))) + goto err_out; + } + ret = 0; + +err_out: + if (rule_buf) + kfree(rule_buf); + + return ret; } static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) @@ -901,6 +929,10 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: break; default: if (!capable(CAP_NET_ADMIN)) @@ -1052,10 +1084,16 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) dev->ethtool_ops->set_priv_flags); break; case ETHTOOL_GRXFH: - rc = ethtool_get_rxhash(dev, useraddr); + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, useraddr); break; case ETHTOOL_SRXFH: - rc = ethtool_set_rxhash(dev, useraddr); + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, useraddr); break; case ETHTOOL_GGRO: rc = ethtool_get_gro(dev, useraddr); -- cgit v1.2.3-71-gd317 From be0c22a46cfb79ab2342bb28fde99afa94ef868e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Feb 2009 01:40:43 +0000 Subject: netlink: add NETLINK_BROADCAST_ERROR socket option This patch adds 
NETLINK_BROADCAST_ERROR which is a netlink socket option that the listener can set to make netlink_broadcast() return errors in the delivery to the caller. This option is useful if the caller of netlink_broadcast() do something with the result of the message delivery, like in ctnetlink where it drops a network packet if the event delivery failed, this is used to enable reliable logging and state-synchronization. If this socket option is not set, netlink_broadcast() only reports ESRCH errors and silently ignore ENOBUFS errors, which is what most netlink_broadcast() callers should do. This socket option is based on a suggestion from Patrick McHardy. Patrick McHardy can exchange this patch for a beer from me ;). Signed-off-by: Pablo Neira Ayuso Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netlink.h | 1 + net/netlink/af_netlink.c | 25 +++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 51b09a1f46c3..1e6bf995435c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -103,6 +103,7 @@ struct nlmsgerr #define NETLINK_ADD_MEMBERSHIP 1 #define NETLINK_DROP_MEMBERSHIP 2 #define NETLINK_PKTINFO 3 +#define NETLINK_BROADCAST_ERROR 4 struct nl_pktinfo { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6ee69c27f806..ed587be1e1c2 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -85,6 +85,7 @@ struct netlink_sock { #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 +#define NETLINK_BROADCAST_SEND_ERROR 0x4 static inline struct netlink_sock *nlk_sk(struct sock *sk) { @@ -995,12 +996,15 @@ static inline int do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; } else if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { netlink_overrun(sk); - p->delivery_failure = 1; + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; @@ -1048,7 +1052,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, if (info.skb2) kfree_skb(info.skb2); - if (info.delivery_failure || info.failure) + if (info.delivery_failure) return -ENOBUFS; if (info.delivered) { @@ -1163,6 +1167,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, err = 0; break; } + case NETLINK_BROADCAST_ERROR: + if (val) + nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + else + nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1195,6 +1206,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_BROADCAST_ERROR: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 
1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } -- cgit v1.2.3-71-gd317 From 784544739a25c30637397ace5489eeb6e15d7d49 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 20 Feb 2009 10:35:32 +0100 Subject: netfilter: iptables: lock free counters The reader/writer lock in ip_tables is acquired in the critical path of processing packets and is one of the reasons just loading iptables can cause a 20% performance loss. The rwlock serves two functions: 1) it prevents changes to table state (xt_replace) while table is in use. This is now handled by doing rcu on the xt_table. When table is replaced, the new table(s) are put in and the old one table(s) are freed after RCU period. 2) it provides synchronization when accesing the counter values. This is now handled by swapping in new table_info entries for each cpu then summing the old values, and putting the result back onto one cpu. On a busy system it may cause sampling to occur at different times on each cpu, but no packet/byte counts are lost in the process. Signed-off-by: Stephen Hemminger Sucessfully tested on my dual quad core machine too, but iptables only (no ipv6 here) BTW, my new "tbench 8" result is 2450 MB/s, (it was 2150 MB/s not so long ago) Acked-by: Eric Dumazet Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 6 +- net/ipv4/netfilter/arp_tables.c | 115 ++++++++++++++++++++++++++--------- net/ipv4/netfilter/ip_tables.c | 120 +++++++++++++++++++++++++++---------- net/ipv6/netfilter/ip6_tables.c | 119 +++++++++++++++++++++++++----------- net/netfilter/x_tables.c | 26 ++++++-- 5 files changed, 284 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 9fac88fc0e72..e8e08d036752 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -353,7 +353,7 @@ struct xt_table unsigned int valid_hooks; /* Lock for the curtain */ - rwlock_t lock; + struct mutex lock; /* Man behind the curtain... */ struct xt_table_info *private; @@ -385,7 +385,7 @@ struct xt_table_info /* ipt_entry tables: one per CPU */ /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */ - char *entries[1]; + void *entries[1]; }; #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \ @@ -432,6 +432,8 @@ extern void xt_proto_fini(struct net *net, u_int8_t af); extern struct xt_table_info *xt_alloc_table_info(unsigned int size); extern void xt_free_table_info(struct xt_table_info *info); +extern void xt_table_entry_swap_rcu(struct xt_table_info *old, + struct xt_table_info *new); #ifdef CONFIG_COMPAT #include diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b5db46342614..64a7c6ce0b98 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -261,9 +261,10 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? 
out->name : nulldevname; - read_lock_bh(&table->lock); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -335,7 +336,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - read_unlock_bh(&table->lock); + + rcu_read_unlock(); if (hotdrop) return NF_DROP; @@ -738,11 +740,65 @@ static void get_counters(const struct xt_table_info *t, } } -static inline struct xt_counters *alloc_counters(struct xt_table *table) + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + ARPT_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + +static inline int +zero_entry_counter(struct arpt_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } +} + +static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -752,14 +808,30 @@ static inline struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; + + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; - /* First, sum counters... */ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + clone_counters(info, private); + + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); return counters; + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1099,20 +1171,6 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
- */ -static inline int add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { @@ -1172,13 +1230,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[smp_processor_id()]; @@ -1187,8 +1246,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); + xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ef8b6ca068b2..08cde5bd70a5 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb, mtpar.family = tgpar.family = NFPROTO_IPV4; tgpar.hooknum = hook; - read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb, } } while (!hotdrop); - read_unlock_bh(&table->lock); + rcu_read_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -924,13 +926,68 @@ get_counters(const struct xt_table_info *t, counters, &i); } + +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + IPT_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + + +static inline int +zero_entry_counter(struct ipt_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } } static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -939,14 +996,30 @@ static struct xt_counters * alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; - /* First, sum counters... */ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); + + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); return counters; + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int @@ -1312,27 +1385,6 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ -#if 0 - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", - *i, - (long unsigned int)e->counters.pcnt, - (long unsigned int)e->counters.bcnt, - (long unsigned int)addme[*i].pcnt, - (long unsigned int)addme[*i].bcnt); -#endif - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1393,13 +1445,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; @@ -1408,8 +1461,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index d64594b6c061..34af7bb8df5f 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -382,10 +382,12 @@ ip6t_do_table(struct sk_buff *skb, mtpar.family = tgpar.family = NFPROTO_IPV6; tgpar.hooknum = hook; - read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ @@ -483,7 +485,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - read_unlock_bh(&table->lock); + rcu_read_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -964,11 +966,64 @@ get_counters(const struct xt_table_info *t, } } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + IP6T_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + +static inline int +zero_entry_counter(struct ip6t_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } +} + static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -977,14 +1032,28 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; + + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); + + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); - /* First, sum counters... */ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + xt_free_table_info(info); - return counters; + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int @@ -1351,28 +1420,6 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static inline int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ -#if 0 - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", - *i, - (long unsigned int)e->counters.pcnt, - (long unsigned int)e->counters.bcnt, - (long unsigned int)addme[*i].pcnt, - (long unsigned int)addme[*i].bcnt); -#endif - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1433,13 +1480,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; @@ -1448,8 +1496,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index bfbf521f6ea5..bfcac92d5563 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -625,6 +625,20 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); +void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, + struct xt_table_info *newinfo) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + void *p = oldinfo->entries[cpu]; + rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); + newinfo->entries[cpu] = p; + } + +} +EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); + /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -671,21 +685,22 @@ xt_replace_table(struct xt_table *table, struct xt_table_info *oldinfo, *private; /* Do the substitution. */ - write_lock_bh(&table->lock); + mutex_lock(&table->lock); private = table->private; /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - write_unlock_bh(&table->lock); + mutex_unlock(&table->lock); *error = -EAGAIN; return NULL; } oldinfo = private; - table->private = newinfo; + rcu_assign_pointer(table->private, newinfo); newinfo->initial_entries = oldinfo->initial_entries; - write_unlock_bh(&table->lock); + mutex_unlock(&table->lock); + synchronize_net(); return oldinfo; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -719,7 +734,8 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table, /* Simplifies replace_table code. */ table->private = bootstrap; - rwlock_init(&table->lock); + mutex_init(&table->lock); + if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; -- cgit v1.2.3-71-gd317 From 268cb38e1802db560c73167e643f14a3dcb4b07c Mon Sep 17 00:00:00 2001 From: Adam Nielsen Date: Fri, 20 Feb 2009 10:55:14 +0100 Subject: netfilter: x_tables: add LED trigger target Kernel module providing implementation of LED netfilter target. 
Each instance of the target appears as a led-trigger device, which can be associated with one or more LEDs in /sys/class/leds/ Signed-off-by: Adam Nielsen Acked-by: Richard Purdie Signed-off-by: Patrick McHardy --- drivers/leds/Kconfig | 3 + include/linux/netfilter/Kbuild | 1 + include/linux/netfilter/xt_LED.h | 13 ++++ net/netfilter/Kconfig | 24 ++++++ net/netfilter/Makefile | 1 + net/netfilter/xt_LED.c | 161 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 203 insertions(+) create mode 100644 include/linux/netfilter/xt_LED.h create mode 100644 net/netfilter/xt_LED.c (limited to 'include/linux') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 742713611bc5..556aeca0d860 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -223,4 +223,7 @@ config LEDS_TRIGGER_DEFAULT_ON This allows LEDs to be initialised in the ON state. If unsure, say Y. +comment "iptables trigger is under Netfilter config (LED target)" + depends on LEDS_TRIGGERS + endif # NEW_LEDS diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 5a8af875bce2..deeaee5c83f2 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -7,6 +7,7 @@ header-y += xt_CLASSIFY.h header-y += xt_CONNMARK.h header-y += xt_CONNSECMARK.h header-y += xt_DSCP.h +header-y += xt_LED.h header-y += xt_MARK.h header-y += xt_NFLOG.h header-y += xt_NFQUEUE.h diff --git a/include/linux/netfilter/xt_LED.h b/include/linux/netfilter/xt_LED.h new file mode 100644 index 000000000000..4c91a0d770d0 --- /dev/null +++ b/include/linux/netfilter/xt_LED.h @@ -0,0 +1,13 @@ +#ifndef _XT_LED_H +#define _XT_LED_H + +struct xt_led_info { + char id[27]; /* Unique ID for this trigger in the LED class */ + __u8 always_blink; /* Blink even if the LED is already on */ + __u32 delay; /* Delay until LED is switched off after trigger */ + + /* Kernel data used in the module */ + void *internal_data __attribute__((aligned(8))); +}; + +#endif /* _XT_LED_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0eb98b4fbf44..cdbaaff6d0d6 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -372,6 +372,30 @@ config NETFILTER_XT_TARGET_HL since you can easily create immortal packets that loop forever on the network. +config NETFILTER_XT_TARGET_LED + tristate '"LED" target support' + depends on LEDS_CLASS + depends on NETFILTER_ADVANCED + help + This option adds a `LED' target, which allows you to blink LEDs in + response to particular packets passing through your machine. + + This can be used to turn a spare LED into a network activity LED, + which only flashes in response to FTP transfers, for example. Or + you could have an LED which lights up for a minute or two every time + somebody connects to your machine via SSH. + + You will need support for the "led" class to make this work. 
+ + To create an LED trigger for incoming SSH traffic: + iptables -A INPUT -p tcp --dport 22 -j LED --led-trigger-id ssh --led-delay 1000 + + Then attach the new trigger to an LED on your system: + echo netfilter-ssh > /sys/class/leds//trigger + + For more information on the LEDs available on your system, see + Documentation/leds-class.txt + config NETFILTER_XT_TARGET_MARK tristate '"MARK" target support' default m if NETFILTER_ADVANCED=n diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index da73ed25701c..7a9b8397573a 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o +obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c new file mode 100644 index 000000000000..8ff7843bb921 --- /dev/null +++ b/net/netfilter/xt_LED.c @@ -0,0 +1,161 @@ +/* + * xt_LED.c - netfilter target to make LEDs blink upon packet matches + * + * Copyright (C) 2008 Adam Nielsen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA. + * + */ + +#include +#include +#include +#include +#include + +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Adam Nielsen "); +MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match"); + +/* + * This is declared in here (the kernel module) only, to avoid having these + * dependencies in userspace code. This is what xt_led_info.internal_data + * points to. + */ +struct xt_led_info_internal { + struct led_trigger netfilter_led_trigger; + struct timer_list timer; +}; + +static unsigned int +led_tg(struct sk_buff *skb, const struct xt_target_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + /* + * If "always blink" is enabled, and there's still some time until the + * LED will switch off, briefly switch it off now. 
+ */ + if ((ledinfo->delay > 0) && ledinfo->always_blink && + timer_pending(&ledinternal->timer)) + led_trigger_event(&ledinternal->netfilter_led_trigger,LED_OFF); + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL); + + /* If there's a positive delay, start/update the timer */ + if (ledinfo->delay > 0) { + mod_timer(&ledinternal->timer, + jiffies + msecs_to_jiffies(ledinfo->delay)); + + /* Otherwise if there was no delay given, blink as fast as possible */ + } else if (ledinfo->delay == 0) { + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); + } + + /* else the delay is negative, which means switch on and stay on */ + + return XT_CONTINUE; +} + +static void led_timeout_callback(unsigned long data) +{ + struct xt_led_info *ledinfo = (struct xt_led_info *)data; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); +} + +static bool led_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal; + int err; + + if (ledinfo->id[0] == '\0') { + printk(KERN_ERR KBUILD_MODNAME ": No 'id' parameter given.\n"); + return false; + } + + ledinternal = kzalloc(sizeof(struct xt_led_info_internal), GFP_KERNEL); + if (!ledinternal) { + printk(KERN_CRIT KBUILD_MODNAME ": out of memory\n"); + return false; + } + + ledinternal->netfilter_led_trigger.name = ledinfo->id; + + err = led_trigger_register(&ledinternal->netfilter_led_trigger); + if (err) { + printk(KERN_CRIT KBUILD_MODNAME + ": led_trigger_register() failed\n"); + if (err == -EEXIST) + printk(KERN_ERR KBUILD_MODNAME + ": Trigger name is already in use.\n"); + goto exit_alloc; + } + + /* See if we need to set up a timer */ + if (ledinfo->delay > 0) + setup_timer(&ledinternal->timer, led_timeout_callback, + (unsigned long)ledinfo); + + ledinfo->internal_data = ledinternal; + + return true; + +exit_alloc: + kfree(ledinternal); + + return false; +} + +static void led_tg_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + if (ledinfo->delay > 0) + del_timer_sync(&ledinternal->timer); + + led_trigger_unregister(&ledinternal->netfilter_led_trigger); + kfree(ledinternal); +} + +static struct xt_target led_tg_reg __read_mostly = { + .name = "LED", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = led_tg, + .targetsize = XT_ALIGN(sizeof(struct xt_led_info)), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, +}; + +static int __init led_tg_init(void) +{ + return xt_register_target(&led_tg_reg); +} + +static void __exit led_tg_exit(void) +{ + xt_unregister_target(&led_tg_reg); +} + +module_init(led_tg_init); +module_exit(led_tg_exit); -- cgit v1.2.3-71-gd317 From ffadd4d0feb5376c82dc3a4104731b7ce2794edc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 17 Feb 2009 12:05:07 -0500 Subject: SLUB: Introduce and use SLUB_MAX_SIZE and SLUB_PAGE_SHIFT constants As a preparational patch to bump up page allocator pass-through threshold, introduce two new constants SLUB_MAX_SIZE and SLUB_PAGE_SHIFT and convert mm/slub.c to use them. 
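As a rough, standalone illustration of what the new constants are for (this is not code from the patch; the demo_* names and the 4 KiB page size are assumptions): SLUB_MAX_SIZE is the cutoff above which kmalloc() bypasses the kmalloc caches and goes straight to the page allocator, while SLUB_PAGE_SHIFT only sizes the kmalloc_caches[] array.

#include <stdio.h>

#define DEMO_PAGE_SHIFT      12                        /* assume 4 KiB pages */
#define DEMO_PAGE_SIZE       (1UL << DEMO_PAGE_SHIFT)
#define DEMO_SLUB_MAX_SIZE   (DEMO_PAGE_SIZE)          /* pass-through cutoff */
#define DEMO_SLUB_PAGE_SHIFT (DEMO_PAGE_SHIFT + 1)     /* sizes kmalloc_caches[] */

static const char *demo_kmalloc_path(unsigned long size)
{
	/* Anything above the cutoff skips the kmalloc caches entirely. */
	if (size > DEMO_SLUB_MAX_SIZE)
		return "page allocator (kmalloc_large)";
	return "SLUB fastpath (kmalloc_caches[])";
}

int main(void)
{
	printf("4096 bytes -> %s\n", demo_kmalloc_path(4096));
	printf("8192 bytes -> %s\n", demo_kmalloc_path(8192));
	return 0;
}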
Reported-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 19 ++++++++++++++++--- mm/slub.c | 16 ++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..986e09dcfd8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -120,11 +120,24 @@ struct kmem_cache { #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) +/* + * Maximum kmalloc object size handled by SLUB. Larger object allocations + * are passed through to the page allocator. The page allocator "fastpath" + * is relatively slow so we need this value sufficiently high so that + * performance critical objects are allocated through the SLUB fastpath. + * + * This should be dropped to PAGE_SIZE / 2 once the page allocator + * "fastpath" becomes competitive with the slab allocator fastpaths. + */ +#define SLUB_MAX_SIZE (PAGE_SIZE) + +#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 1) + /* * We keep the general caches in an array of slab caches that are used for * 2^x bytes of allocations. */ -extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; +extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; /* * Sorry that the following has to be that ugly but some versions of GCC @@ -212,7 +225,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { - if (size > PAGE_SIZE) + if (size > SLUB_MAX_SIZE) return kmalloc_large(size, flags); if (!(flags & SLUB_DMA)) { @@ -234,7 +247,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && - size <= PAGE_SIZE && !(flags & SLUB_DMA)) { + size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); if (!s) diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..5a5e7f5bf799 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2475,7 +2475,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2537,7 +2537,7 @@ panic: } #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2658,7 +2658,7 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2686,7 +2686,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, flags, node); s = get_slab(size, flags); @@ -2985,7 +2985,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3022,7 +3022,7 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide 
the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); @@ -3222,7 +3222,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3238,7 +3238,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); -- cgit v1.2.3-71-gd317 From 51735a7ca67531267a27b57e5fe20f7815192f9c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 20 Feb 2009 12:21:33 +0200 Subject: SLUB: Do not pass 8k objects through to the page allocator Increase the maximum object size in SLUB so that 8k objects are not passed through to the page allocator anymore. The network stack uses 8k objects for performance critical operations. The patch is motivated by a SLAB vs. SLUB regression in the netperf benchmark. The problem is that the kfree(skb->head) call in skb_release_data() that is subject to page allocator pass-through as the size passed to __alloc_skb() is larger than 4 KB in this test. As explained by Yanmin Zhang: I use 2.6.29-rc2 kernel to run netperf UDP-U-4k CPU_NUM client/server pair loopback testing on x86-64 machines. Comparing with SLUB, SLAB's result is about 2.3 times of SLUB's. After applying the reverting patch, the result difference between SLUB and SLAB becomes 1% which we might consider as fluctuation. [ penberg@cs.helsinki.fi: fix oops in kmalloc() ] Reported-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 986e09dcfd8f..e217a7a68ea7 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -129,9 +129,9 @@ struct kmem_cache { * This should be dropped to PAGE_SIZE / 2 once the page allocator * "fastpath" becomes competitive with the slab allocator fastpaths. */ -#define SLUB_MAX_SIZE (PAGE_SIZE) +#define SLUB_MAX_SIZE (2 * PAGE_SIZE) -#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 1) +#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 2) /* * We keep the general caches in an array of slab caches that are used for -- cgit v1.2.3-71-gd317 From fe1200b63d158b28eef6d4de1e5b5f99c681ba2f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 17 Feb 2009 12:05:07 -0500 Subject: SLUB: Introduce and use SLUB_MAX_SIZE and SLUB_PAGE_SHIFT constants As a preparational patch to bump up page allocator pass-through threshold, introduce two new constants SLUB_MAX_SIZE and SLUB_PAGE_SHIFT and convert mm/slub.c to use them. 
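Assuming 4 KiB pages (PAGE_SHIFT = 12), the threshold bump this change prepares for (applied in 51735a7c above) works out as in the snippet below; it is purely illustrative arithmetic, not code from either patch.

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12, page_size = 1UL << page_shift;

	/* As introduced here: cutoff at one page. */
	printf("before: SLUB_MAX_SIZE=%lu, SLUB_PAGE_SHIFT=%lu\n",
	       page_size, page_shift + 1);		/* 4096, 13 */
	/* After the bump to 2 * PAGE_SIZE and PAGE_SHIFT + 2: */
	printf("after:  SLUB_MAX_SIZE=%lu, SLUB_PAGE_SHIFT=%lu\n",
	       2 * page_size, page_shift + 2);		/* 8192, 14 */
	/* So an 8192-byte skb->head allocation stays on the SLUB fastpath. */
	return 0;
}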
Reported-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 19 ++++++++++++++++--- mm/slub.c | 16 ++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..986e09dcfd8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -120,11 +120,24 @@ struct kmem_cache { #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) +/* + * Maximum kmalloc object size handled by SLUB. Larger object allocations + * are passed through to the page allocator. The page allocator "fastpath" + * is relatively slow so we need this value sufficiently high so that + * performance critical objects are allocated through the SLUB fastpath. + * + * This should be dropped to PAGE_SIZE / 2 once the page allocator + * "fastpath" becomes competitive with the slab allocator fastpaths. + */ +#define SLUB_MAX_SIZE (PAGE_SIZE) + +#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 1) + /* * We keep the general caches in an array of slab caches that are used for * 2^x bytes of allocations. */ -extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; +extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; /* * Sorry that the following has to be that ugly but some versions of GCC @@ -212,7 +225,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { - if (size > PAGE_SIZE) + if (size > SLUB_MAX_SIZE) return kmalloc_large(size, flags); if (!(flags & SLUB_DMA)) { @@ -234,7 +247,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && - size <= PAGE_SIZE && !(flags & SLUB_DMA)) { + size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); if (!s) diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..5a5e7f5bf799 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2475,7 +2475,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2537,7 +2537,7 @@ panic: } #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2658,7 +2658,7 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2686,7 +2686,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, flags, node); s = get_slab(size, flags); @@ -2985,7 +2985,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3022,7 +3022,7 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide 
the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); @@ -3222,7 +3222,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3238,7 +3238,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); -- cgit v1.2.3-71-gd317 From 6503e5df08008b9a47022b5e9ebba658c8fa69af Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 27 Nov 2008 17:48:13 +0000 Subject: thermal: use integers rather than strings for thermal values The thermal API currently uses strings to pass values to userspace. This makes it difficult to use from within the kernel. Change the interface to use integers and fix up the consumers. Signed-off-by: Matthew Garrett Acked-by: Zhang Rui Acked-by: Thomas Renninger Signed-off-by: Len Brown --- drivers/acpi/fan.c | 20 ++++---- drivers/acpi/processor_thermal.c | 20 ++++---- drivers/acpi/thermal.c | 80 ++++++++++++++++++++------------ drivers/acpi/video.c | 22 +++++---- drivers/platform/x86/intel_menlow.c | 29 ++++-------- drivers/thermal/thermal_sys.c | 91 +++++++++++++++++++++++++++++++------ include/linux/thermal.h | 32 +++++++++---- 7 files changed, 198 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index eaaee1660bdf..ae41cf3cf4e5 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -68,31 +68,35 @@ static struct acpi_driver acpi_fan_driver = { }; /* thermal cooling device callbacks */ -static int fan_get_max_state(struct thermal_cooling_device *cdev, char *buf) +static int fan_get_max_state(struct thermal_cooling_device *cdev, unsigned long + *state) { /* ACPI fan device only support two states: ON/OFF */ - return sprintf(buf, "1\n"); + *state = 1; + return 0; } -static int fan_get_cur_state(struct thermal_cooling_device *cdev, char *buf) +static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long + *state) { struct acpi_device *device = cdev->devdata; - int state; int result; + int acpi_state; if (!device) return -EINVAL; - result = acpi_bus_get_power(device->handle, &state); + result = acpi_bus_get_power(device->handle, &acpi_state); if (result) return result; - return sprintf(buf, "%s\n", state == ACPI_STATE_D3 ? "0" : - (state == ACPI_STATE_D0 ? "1" : "unknown")); + *state = (acpi_state == ACPI_STATE_D3 ? 0 : + (acpi_state == ACPI_STATE_D0 ? 
1 : -1)); + return 0; } static int -fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned int state) +fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) { struct acpi_device *device = cdev->devdata; int result; diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index b1eb376fae45..0e47e299a9ac 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -373,7 +373,8 @@ static int acpi_processor_max_state(struct acpi_processor *pr) return max_state; } static int -processor_get_max_state(struct thermal_cooling_device *cdev, char *buf) +processor_get_max_state(struct thermal_cooling_device *cdev, + unsigned long *state) { struct acpi_device *device = cdev->devdata; struct acpi_processor *pr = acpi_driver_data(device); @@ -381,28 +382,29 @@ processor_get_max_state(struct thermal_cooling_device *cdev, char *buf) if (!device || !pr) return -EINVAL; - return sprintf(buf, "%d\n", acpi_processor_max_state(pr)); + *state = acpi_processor_max_state(pr); + return 0; } static int -processor_get_cur_state(struct thermal_cooling_device *cdev, char *buf) +processor_get_cur_state(struct thermal_cooling_device *cdev, + unsigned long *cur_state) { struct acpi_device *device = cdev->devdata; struct acpi_processor *pr = acpi_driver_data(device); - int cur_state; if (!device || !pr) return -EINVAL; - cur_state = cpufreq_get_cur_state(pr->id); + *cur_state = cpufreq_get_cur_state(pr->id); if (pr->flags.throttling) - cur_state += pr->throttling.state; - - return sprintf(buf, "%d\n", cur_state); + *cur_state += pr->throttling.state; + return 0; } static int -processor_set_cur_state(struct thermal_cooling_device *cdev, unsigned int state) +processor_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long state) { struct acpi_device *device = cdev->devdata; struct acpi_processor *pr = acpi_driver_data(device); diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 99e6f1f8ea45..1c410ef859c6 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -954,7 +954,8 @@ static void acpi_thermal_check(void *data) /* sys I/F for generic thermal sysfs support */ #define KELVIN_TO_MILLICELSIUS(t) (t * 100 - 273200) -static int thermal_get_temp(struct thermal_zone_device *thermal, char *buf) +static int thermal_get_temp(struct thermal_zone_device *thermal, + unsigned long *temp) { struct acpi_thermal *tz = thermal->devdata; int result; @@ -966,25 +967,28 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, char *buf) if (result) return result; - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS(tz->temperature)); + *temp = KELVIN_TO_MILLICELSIUS(tz->temperature); + return 0; } static const char enabled[] = "kernel"; static const char disabled[] = "user"; static int thermal_get_mode(struct thermal_zone_device *thermal, - char *buf) + enum thermal_device_mode *mode) { struct acpi_thermal *tz = thermal->devdata; if (!tz) return -EINVAL; - return sprintf(buf, "%s\n", tz->tz_enabled ? - enabled : disabled); + *mode = tz->tz_enabled ? 
THERMAL_DEVICE_ENABLED : + THERMAL_DEVICE_DISABLED; + + return 0; } static int thermal_set_mode(struct thermal_zone_device *thermal, - const char *buf) + enum thermal_device_mode mode) { struct acpi_thermal *tz = thermal->devdata; int enable; @@ -995,9 +999,9 @@ static int thermal_set_mode(struct thermal_zone_device *thermal, /* * enable/disable thermal management from ACPI thermal driver */ - if (!strncmp(buf, enabled, sizeof enabled - 1)) + if (mode == THERMAL_DEVICE_ENABLED) enable = 1; - else if (!strncmp(buf, disabled, sizeof disabled - 1)) + else if (mode == THERMAL_DEVICE_DISABLED) enable = 0; else return -EINVAL; @@ -1013,7 +1017,7 @@ static int thermal_set_mode(struct thermal_zone_device *thermal, } static int thermal_get_trip_type(struct thermal_zone_device *thermal, - int trip, char *buf) + int trip, enum thermal_trip_type *type) { struct acpi_thermal *tz = thermal->devdata; int i; @@ -1022,27 +1026,35 @@ static int thermal_get_trip_type(struct thermal_zone_device *thermal, return -EINVAL; if (tz->trips.critical.flags.valid) { - if (!trip) - return sprintf(buf, "critical\n"); + if (!trip) { + *type = THERMAL_TRIP_CRITICAL; + return 0; + } trip--; } if (tz->trips.hot.flags.valid) { - if (!trip) - return sprintf(buf, "hot\n"); + if (!trip) { + *type = THERMAL_TRIP_HOT; + return 0; + } trip--; } if (tz->trips.passive.flags.valid) { - if (!trip) - return sprintf(buf, "passive\n"); + if (!trip) { + *type = THERMAL_TRIP_PASSIVE; + return 0; + } trip--; } for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { - if (!trip) - return sprintf(buf, "active%d\n", i); + if (!trip) { + *type = THERMAL_TRIP_ACTIVE; + return 0; + } trip--; } @@ -1050,7 +1062,7 @@ static int thermal_get_trip_type(struct thermal_zone_device *thermal, } static int thermal_get_trip_temp(struct thermal_zone_device *thermal, - int trip, char *buf) + int trip, unsigned long *temp) { struct acpi_thermal *tz = thermal->devdata; int i; @@ -1059,31 +1071,39 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, return -EINVAL; if (tz->trips.critical.flags.valid) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.critical.temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.critical.temperature); + return 0; + } trip--; } if (tz->trips.hot.flags.valid) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.hot.temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.hot.temperature); + return 0; + } trip--; } if (tz->trips.passive.flags.valid) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.passive.temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.passive.temperature); + return 0; + } trip--; } for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.active[i].temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.active[i].temperature); + return 0; + } trip--; } diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index bb5ed059114a..5259d502add6 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -358,32 +358,36 @@ static struct output_properties acpi_output_properties = { /* thermal cooling device callbacks */ -static int video_get_max_state(struct thermal_cooling_device *cdev, char *buf) +static int video_get_max_state(struct thermal_cooling_device *cdev, unsigned + long 
*state) { struct acpi_device *device = cdev->devdata; struct acpi_video_device *video = acpi_driver_data(device); - return sprintf(buf, "%d\n", video->brightness->count - 3); + *state = video->brightness->count - 3; + return 0; } -static int video_get_cur_state(struct thermal_cooling_device *cdev, char *buf) +static int video_get_cur_state(struct thermal_cooling_device *cdev, unsigned + long *state) { struct acpi_device *device = cdev->devdata; struct acpi_video_device *video = acpi_driver_data(device); unsigned long long level; - int state; + int offset; acpi_video_device_lcd_get_level_current(video, &level); - for (state = 2; state < video->brightness->count; state++) - if (level == video->brightness->levels[state]) - return sprintf(buf, "%d\n", - video->brightness->count - state - 1); + for (offset = 2; offset < video->brightness->count; offset++) + if (level == video->brightness->levels[offset]) { + *state = video->brightness->count - offset - 1; + return 0; + } return -EINVAL; } static int -video_set_cur_state(struct thermal_cooling_device *cdev, unsigned int state) +video_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) { struct acpi_device *device = cdev->devdata; struct acpi_video_device *video = acpi_driver_data(device); diff --git a/drivers/platform/x86/intel_menlow.c b/drivers/platform/x86/intel_menlow.c index 27b7662955bb..29432a50be45 100644 --- a/drivers/platform/x86/intel_menlow.c +++ b/drivers/platform/x86/intel_menlow.c @@ -57,8 +57,8 @@ MODULE_LICENSE("GPL"); * In that case max_cstate would be n-1 * GTHS returning '0' would mean that no bandwidth control states are supported */ -static int memory_get_int_max_bandwidth(struct thermal_cooling_device *cdev, - unsigned long *max_state) +static int memory_get_max_bandwidth(struct thermal_cooling_device *cdev, + unsigned long *max_state) { struct acpi_device *device = cdev->devdata; acpi_handle handle = device->handle; @@ -83,22 +83,12 @@ static int memory_get_int_max_bandwidth(struct thermal_cooling_device *cdev, return 0; } -static int memory_get_max_bandwidth(struct thermal_cooling_device *cdev, - char *buf) -{ - unsigned long value; - if (memory_get_int_max_bandwidth(cdev, &value)) - return -EINVAL; - - return sprintf(buf, "%ld\n", value); -} - static int memory_get_cur_bandwidth(struct thermal_cooling_device *cdev, - char *buf) + unsigned long *value) { struct acpi_device *device = cdev->devdata; acpi_handle handle = device->handle; - unsigned long long value; + unsigned long long result; struct acpi_object_list arg_list; union acpi_object arg; acpi_status status = AE_OK; @@ -108,15 +98,16 @@ static int memory_get_cur_bandwidth(struct thermal_cooling_device *cdev, arg.type = ACPI_TYPE_INTEGER; arg.integer.value = MEMORY_ARG_CUR_BANDWIDTH; status = acpi_evaluate_integer(handle, MEMORY_GET_BANDWIDTH, - &arg_list, &value); + &arg_list, &result); if (ACPI_FAILURE(status)) return -EFAULT; - return sprintf(buf, "%llu\n", value); + *value = result; + return 0; } static int memory_set_cur_bandwidth(struct thermal_cooling_device *cdev, - unsigned int state) + unsigned long state) { struct acpi_device *device = cdev->devdata; acpi_handle handle = device->handle; @@ -126,7 +117,7 @@ static int memory_set_cur_bandwidth(struct thermal_cooling_device *cdev, unsigned long long temp; unsigned long max_state; - if (memory_get_int_max_bandwidth(cdev, &max_state)) + if (memory_get_max_bandwidth(cdev, &max_state)) return -EFAULT; if (state > max_state) @@ -142,7 +133,7 @@ static int memory_set_cur_bandwidth(struct 
thermal_cooling_device *cdev, &temp); printk(KERN_INFO - "Bandwidth value was %d: status is %d\n", state, status); + "Bandwidth value was %ld: status is %d\n", state, status); if (ACPI_FAILURE(status)) return -EFAULT; diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 8171ca17b936..bd139adc6d32 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -104,22 +104,36 @@ static ssize_t temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); + long temperature; + int ret; if (!tz->ops->get_temp) return -EPERM; - return tz->ops->get_temp(tz, buf); + ret = tz->ops->get_temp(tz, &temperature); + + if (ret) + return ret; + + return sprintf(buf, "%ld\n", temperature); } static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); + enum thermal_device_mode mode; + int result; if (!tz->ops->get_mode) return -EPERM; - return tz->ops->get_mode(tz, buf); + result = tz->ops->get_mode(tz, &mode); + if (result) + return result; + + return sprintf(buf, "%s\n", mode == THERMAL_DEVICE_ENABLED ? "enabled" + : "disabled"); } static ssize_t @@ -132,7 +146,13 @@ mode_store(struct device *dev, struct device_attribute *attr, if (!tz->ops->set_mode) return -EPERM; - result = tz->ops->set_mode(tz, buf); + if (!strncmp(buf, "enabled", sizeof("enabled"))) + result = tz->ops->set_mode(tz, THERMAL_DEVICE_ENABLED); + else if (!strncmp(buf, "disabled", sizeof("disabled"))) + result = tz->ops->set_mode(tz, THERMAL_DEVICE_DISABLED); + else + result = -EINVAL; + if (result) return result; @@ -144,7 +164,8 @@ trip_point_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - int trip; + enum thermal_trip_type type; + int trip, result; if (!tz->ops->get_trip_type) return -EPERM; @@ -152,7 +173,22 @@ trip_point_type_show(struct device *dev, struct device_attribute *attr, if (!sscanf(attr->attr.name, "trip_point_%d_type", &trip)) return -EINVAL; - return tz->ops->get_trip_type(tz, trip, buf); + result = tz->ops->get_trip_type(tz, trip, &type); + if (result) + return result; + + switch (type) { + case THERMAL_TRIP_CRITICAL: + return sprintf(buf, "critical"); + case THERMAL_TRIP_HOT: + return sprintf(buf, "hot"); + case THERMAL_TRIP_PASSIVE: + return sprintf(buf, "passive"); + case THERMAL_TRIP_ACTIVE: + return sprintf(buf, "active"); + default: + return sprintf(buf, "unknown"); + } } static ssize_t @@ -160,7 +196,8 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - int trip; + int trip, ret; + long temperature; if (!tz->ops->get_trip_temp) return -EPERM; @@ -168,7 +205,12 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip)) return -EINVAL; - return tz->ops->get_trip_temp(tz, trip, buf); + ret = tz->ops->get_trip_temp(tz, trip, &temperature); + + if (ret) + return ret; + + return sprintf(buf, "%ld\n", temperature); } static DEVICE_ATTR(type, 0444, type_show, NULL); @@ -236,8 +278,13 @@ thermal_cooling_device_max_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); + unsigned long state; + int ret; - return cdev->ops->get_max_state(cdev, buf); + ret = cdev->ops->get_max_state(cdev, &state); + if 
(ret) + return ret; + return sprintf(buf, "%ld\n", state); } static ssize_t @@ -245,8 +292,13 @@ thermal_cooling_device_cur_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); + unsigned long state; + int ret; - return cdev->ops->get_cur_state(cdev, buf); + ret = cdev->ops->get_cur_state(cdev, &state); + if (ret) + return ret; + return sprintf(buf, "%ld\n", state); } static ssize_t @@ -255,10 +307,10 @@ thermal_cooling_device_cur_state_store(struct device *dev, const char *buf, size_t count) { struct thermal_cooling_device *cdev = to_cooling_device(dev); - int state; + unsigned long state; int result; - if (!sscanf(buf, "%d\n", &state)) + if (!sscanf(buf, "%ld\n", &state)) return -EINVAL; if (state < 0) @@ -312,13 +364,20 @@ static DEVICE_ATTR(name, 0444, name_show, NULL); static ssize_t temp_input_show(struct device *dev, struct device_attribute *attr, char *buf) { + long temperature; + int ret; struct thermal_hwmon_attr *hwmon_attr = container_of(attr, struct thermal_hwmon_attr, attr); struct thermal_zone_device *tz = container_of(hwmon_attr, struct thermal_zone_device, temp_input); - return tz->ops->get_temp(tz, buf); + ret = tz->ops->get_temp(tz, &temperature); + + if (ret) + return ret; + + return sprintf(buf, "%ld\n", temperature); } static ssize_t @@ -330,8 +389,14 @@ temp_crit_show(struct device *dev, struct device_attribute *attr, struct thermal_zone_device *tz = container_of(hwmon_attr, struct thermal_zone_device, temp_crit); + long temperature; + int ret; + + ret = tz->ops->get_trip_temp(tz, 0, &temperature); + if (ret) + return ret; - return tz->ops->get_trip_temp(tz, 0, buf); + return sprintf(buf, "%ld\n", temperature); } diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 917707e6151d..4cb3292fb6e4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -31,23 +31,39 @@ struct thermal_zone_device; struct thermal_cooling_device; +enum thermal_device_mode { + THERMAL_DEVICE_DISABLED = 0, + THERMAL_DEVICE_ENABLED, +}; + +enum thermal_trip_type { + THERMAL_TRIP_ACTIVE = 0, + THERMAL_TRIP_PASSIVE, + THERMAL_TRIP_HOT, + THERMAL_TRIP_CRITICAL, +}; + struct thermal_zone_device_ops { int (*bind) (struct thermal_zone_device *, struct thermal_cooling_device *); int (*unbind) (struct thermal_zone_device *, struct thermal_cooling_device *); - int (*get_temp) (struct thermal_zone_device *, char *); - int (*get_mode) (struct thermal_zone_device *, char *); - int (*set_mode) (struct thermal_zone_device *, const char *); - int (*get_trip_type) (struct thermal_zone_device *, int, char *); - int (*get_trip_temp) (struct thermal_zone_device *, int, char *); + int (*get_temp) (struct thermal_zone_device *, unsigned long *); + int (*get_mode) (struct thermal_zone_device *, + enum thermal_device_mode *); + int (*set_mode) (struct thermal_zone_device *, + enum thermal_device_mode); + int (*get_trip_type) (struct thermal_zone_device *, int, + enum thermal_trip_type *); + int (*get_trip_temp) (struct thermal_zone_device *, int, + unsigned long *); int (*get_crit_temp) (struct thermal_zone_device *, unsigned long *); }; struct thermal_cooling_device_ops { - int (*get_max_state) (struct thermal_cooling_device *, char *); - int (*get_cur_state) (struct thermal_cooling_device *, char *); - int (*set_cur_state) (struct thermal_cooling_device *, unsigned int); + int (*get_max_state) (struct thermal_cooling_device *, unsigned long *); + int (*get_cur_state) (struct thermal_cooling_device *, 
unsigned long *); + int (*set_cur_state) (struct thermal_cooling_device *, unsigned long); }; #define THERMAL_TRIPS_NONE -1 -- cgit v1.2.3-71-gd317 From 000ab691172db3921efa3cb7f17fc79235a1de7f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 13:35:06 -0500 Subject: ftrace: allow archs to preform pre and post process for code modification This patch creates the weak functions: ftrace_arch_code_modify_prepare and ftrace_arch_code_modify_post_process that are called before and after the stop machine is called to modify the kernel text. If the arch needs to do pre or post processing, it only needs to define these functions. [ Update: Ingo Molnar suggested using the name ftrace_arch_code_modify_* over using ftrace_arch_modify_* ] Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +++ kernel/trace/ftrace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7e..fdb2a89ae543 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -99,6 +99,9 @@ stack_trace_sysctl(struct ctl_table *table, int write, /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +int ftrace_arch_code_modify_prepare(void); +int ftrace_arch_code_modify_post_process(void); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fdf913dfc7e8..72316d9647bd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -585,6 +585,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) return 1; } +/* + * archs can override this function if they must do something + * before the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_prepare(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * after the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_post_process(void) +{ + return 0; +} + static int __ftrace_modify_code(void *data) { int *command = data; @@ -607,7 +625,17 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { + int ret; + + ret = ftrace_arch_code_modify_prepare(); + FTRACE_WARN_ON(ret); + if (ret) + return; + stop_machine(__ftrace_modify_code, &command, NULL); + + ret = ftrace_arch_code_modify_post_process(); + FTRACE_WARN_ON(ret); } static ftrace_func_t saved_ftrace_func; -- cgit v1.2.3-71-gd317 From b1569e99c795bf83b4ddf41c4f1c42761ab7f75e Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 3 Dec 2008 17:55:32 +0000 Subject: ACPI: move thermal trip handling to generic thermal layer The ACPI code currently carries its own thermal trip handling, meaning that any other thermal implementation will need to reimplement it. Move the code to the generic thermal layer. 
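For a driver outside ACPI, the integer-based callbacks the generic layer now expects look roughly like the minimal sketch below. It only fills in the ops whose signatures appear in the thermal.h hunk earlier in this series; the demo_* names and temperature values are invented, and registration/binding code is omitted.

#include <linux/module.h>
#include <linux/thermal.h>

/* Hypothetical demo_* callbacks; values are made up for illustration. */
static int demo_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
{
	*temp = 45000;			/* millidegrees Celsius */
	return 0;
}

static int demo_get_trip_type(struct thermal_zone_device *tz, int trip,
			      enum thermal_trip_type *type)
{
	*type = trip ? THERMAL_TRIP_PASSIVE : THERMAL_TRIP_CRITICAL;
	return 0;
}

static int demo_get_trip_temp(struct thermal_zone_device *tz, int trip,
			      unsigned long *temp)
{
	*temp = trip ? 80000 : 100000;	/* 80 C passive, 100 C critical */
	return 0;
}

static struct thermal_zone_device_ops demo_thermal_ops = {
	.get_temp	= demo_get_temp,
	.get_trip_type	= demo_get_trip_type,
	.get_trip_temp	= demo_get_trip_temp,
};

Returning plain integers and enums this way lets the sysfs layer, the hwmon bridge and in-kernel trip handling share one formatting path instead of each parsing driver-produced strings.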
Signed-off-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/acpi/thermal.c | 458 +++++------------------------------------- drivers/thermal/thermal_sys.c | 188 ++++++++++++++++- include/linux/thermal.h | 15 +- 3 files changed, 248 insertions(+), 413 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 1c410ef859c6..0ec48d2f85c5 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -190,7 +189,6 @@ struct acpi_thermal { struct acpi_thermal_state state; struct acpi_thermal_trips trips; struct acpi_handle_list devices; - struct timer_list timer; struct thermal_zone_device *thermal_zone; int tz_enabled; struct mutex lock; @@ -290,6 +288,11 @@ static int acpi_thermal_set_polling(struct acpi_thermal *tz, int seconds) tz->polling_frequency = seconds * 10; /* Convert value to deci-seconds */ + tz->thermal_zone->polling_delay = seconds * 1000; + + if (tz->tz_enabled) + thermal_zone_device_update(tz->thermal_zone); + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Polling frequency set to %lu seconds\n", tz->polling_frequency/10)); @@ -569,386 +572,11 @@ static int acpi_thermal_get_trip_points(struct acpi_thermal *tz) return acpi_thermal_trips_update(tz, ACPI_TRIPS_INIT); } -static int acpi_thermal_critical(struct acpi_thermal *tz) -{ - if (!tz || !tz->trips.critical.flags.valid) - return -EINVAL; - - if (tz->temperature >= tz->trips.critical.temperature) { - printk(KERN_WARNING PREFIX "Critical trip point\n"); - tz->trips.critical.flags.enabled = 1; - } else if (tz->trips.critical.flags.enabled) - tz->trips.critical.flags.enabled = 0; - - acpi_bus_generate_proc_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, - tz->trips.critical.flags.enabled); - acpi_bus_generate_netlink_event(tz->device->pnp.device_class, - dev_name(&tz->device->dev), - ACPI_THERMAL_NOTIFY_CRITICAL, - tz->trips.critical.flags.enabled); - - /* take no action if nocrt is set */ - if(!nocrt) { - printk(KERN_EMERG - "Critical temperature reached (%ld C), shutting down.\n", - KELVIN_TO_CELSIUS(tz->temperature)); - orderly_poweroff(true); - } - - return 0; -} - -static int acpi_thermal_hot(struct acpi_thermal *tz) -{ - if (!tz || !tz->trips.hot.flags.valid) - return -EINVAL; - - if (tz->temperature >= tz->trips.hot.temperature) { - printk(KERN_WARNING PREFIX "Hot trip point\n"); - tz->trips.hot.flags.enabled = 1; - } else if (tz->trips.hot.flags.enabled) - tz->trips.hot.flags.enabled = 0; - - acpi_bus_generate_proc_event(tz->device, ACPI_THERMAL_NOTIFY_HOT, - tz->trips.hot.flags.enabled); - acpi_bus_generate_netlink_event(tz->device->pnp.device_class, - dev_name(&tz->device->dev), - ACPI_THERMAL_NOTIFY_HOT, - tz->trips.hot.flags.enabled); - - /* TBD: Call user-mode "sleep(S4)" function if nocrt is cleared */ - - return 0; -} - -static void acpi_thermal_passive(struct acpi_thermal *tz) -{ - int result = 1; - struct acpi_thermal_passive *passive = NULL; - int trend = 0; - int i = 0; - - - if (!tz || !tz->trips.passive.flags.valid) - return; - - passive = &(tz->trips.passive); - - /* - * Above Trip? - * ----------- - * Calculate the thermal trend (using the passive cooling equation) - * and modify the performance limit for all passive cooling devices - * accordingly. Note that we assume symmetry. 
- */ - if (tz->temperature >= passive->temperature) { - trend = - (passive->tc1 * (tz->temperature - tz->last_temperature)) + - (passive->tc2 * (tz->temperature - passive->temperature)); - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "trend[%d]=(tc1[%lu]*(tmp[%lu]-last[%lu]))+(tc2[%lu]*(tmp[%lu]-psv[%lu]))\n", - trend, passive->tc1, tz->temperature, - tz->last_temperature, passive->tc2, - tz->temperature, passive->temperature)); - passive->flags.enabled = 1; - /* Heating up? */ - if (trend > 0) - for (i = 0; i < passive->devices.count; i++) - acpi_processor_set_thermal_limit(passive-> - devices. - handles[i], - ACPI_PROCESSOR_LIMIT_INCREMENT); - /* Cooling off? */ - else if (trend < 0) { - for (i = 0; i < passive->devices.count; i++) - /* - * assume that we are on highest - * freq/lowest thrott and can leave - * passive mode, even in error case - */ - if (!acpi_processor_set_thermal_limit - (passive->devices.handles[i], - ACPI_PROCESSOR_LIMIT_DECREMENT)) - result = 0; - /* - * Leave cooling mode, even if the temp might - * higher than trip point This is because some - * machines might have long thermal polling - * frequencies (tsp) defined. We will fall back - * into passive mode in next cycle (probably quicker) - */ - if (result) { - passive->flags.enabled = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Disabling passive cooling, still above threshold," - " but we are cooling down\n")); - } - } - return; - } - - /* - * Below Trip? - * ----------- - * Implement passive cooling hysteresis to slowly increase performance - * and avoid thrashing around the passive trip point. Note that we - * assume symmetry. - */ - if (!passive->flags.enabled) - return; - for (i = 0; i < passive->devices.count; i++) - if (!acpi_processor_set_thermal_limit - (passive->devices.handles[i], - ACPI_PROCESSOR_LIMIT_DECREMENT)) - result = 0; - if (result) { - passive->flags.enabled = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Disabling passive cooling (zone is cool)\n")); - } -} - -static void acpi_thermal_active(struct acpi_thermal *tz) -{ - int result = 0; - struct acpi_thermal_active *active = NULL; - int i = 0; - int j = 0; - unsigned long maxtemp = 0; - - - if (!tz) - return; - - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) { - active = &(tz->trips.active[i]); - if (!active || !active->flags.valid) - break; - if (tz->temperature >= active->temperature) { - /* - * Above Threshold? - * ---------------- - * If not already enabled, turn ON all cooling devices - * associated with this active threshold. - */ - if (active->temperature > maxtemp) - tz->state.active_index = i; - maxtemp = active->temperature; - if (active->flags.enabled) - continue; - for (j = 0; j < active->devices.count; j++) { - result = - acpi_bus_set_power(active->devices. - handles[j], - ACPI_STATE_D0); - if (result) { - printk(KERN_WARNING PREFIX - "Unable to turn cooling device [%p] 'on'\n", - active->devices. - handles[j]); - continue; - } - active->flags.enabled = 1; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Cooling device [%p] now 'on'\n", - active->devices.handles[j])); - } - continue; - } - if (!active->flags.enabled) - continue; - /* - * Below Threshold? - * ---------------- - * Turn OFF all cooling devices associated with this - * threshold. 
- */ - for (j = 0; j < active->devices.count; j++) { - result = acpi_bus_set_power(active->devices.handles[j], - ACPI_STATE_D3); - if (result) { - printk(KERN_WARNING PREFIX - "Unable to turn cooling device [%p] 'off'\n", - active->devices.handles[j]); - continue; - } - active->flags.enabled = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Cooling device [%p] now 'off'\n", - active->devices.handles[j])); - } - } -} - -static void acpi_thermal_check(void *context); - -static void acpi_thermal_run(unsigned long data) -{ - struct acpi_thermal *tz = (struct acpi_thermal *)data; - if (!tz->zombie) - acpi_os_execute(OSL_GPE_HANDLER, acpi_thermal_check, (void *)data); -} - -static void acpi_thermal_active_off(void *data) -{ - int result = 0; - struct acpi_thermal *tz = data; - int i = 0; - int j = 0; - struct acpi_thermal_active *active = NULL; - - if (!tz) { - printk(KERN_ERR PREFIX "Invalid (NULL) context\n"); - return; - } - - result = acpi_thermal_get_temperature(tz); - if (result) - return; - - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) { - active = &(tz->trips.active[i]); - if (!active || !active->flags.valid) - break; - if (tz->temperature >= active->temperature) { - /* - * If the thermal temperature is greater than the - * active threshod, unnecessary to turn off the - * the active cooling device. - */ - continue; - } - /* - * Below Threshold? - * ---------------- - * Turn OFF all cooling devices associated with this - * threshold. - */ - for (j = 0; j < active->devices.count; j++) - result = acpi_bus_set_power(active->devices.handles[j], - ACPI_STATE_D3); - } -} - static void acpi_thermal_check(void *data) { - int result = 0; struct acpi_thermal *tz = data; - unsigned long sleep_time = 0; - unsigned long timeout_jiffies = 0; - int i = 0; - struct acpi_thermal_state state; - - - if (!tz) { - printk(KERN_ERR PREFIX "Invalid (NULL) context\n"); - return; - } - - /* Check if someone else is already running */ - if (!mutex_trylock(&tz->lock)) - return; - - state = tz->state; - - result = acpi_thermal_get_temperature(tz); - if (result) - goto unlock; - - if (!tz->tz_enabled) - goto unlock; - - memset(&tz->state, 0, sizeof(tz->state)); - - /* - * Check Trip Points - * ----------------- - * Compare the current temperature to the trip point values to see - * if we've entered one of the thermal policy states. Note that - * this function determines when a state is entered, but the - * individual policy decides when it is exited (e.g. hysteresis). - */ - if (tz->trips.critical.flags.valid) - state.critical |= - (tz->temperature >= tz->trips.critical.temperature); - if (tz->trips.hot.flags.valid) - state.hot |= (tz->temperature >= tz->trips.hot.temperature); - if (tz->trips.passive.flags.valid) - state.passive |= - (tz->temperature >= tz->trips.passive.temperature); - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) - if (tz->trips.active[i].flags.valid) - state.active |= - (tz->temperature >= - tz->trips.active[i].temperature); - - /* - * Invoke Policy - * ------------- - * Separated from the above check to allow individual policy to - * determine when to exit a given state. - */ - if (state.critical) - acpi_thermal_critical(tz); - if (state.hot) - acpi_thermal_hot(tz); - if (state.passive) - acpi_thermal_passive(tz); - if (state.active) - acpi_thermal_active(tz); - - /* - * Calculate State - * --------------- - * Again, separated from the above two to allow independent policy - * decisions. 
- */ - tz->state.critical = tz->trips.critical.flags.enabled; - tz->state.hot = tz->trips.hot.flags.enabled; - tz->state.passive = tz->trips.passive.flags.enabled; - tz->state.active = 0; - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) - tz->state.active |= tz->trips.active[i].flags.enabled; - - /* - * Calculate Sleep Time - * -------------------- - * If we're in the passive state, use _TSP's value. Otherwise - * use the default polling frequency (e.g. _TZP). If no polling - * frequency is specified then we'll wait forever (at least until - * a thermal event occurs). Note that _TSP and _TZD values are - * given in 1/10th seconds (we must covert to milliseconds). - */ - if (tz->state.passive) { - sleep_time = tz->trips.passive.tsp * 100; - timeout_jiffies = jiffies + (HZ * sleep_time) / 1000; - } else if (tz->polling_frequency > 0) { - sleep_time = tz->polling_frequency * 100; - timeout_jiffies = round_jiffies(jiffies + (HZ * sleep_time) / 1000); - } - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "%s: temperature[%lu] sleep[%lu]\n", - tz->name, tz->temperature, sleep_time)); - /* - * Schedule Next Poll - * ------------------ - */ - if (!sleep_time) { - if (timer_pending(&(tz->timer))) - del_timer(&(tz->timer)); - } else { - if (timer_pending(&(tz->timer))) - mod_timer(&(tz->timer), timeout_jiffies); - else { - tz->timer.data = (unsigned long)tz; - tz->timer.function = acpi_thermal_run; - tz->timer.expires = timeout_jiffies; - add_timer(&(tz->timer)); - } - } - unlock: - mutex_unlock(&tz->lock); + thermal_zone_device_update(tz->thermal_zone); } /* sys I/F for generic thermal sysfs support */ @@ -1122,6 +750,29 @@ static int thermal_get_crit_temp(struct thermal_zone_device *thermal, return -EINVAL; } +static int thermal_notify(struct thermal_zone_device *thermal, int trip, + enum thermal_trip_type trip_type) +{ + u8 type = 0; + struct acpi_thermal *tz = thermal->devdata; + + if (trip_type == THERMAL_TRIP_CRITICAL) + type = ACPI_THERMAL_NOTIFY_CRITICAL; + else if (trip_type == THERMAL_TRIP_HOT) + type = ACPI_THERMAL_NOTIFY_HOT; + else + return 0; + + acpi_bus_generate_proc_event(tz->device, type, 1); + acpi_bus_generate_netlink_event(tz->device->pnp.device_class, + tz->device->dev.bus_id, type, 1); + + if (trip_type == THERMAL_TRIP_CRITICAL && nocrt) + return 1; + + return 0; +} + typedef int (*cb)(struct thermal_zone_device *, int, struct thermal_cooling_device *); static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal, @@ -1214,6 +865,7 @@ static struct thermal_zone_device_ops acpi_thermal_zone_ops = { .get_trip_type = thermal_get_trip_type, .get_trip_temp = thermal_get_trip_temp, .get_crit_temp = thermal_get_crit_temp, + .notify = thermal_notify, }; static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz) @@ -1234,8 +886,21 @@ static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz) for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++, trips++); - tz->thermal_zone = thermal_zone_device_register("acpitz", - trips, tz, &acpi_thermal_zone_ops); + + if (tz->trips.passive.flags.valid) + tz->thermal_zone = + thermal_zone_device_register("acpitz", trips, tz, + &acpi_thermal_zone_ops, + tz->trips.passive.tc1, + tz->trips.passive.tc2, + tz->trips.passive.tsp*100, + tz->polling_frequency*100); + else + tz->thermal_zone = + thermal_zone_device_register("acpitz", trips, tz, + &acpi_thermal_zone_ops, + 0, 0, 0, + tz->polling_frequency); if (IS_ERR(tz->thermal_zone)) return -ENODEV; @@ -1467,13 +1132,13 @@ static int 
acpi_thermal_polling_seq_show(struct seq_file *seq, void *offset) if (!tz) goto end; - if (!tz->polling_frequency) { + if (!tz->thermal_zone->polling_delay) { seq_puts(seq, "\n"); goto end; } - seq_printf(seq, "polling frequency: %lu seconds\n", - (tz->polling_frequency / 10)); + seq_printf(seq, "polling frequency: %d seconds\n", + (tz->thermal_zone->polling_delay / 1000)); end: return 0; @@ -1703,12 +1368,6 @@ static int acpi_thermal_add(struct acpi_device *device) if (result) goto unregister_thermal_zone; - init_timer(&tz->timer); - - acpi_thermal_active_off(tz); - - acpi_thermal_check(tz); - status = acpi_install_notify_handler(device->handle, ACPI_DEVICE_NOTIFY, acpi_thermal_notify, tz); @@ -1737,36 +1396,15 @@ static int acpi_thermal_remove(struct acpi_device *device, int type) acpi_status status = AE_OK; struct acpi_thermal *tz = NULL; - if (!device || !acpi_driver_data(device)) return -EINVAL; tz = acpi_driver_data(device); - /* avoid timer adding new defer task */ - tz->zombie = 1; - /* wait for running timer (on other CPUs) finish */ - del_timer_sync(&(tz->timer)); - /* synchronize deferred task */ - acpi_os_wait_events_complete(NULL); - /* deferred task may reinsert timer */ - del_timer_sync(&(tz->timer)); - status = acpi_remove_notify_handler(device->handle, ACPI_DEVICE_NOTIFY, acpi_thermal_notify); - /* Terminate policy */ - if (tz->trips.passive.flags.valid && tz->trips.passive.flags.enabled) { - tz->trips.passive.flags.enabled = 0; - acpi_thermal_passive(tz); - } - if (tz->trips.active[0].flags.valid - && tz->trips.active[0].flags.enabled) { - tz->trips.active[0].flags.enabled = 0; - acpi_thermal_active(tz); - } - acpi_thermal_remove_fs(device); acpi_thermal_unregister_thermal_zone(tz); mutex_destroy(&tz->lock); diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index bd139adc6d32..6378741882f3 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -30,6 +30,7 @@ #include #include #include +#include MODULE_AUTHOR("Zhang Rui"); MODULE_DESCRIPTION("Generic thermal management sysfs support"); @@ -517,6 +518,97 @@ thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz) } #endif +static void thermal_zone_device_set_polling(struct thermal_zone_device *tz, + int delay) +{ + cancel_delayed_work(&(tz->poll_queue)); + + if (!delay) + return; + + if (delay > 1000) + schedule_delayed_work(&(tz->poll_queue), + round_jiffies(msecs_to_jiffies(delay))); + else + schedule_delayed_work(&(tz->poll_queue), + msecs_to_jiffies(delay)); +} + +static void thermal_zone_device_passive(struct thermal_zone_device *tz, + int temp, int trip_temp, int trip) +{ + int trend = 0; + struct thermal_cooling_device_instance *instance; + struct thermal_cooling_device *cdev; + long state, max_state; + + /* + * Above Trip? + * ----------- + * Calculate the thermal trend (using the passive cooling equation) + * and modify the performance limit for all passive cooling devices + * accordingly. Note that we assume symmetry. + */ + if (temp >= trip_temp) { + tz->passive = true; + + trend = (tz->tc1 * (temp - tz->last_temperature)) + + (tz->tc2 * (temp - trip_temp)); + + /* Heating up? */ + if (trend > 0) { + list_for_each_entry(instance, &tz->cooling_devices, + node) { + if (instance->trip != trip) + continue; + cdev = instance->cdev; + cdev->ops->get_cur_state(cdev, &state); + cdev->ops->get_max_state(cdev, &max_state); + if (state++ < max_state) + cdev->ops->set_cur_state(cdev, state); + } + } else if (trend < 0) { /* Cooling off? 
*/ + list_for_each_entry(instance, &tz->cooling_devices, + node) { + if (instance->trip != trip) + continue; + cdev = instance->cdev; + cdev->ops->get_cur_state(cdev, &state); + cdev->ops->get_max_state(cdev, &max_state); + if (state > 0) + cdev->ops->set_cur_state(cdev, --state); + } + } + return; + } + + /* + * Below Trip? + * ----------- + * Implement passive cooling hysteresis to slowly increase performance + * and avoid thrashing around the passive trip point. Note that we + * assume symmetry. + */ + list_for_each_entry(instance, &tz->cooling_devices, node) { + if (instance->trip != trip) + continue; + cdev = instance->cdev; + cdev->ops->get_cur_state(cdev, &state); + cdev->ops->get_max_state(cdev, &max_state); + if (state > 0) + cdev->ops->set_cur_state(cdev, --state); + if (state == 0) + tz->passive = false; + } +} + +static void thermal_zone_device_check(struct work_struct *work) +{ + struct thermal_zone_device *tz = container_of(work, struct + thermal_zone_device, + poll_queue.work); + thermal_zone_device_update(tz); +} /** * thermal_zone_bind_cooling_device - bind a cooling device to a thermal zone @@ -786,21 +878,102 @@ void thermal_cooling_device_unregister(struct EXPORT_SYMBOL(thermal_cooling_device_unregister); +/** + * thermal_zone_device_update - force an update of a thermal zone's state + * @ttz: the thermal zone to update + */ + +void thermal_zone_device_update(struct thermal_zone_device *tz) +{ + int count, ret = 0; + long temp, trip_temp; + enum thermal_trip_type trip_type; + struct thermal_cooling_device_instance *instance; + struct thermal_cooling_device *cdev; + + mutex_lock(&tz->lock); + + tz->ops->get_temp(tz, &temp); + + for (count = 0; count < tz->trips; count++) { + tz->ops->get_trip_type(tz, count, &trip_type); + tz->ops->get_trip_temp(tz, count, &trip_temp); + + switch (trip_type) { + case THERMAL_TRIP_CRITICAL: + if (temp > trip_temp) { + if (tz->ops->notify) + ret = tz->ops->notify(tz, count, + trip_type); + if (!ret) { + printk(KERN_EMERG + "Critical temperature reached (%ld C), shutting down.\n", + temp/1000); + orderly_poweroff(true); + } + } + break; + case THERMAL_TRIP_HOT: + if (temp > trip_temp) + if (tz->ops->notify) + tz->ops->notify(tz, count, trip_type); + break; + case THERMAL_TRIP_ACTIVE: + list_for_each_entry(instance, &tz->cooling_devices, + node) { + if (instance->trip != count) + continue; + + cdev = instance->cdev; + + if (temp > trip_temp) + cdev->ops->set_cur_state(cdev, 1); + else + cdev->ops->set_cur_state(cdev, 0); + } + break; + case THERMAL_TRIP_PASSIVE: + if (temp > trip_temp || tz->passive) + thermal_zone_device_passive(tz, temp, + trip_temp, count); + break; + } + } + tz->last_temperature = temp; + if (tz->passive) + thermal_zone_device_set_polling(tz, tz->passive_delay); + else if (tz->polling_delay) + thermal_zone_device_set_polling(tz, tz->polling_delay); + mutex_unlock(&tz->lock); +} +EXPORT_SYMBOL(thermal_zone_device_update); + /** * thermal_zone_device_register - register a new thermal zone device * @type: the thermal zone device type * @trips: the number of trip points the thermal zone support * @devdata: private device data * @ops: standard thermal zone device callbacks + * @tc1: thermal coefficient 1 for passive calculations + * @tc2: thermal coefficient 2 for passive calculations + * @passive_delay: number of milliseconds to wait between polls when + * performing passive cooling + * @polling_delay: number of milliseconds to wait between polls when checking + * whether trip points have been crossed (0 for interrupt + * 
driven systems) * * thermal_zone_device_unregister() must be called when the device is no - * longer needed. + * longer needed. The passive cooling formula uses tc1 and tc2 as described in + * section 11.1.5.1 of the ACPI specification 3.0. */ struct thermal_zone_device *thermal_zone_device_register(char *type, int trips, void *devdata, struct thermal_zone_device_ops - *ops) + *ops, int tc1, int + tc2, + int passive_delay, + int polling_delay) { struct thermal_zone_device *tz; struct thermal_cooling_device *pos; @@ -834,6 +1007,11 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, tz->device.class = &thermal_class; tz->devdata = devdata; tz->trips = trips; + tz->tc1 = tc1; + tz->tc2 = tc2; + tz->passive_delay = passive_delay; + tz->polling_delay = polling_delay; + dev_set_name(&tz->device, "thermal_zone%d", tz->id); result = device_register(&tz->device); if (result) { @@ -879,6 +1057,10 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, } mutex_unlock(&thermal_list_lock); + INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check); + + thermal_zone_device_update(tz); + if (!result) return tz; @@ -918,6 +1100,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz) tz->ops->unbind(tz, cdev); mutex_unlock(&thermal_list_lock); + thermal_zone_device_set_polling(tz, 0); + if (tz->type[0]) device_remove_file(&tz->device, &dev_attr_type); device_remove_file(&tz->device, &dev_attr_temp); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 4cb3292fb6e4..a81c61521ba4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -27,6 +27,7 @@ #include #include +#include struct thermal_zone_device; struct thermal_cooling_device; @@ -58,6 +59,8 @@ struct thermal_zone_device_ops { int (*get_trip_temp) (struct thermal_zone_device *, int, unsigned long *); int (*get_crit_temp) (struct thermal_zone_device *, unsigned long *); + int (*notify) (struct thermal_zone_device *, int, + enum thermal_trip_type); }; struct thermal_cooling_device_ops { @@ -104,11 +107,18 @@ struct thermal_zone_device { struct device device; void *devdata; int trips; + int tc1; + int tc2; + int passive_delay; + int polling_delay; + int last_temperature; + bool passive; struct thermal_zone_device_ops *ops; struct list_head cooling_devices; struct idr idr; struct mutex lock; /* protect cooling devices list */ struct list_head node; + struct delayed_work poll_queue; #if defined(CONFIG_THERMAL_HWMON) struct list_head hwmon_node; struct thermal_hwmon_device *hwmon; @@ -120,13 +130,16 @@ struct thermal_zone_device { struct thermal_zone_device *thermal_zone_device_register(char *, int, void *, struct thermal_zone_device_ops - *); + *, int tc1, int tc2, + int passive_freq, + int polling_freq); void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); +void thermal_zone_device_update(struct thermal_zone_device *); struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, struct thermal_cooling_device_ops -- cgit v1.2.3-71-gd317 From 9c3c133b1ed6e6d01bfabb6de29bf3d0f0886354 Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Sun, 22 Feb 2009 12:03:56 +0800 Subject: hwrng: timeriomem - New driver Some hardware platforms, the TS-7800[1] is one for example, can supply the kernel with an entropy source, albeit a slow 
one for TS-7800 users, by just reading a particular IO address. This source must not be read above a certain rate otherwise the quality suffers. The driver is then hooked into by calling platform_device_(register|add|del) passing a structure similar to: ------ static struct timeriomem_rng_data ts78xx_ts_rng_data = { .address = (u32 *__iomem) TS_RNG, .period = 1000000, /* one second */ }; static struct platform_device ts78xx_ts_rng_device = { .name = "timeriomem_rng", .id = -1, .dev = { .platform_data = &ts78xx_ts_rng_data, }, .num_resources = 0, }; ------ [1] http://www.embeddedarm.com/products/board-detail.php?product=TS-7800 Signed-off-by: Alexander Clouter Signed-off-by: Herbert Xu --- drivers/char/hw_random/Kconfig | 14 +++ drivers/char/hw_random/Makefile | 1 + drivers/char/hw_random/timeriomem-rng.c | 151 ++++++++++++++++++++++++++++++++ include/linux/timeriomem-rng.h | 21 +++++ 4 files changed, 187 insertions(+) create mode 100644 drivers/char/hw_random/timeriomem-rng.c create mode 100644 include/linux/timeriomem-rng.h (limited to 'include/linux') diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index 8822eca58ffa..e86dd425a70f 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -20,6 +20,20 @@ config HW_RANDOM If unsure, say Y. +config HW_RANDOM_TIMERIOMEM + tristate "Timer IOMEM HW Random Number Generator support" + depends on HW_RANDOM + ---help--- + This driver provides kernel-side support for a generic Random + Number Generator used by reading a 'dumb' iomem address that + is to be read no faster than, for example, once a second; + the default FPGA bitstream on the TS-7800 has such functionality. + + To compile this driver as a module, choose M here: the + module will be called timeriomem-rng. + + If unsure, say Y. + config HW_RANDOM_INTEL tristate "Intel HW Random Number Generator support" depends on HW_RANDOM && (X86 || IA64) && PCI diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile index b6effb7522c2..e81d21a5f28f 100644 --- a/drivers/char/hw_random/Makefile +++ b/drivers/char/hw_random/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_HW_RANDOM) += rng-core.o rng-core-y := core.o +obj-$(CONFIG_HW_RANDOM_TIMERIOMEM) += timeriomem-rng.o obj-$(CONFIG_HW_RANDOM_INTEL) += intel-rng.o obj-$(CONFIG_HW_RANDOM_AMD) += amd-rng.o obj-$(CONFIG_HW_RANDOM_GEODE) += geode-rng.o diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c new file mode 100644 index 000000000000..10ad41be5897 --- /dev/null +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -0,0 +1,151 @@ +/* + * drivers/char/hw_random/timeriomem-rng.c + * + * Copyright (C) 2009 Alexander Clouter + * + * Derived from drivers/char/hw_random/omap-rng.c + * Copyright 2005 (c) MontaVista Software, Inc. + * Author: Deepak Saxena + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Overview: + * This driver is useful for platforms that have an IO range that provides + * periodic random data from a single IO memory address. All the platform + * has to do is provide the address and 'wait time' that new data becomes + * available. 
+ * + * TODO: add support for reading sizes other than 32bits and masking + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct timeriomem_rng_data *timeriomem_rng_data; + +static void timeriomem_rng_trigger(unsigned long); +static DEFINE_TIMER(timeriomem_rng_timer, timeriomem_rng_trigger, 0, 0); + +/* + * have data return 1, however return 0 if we have nothing + */ +static int timeriomem_rng_data_present(struct hwrng *rng, int wait) +{ + if (rng->priv == 0) + return 1; + + if (!wait || timeriomem_rng_data->present) + return timeriomem_rng_data->present; + + wait_for_completion(&timeriomem_rng_data->completion); + + return 1; +} + +static int timeriomem_rng_data_read(struct hwrng *rng, u32 *data) +{ + unsigned long cur; + s32 delay; + + *data = readl(timeriomem_rng_data->address); + + if (rng->priv != 0) { + cur = jiffies; + + delay = cur - timeriomem_rng_timer.expires; + delay = rng->priv - (delay % rng->priv); + + timeriomem_rng_timer.expires = cur + delay; + timeriomem_rng_data->present = 0; + + init_completion(&timeriomem_rng_data->completion); + add_timer(&timeriomem_rng_timer); + } + + return 4; +} + +static void timeriomem_rng_trigger(unsigned long dummy) +{ + timeriomem_rng_data->present = 1; + complete(&timeriomem_rng_data->completion); +} + +static struct hwrng timeriomem_rng_ops = { + .name = "timeriomem", + .data_present = timeriomem_rng_data_present, + .data_read = timeriomem_rng_data_read, + .priv = 0, +}; + +static int __init timeriomem_rng_probe(struct platform_device *pdev) +{ + int ret; + + timeriomem_rng_data = pdev->dev.platform_data; + + if (timeriomem_rng_data->period != 0 + && usecs_to_jiffies(timeriomem_rng_data->period) > 0) { + timeriomem_rng_timer.expires = jiffies; + + timeriomem_rng_ops.priv = usecs_to_jiffies( + timeriomem_rng_data->period); + } + timeriomem_rng_data->present = 1; + + ret = hwrng_register(&timeriomem_rng_ops); + if (ret) { + dev_err(&pdev->dev, "problem registering\n"); + return ret; + } + + dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n", + timeriomem_rng_data->address, + timeriomem_rng_data->period); + + return 0; +} + +static int __devexit timeriomem_rng_remove(struct platform_device *pdev) +{ + del_timer_sync(&timeriomem_rng_timer); + hwrng_unregister(&timeriomem_rng_ops); + + return 0; +} + +static struct platform_driver timeriomem_rng_driver = { + .driver = { + .name = "timeriomem_rng", + .owner = THIS_MODULE, + }, + .probe = timeriomem_rng_probe, + .remove = __devexit_p(timeriomem_rng_remove), +}; + +static int __init timeriomem_rng_init(void) +{ + return platform_driver_register(&timeriomem_rng_driver); +} + +static void __exit timeriomem_rng_exit(void) +{ + platform_driver_unregister(&timeriomem_rng_driver); +} + +module_init(timeriomem_rng_init); +module_exit(timeriomem_rng_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alexander Clouter "); +MODULE_DESCRIPTION("Timer IOMEM H/W RNG driver"); diff --git a/include/linux/timeriomem-rng.h b/include/linux/timeriomem-rng.h new file mode 100644 index 000000000000..dd253177f65f --- /dev/null +++ b/include/linux/timeriomem-rng.h @@ -0,0 +1,21 @@ +/* + * linux/include/linux/timeriomem-rng.h + * + * Copyright (c) 2009 Alexander Clouter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include + +struct timeriomem_rng_data { + struct completion completion; + unsigned int present:1; + + u32 __iomem *address; + + /* measures in usecs */ + unsigned int period; +}; -- cgit v1.2.3-71-gd317 From 3b89d7d881a1dbb4da158f7eb5d6b3ceefc72810 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 22 Feb 2009 17:40:07 -0800 Subject: slub: move min_partial to struct kmem_cache Although it allows for better cacheline use, it is unnecessary to save a copy of the cache's min_partial value in each kmem_cache_node. Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 2 +- mm/slub.c | 29 ++++++++++++++++------------- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..f20a89e4d52c 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -46,7 +46,6 @@ struct kmem_cache_cpu { struct kmem_cache_node { spinlock_t list_lock; /* Protect partial list and nr_partial */ unsigned long nr_partial; - unsigned long min_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG atomic_long_t nr_slabs; @@ -89,6 +88,7 @@ struct kmem_cache { void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ + unsigned long min_partial; const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SLUB_DEBUG diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..4fff385b17a3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1335,7 +1335,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(zone)); if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > n->min_partial) { + n->nr_partial > s->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1387,7 +1387,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < n->min_partial) { + if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs @@ -1928,17 +1928,6 @@ static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; - - /* - * The larger the object size is, the more pages we want on the partial - * list to avoid pounding the page allocator excessively. - */ - n->min_partial = ilog2(s->size); - if (n->min_partial < MIN_PARTIAL) - n->min_partial = MIN_PARTIAL; - else if (n->min_partial > MAX_PARTIAL) - n->min_partial = MAX_PARTIAL; - spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG @@ -2181,6 +2170,15 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif +static void calculate_min_partial(struct kmem_cache *s, unsigned long min) +{ + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -2319,6 +2317,11 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (!calculate_sizes(s, -1)) goto error; + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. 
+ */ + calculate_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; -- cgit v1.2.3-71-gd317 From c132937556f56ee4b831ef4b23f1846e05fde102 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:20 +0900 Subject: bootmem: clean up arch-specific bootmem wrapping Impact: cleaner and consistent bootmem wrapping By setting CONFIG_HAVE_ARCH_BOOTMEM_NODE, archs can define arch-specific wrappers for bootmem allocation. However, this is done a bit strangely in that only the high level convenience macros can be changed while lower level, but still exported, interface functions can't be wrapped. This not only is messy but also leads to strange situation where alloc_bootmem() does what the arch wants it to do but the equivalent __alloc_bootmem() call doesn't although they should be able to be used interchangeably. This patch updates bootmem such that archs can override / wrap the backend function - alloc_bootmem_core() instead of the highlevel interface functions to allow simpler and consistent wrapping. Also, HAVE_ARCH_BOOTMEM_NODE is renamed to HAVE_ARCH_BOOTMEM. Signed-off-by: Tejun Heo Cc: Johannes Weiner --- arch/avr32/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/mmzone_32.h | 43 +++++----------------------------------- include/linux/bootmem.h | 10 ++++------ mm/bootmem.c | 14 ++++++++++--- 5 files changed, 22 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index b189680d18b0..05fe3053dcae 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt" config QUICKLIST def_bool y -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool n config ARCH_HAVE_MEMORY_PRESENT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d3f6eadfd4ba..6fd3b2302ed9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1111,7 +1111,7 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accomodate various tables. -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 07f1af494ca5..1e0fa9e63afa 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -93,45 +93,12 @@ static inline int pfn_valid(int pfn) #endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_NEED_MULTIPLE_NODES - -/* - * Following are macros that are specific to this numa platform. 
- */ -#define reserve_bootmem(addr, size, flags) \ - reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) -#define alloc_bootmem(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) -#define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define alloc_bootmem_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_low_pages_node(pgdat, x) \ +/* always use node 0 for bootmem on this numa platform */ +#define alloc_bootmem_core(__bdata, size, align, goal, limit) \ ({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ + bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata); \ + __alloc_bootmem_core(NODE_DATA(0)->bdata, \ + (size), (align), (goal), (limit)); \ }) #endif /* CONFIG_NEED_MULTIPLE_NODES */ diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 95837bfb5256..3a87f93081ed 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -69,10 +69,9 @@ extern int reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags); -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); -#endif - +extern int reserve_bootmem(unsigned long addr, + unsigned long size, + int flags); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal); @@ -94,7 +93,7 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE + #define alloc_bootmem(x) \ __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_nopanic(x) \ @@ -113,7 +112,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, int flags); diff --git a/mm/bootmem.c b/mm/bootmem.c index 51a0ccf61e0e..d7140c008ba8 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); static int bootmem_debug; +/* + * If an arch needs to apply workarounds to bootmem allocation, it can + * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around + * __alloc_bootmem_core(). 
+ */ +#ifndef CONFIG_HAVE_ARCH_BOOTMEM +#define alloc_bootmem_core(bdata, size, align, goal, limit) \ + __alloc_bootmem_core((bdata), (size), (align), (goal), (limit)) +#endif + static int __init bootmem_debug_setup(char *buf) { bootmem_debug = 1; @@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); } -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE /** * reserve_bootmem - mark a page range as usable * @addr: starting address of the range @@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) @@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, return ALIGN(base + off, align) - base; } -static void * __init alloc_bootmem_core(struct bootmem_data *bdata, +static void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -- cgit v1.2.3-71-gd317 From 2d0aae41695257603fc281b519677131ab5a752b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: bootmem: reorder interface functions and add a missing one Impact: cleanup and addition of missing interface wrapper The interface functions in bootmem.h was ordered in not so orderly manner. Reorder them such that * functions allocating the same area group together - ie. alloc_bootmem group and alloc_bootmem_low group. * functions w/o node parameter come before the ones w/ node parameter. * nopanic variants are immediately below their panicky counterparts. While at it, add alloc_bootmem_pages_node_nopanic() which was missing. 
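To show what the newly added wrapper buys a caller, here is a small hedged
sketch (the demo_* name is made up): alloc_bootmem_pages_node_nopanic()
returns NULL on failure instead of panicking, so boot code can attempt a
node-local allocation first and fall back.

------
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>

/* Boot-time allocation of a per-node table with a graceful fallback. */
static void * __init demo_alloc_node_table(int nid, unsigned long size)
{
	void *table;

	/* Node-local attempt: the _nopanic variant returns NULL on failure. */
	table = alloc_bootmem_pages_node_nopanic(NODE_DATA(nid), size);
	if (!table)
		/* Last resort: any node; this variant panics if it fails. */
		table = alloc_bootmem_pages(size);

	return table;
}
------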
Signed-off-by: Tejun Heo Cc: Johannes Weiner --- include/linux/bootmem.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 3a87f93081ed..455d83219fae 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -65,22 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size); #define BOOTMEM_DEFAULT 0 #define BOOTMEM_EXCLUSIVE (1<<0) -extern int reserve_bootmem_node(pg_data_t *pgdat, - unsigned long physaddr, - unsigned long size, - int flags); extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); -extern void *__alloc_bootmem_nopanic(unsigned long size, +extern int reserve_bootmem_node(pg_data_t *pgdat, + unsigned long physaddr, + unsigned long size, + int flags); + +extern void *__alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem(unsigned long size, +extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem_low(unsigned long size, - unsigned long align, - unsigned long goal); extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -89,6 +87,9 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); +extern void *__alloc_bootmem_low(unsigned long size, + unsigned long align, + unsigned long goal); extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -98,18 +99,21 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_nopanic(x) \ __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_nopanic(x) \ __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_pages_node_nopanic(pgdat, x) \ + __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + +#define alloc_bootmem_low(x) \ + __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) -- cgit v1.2.3-71-gd317 From c0c0a29379b5848aec2e8f1c58d853d3cb7118b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: vmalloc: add @align to vm_area_register_early() Impact: allow larger alignment for early vmalloc area allocation Some early vmalloc users might want larger alignment, for example, for custom large page mapping. Add @align to vm_area_register_early(). While at it, drop docbook comment on non-existent @size. 
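A hedged sketch of the updated call, assuming an early user that wants its
mapping aligned for a large page (the demo_* names are illustrative and
PMD_SIZE merely stands in for whatever alignment an arch would pick; the real
callers converted by this patch pass PAGE_SIZE):

------
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <asm/pgtable.h>

static struct vm_struct demo_early_vm;

void __init demo_reserve_early_area(unsigned long size)
{
	demo_early_vm.flags = VM_ALLOC;
	demo_early_vm.size  = size;

	/* Request PMD alignment instead of the former implicit placement. */
	vm_area_register_early(&demo_early_vm, PMD_SIZE);

	/* demo_early_vm.addr now holds the chosen, PMD-aligned address. */
}
------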
Signed-off-by: Tejun Heo Cc: Nick Piggin Cc: Ivan Kokshaysky --- arch/alpha/mm/init.c | 2 +- include/linux/vmalloc.h | 2 +- mm/percpu.c | 2 +- mm/vmalloc.c | 11 +++++++---- 4 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index df6df025ded4..91eddd8505df 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -200,7 +200,7 @@ callback_init(void * kernel_end) /* register the vm area */ console_remap_vm.flags = VM_ALLOC; console_remap_vm.size = nr_pages << PAGE_SHIFT; - vm_area_register_early(&console_remap_vm); + vm_area_register_early(&console_remap_vm, PAGE_SIZE); vaddr = (unsigned long)consle_remap_vm.addr; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 599ba7984310..2f6994fdf0e0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -109,6 +109,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; -extern __init void vm_area_register_early(struct vm_struct *vm); +extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/percpu.c b/mm/percpu.c index ed92caa2aa3b..41e7a5f5ab1b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -860,7 +860,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, /* init and register vm area */ static_vm.flags = VM_ALLOC; static_vm.size = pcpu_chunk_size; - vm_area_register_early(&static_vm); + vm_area_register_early(&static_vm, PAGE_SIZE); /* init static_chunk */ static_chunk = alloc_bootmem(pcpu_chunk_struct_size); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 224eca9650a8..366ae9ea6af2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -995,7 +995,7 @@ EXPORT_SYMBOL(vm_map_ram); /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register - * @size: size of area to register + * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain @@ -1004,12 +1004,15 @@ EXPORT_SYMBOL(vm_map_ram); * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ -void __init vm_area_register_early(struct vm_struct *vm) +void __init vm_area_register_early(struct vm_struct *vm, size_t align) { static size_t vm_init_off __initdata; + unsigned long addr; + + addr = ALIGN(VMALLOC_START + vm_init_off, align); + vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; - vm->addr = (void *)VMALLOC_START + vm_init_off; - vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + vm->addr = (void *)addr; vm->next = vmlist; vmlist = vm; -- cgit v1.2.3-71-gd317 From 8d408b4be37bc49c9086531f2ebe411cf5731746 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: percpu: give more latitude to arch specific first chunk initialization Impact: more latitude for first percpu chunk allocation The first percpu chunk serves the kernel static percpu area and may or may not contain extra room for further dynamic allocation. Initialization of the first chunk needs to be done before normal memory allocation service is up, so it has its own init path - pcpu_setup_static(). It seems archs need more latitude while initializing the first chunk for example to take advantage of large page mapping. This patch makes the following changes to allow this. 
* Define PERCPU_DYNAMIC_RESERVE to give arch hint about how much space to reserve in the first chunk for further dynamic allocation. * Rename pcpu_setup_static() to pcpu_setup_first_chunk(). * Make pcpu_setup_first_chunk() much more flexible by fetching page pointer by callback and adding optional @unit_size, @free_size and @base_addr arguments which allow archs to selectively part of chunk initialization to their likings. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 15 ++++- include/linux/percpu.h | 39 ++++++++++- mm/percpu.c | 149 ++++++++++++++++++++++++++++++++--------- 3 files changed, 167 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 671e6528a82d..d928e8887201 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -41,6 +41,16 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -109,7 +119,10 @@ void __init setup_per_cpu_areas(void) } } - pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size); + pcpu4k_pages = pages; + pcpu4k_nr_static_pages = nr_cpu_pages; + pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0, + NULL, pcpu4k_populate_pte); free_bootmem(__pa(pages), pages_size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 18080995ff3e..910beb0abea2 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -78,12 +78,47 @@ #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +/* minimum unit size, also is the maximum supported allocation size */ +#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT) + +/* + * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy + * back on the first chunk if arch is manually allocating and mapping + * it for faster access (as a part of large page mapping for example). + * Note that dynamic percpu allocator covers both static and dynamic + * areas, so these values are bigger than PERCPU_MODULE_RESERVE. + * + * On typical configuration with modules, the following values leave + * about 8k of free space on the first chunk after boot on both x86_32 + * and 64 when module support is enabled. When module support is + * disabled, it's much tighter. 
+ */ +#ifndef PERCPU_DYNAMIC_RESERVE +# if BITS_PER_LONG > 32 +# ifdef CONFIG_MODULES +# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT) +# else +# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# endif +# else +# ifdef CONFIG_MODULES +# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# else +# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT) +# endif +# endif +#endif /* PERCPU_DYNAMIC_RESERVE */ + extern void *pcpu_base_addr; +typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); -extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, - struct page **pages, size_t cpu_size); +extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t unit_size, + size_t free_size, void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index d9e6e5d1dbd4..9ac01980cce0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -48,8 +48,8 @@ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back * - * - use pcpu_setup_static() during percpu area initialization to - * setup kernel static percpu area + * - use pcpu_setup_first_chunk() during percpu area initialization to + * setup the first chunk containing the kernel static percpu area */ #include @@ -67,7 +67,6 @@ #include #include -#define PCPU_MIN_UNIT_PAGES 16 /* max alloc size in pages */ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -80,6 +79,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + bool immutable; /* no [de]population allowed */ struct page *page[]; /* #cpus * UNIT_PAGES */ }; @@ -521,6 +521,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, unsigned int last = num_possible_cpus() - 1; unsigned int cpu; + /* unmap must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + /* * Each flushing trial can be very expensive, issue flush on * the whole region at once rather than doing it for each cpu. 
@@ -602,6 +605,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) unsigned int cpu; int err; + /* map must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + for_each_possible_cpu(cpu) { err = map_kernel_range_noflush( pcpu_chunk_addr(chunk, cpu, page_start), @@ -727,8 +733,7 @@ void *__alloc_percpu(size_t size, size_t align) struct pcpu_chunk *chunk; int slot, off; - if (unlikely(!size || size > PCPU_MIN_UNIT_PAGES * PAGE_SIZE || - align > PAGE_SIZE)) { + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); return NULL; @@ -776,6 +781,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); static void pcpu_kill_chunk(struct pcpu_chunk *chunk) { + WARN_ON(chunk->immutable); pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); list_del(&chunk->list); rb_erase(&chunk->rb_node, &pcpu_addr_root); @@ -821,33 +827,73 @@ void free_percpu(void *ptr) EXPORT_SYMBOL_GPL(free_percpu); /** - * pcpu_setup_static - initialize kernel static percpu area - * @populate_pte_fn: callback to allocate pagetable - * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages - * @cpu_size: the size of static percpu area in bytes - * - * Initialize kernel static percpu area. The caller should allocate - * all the necessary pages and pass them in @pages. - * @populate_pte_fn() is called on each page to be used for percpu - * mapping and is responsible for making sure all the necessary page - * tables for the page is allocated. + * pcpu_setup_first_chunk - initialize the first percpu chunk + * @get_page_fn: callback to fetch page pointer + * @static_size: the size of static percpu area in bytes + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto + * @free_size: free size in bytes, 0 for auto + * @base_addr: mapped address, NULL for auto + * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * + * Initialize the first percpu chunk which contains the kernel static + * perpcu area. This function is to be called from arch percpu area + * setup path. The first two parameters are mandatory. The rest are + * optional. + * + * @get_page_fn() should return pointer to percpu page given cpu + * number and page number. It should at least return enough pages to + * cover the static area. The returned pages for static area should + * have been initialized with valid data. If @unit_size is specified, + * it can also return pages after the static area. NULL return + * indicates end of pages for the cpu. Note that @get_page_fn() must + * return the same number of pages for all cpus. + * + * @unit_size, if non-zero, determines unit size and must be aligned + * to PAGE_SIZE and equal to or larger than @static_size + @free_size. + * + * @free_size determines the number of free bytes after the static + * area in the first chunk. If zero, whatever left is available. + * Specifying non-zero value make percpu leave the area after + * @static_size + @free_size alone. + * + * Non-null @base_addr means that the caller already allocated virtual + * region for the first chunk and mapped it. percpu must not mess + * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL + * @populate_pte_fn doesn't make any sense. + * + * @populate_pte_fn is used to populate the pagetable. NULL means the + * caller already populated the pagetable. * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access. 
*/ -size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, - struct page **pages, size_t cpu_size) +size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t unit_size, + size_t free_size, void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct static_vm; struct pcpu_chunk *static_chunk; - int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE); unsigned int cpu; + int nr_pages; int err, i; - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_PAGES, PFN_UP(cpu_size)); + /* santiy checks */ + BUG_ON(!static_size); + BUG_ON(!unit_size && free_size); + BUG_ON(unit_size && unit_size < static_size + free_size); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(base_addr && !unit_size); + BUG_ON(base_addr && populate_pte_fn); - pcpu_static_size = cpu_size; + if (unit_size) + pcpu_unit_pages = unit_size >> PAGE_SHIFT; + else + pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, + PFN_UP(static_size)); + + pcpu_static_size = static_size; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -862,29 +908,66 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init and register vm area */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; - vm_area_register_early(&static_vm, PAGE_SIZE); - /* init static_chunk */ static_chunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&static_chunk->list); static_chunk->vm = &static_vm; - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + + if (free_size) + static_chunk->free_size = free_size; + else + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + static_chunk->contig_hint = static_chunk->free_size; - /* assign pages and map them */ + /* allocate vm address */ + static_vm.flags = VM_ALLOC; + static_vm.size = pcpu_chunk_size; + + if (!base_addr) + vm_area_register_early(&static_vm, PAGE_SIZE); + else { + /* + * Pages already mapped. No need to remap into + * vmalloc area. In this case the static chunk can't + * be mapped or unmapped by percpu and is marked + * immutable. 
+ */ + static_vm.addr = base_addr; + static_chunk->immutable = true; + } + + /* assign pages */ + nr_pages = -1; for_each_possible_cpu(cpu) { - for (i = 0; i < nr_cpu_pages; i++) { - *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++; - populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i)); + for (i = 0; i < pcpu_unit_pages; i++) { + struct page *page = get_page_fn(cpu, i); + + if (!page) + break; + *pcpu_chunk_pagep(static_chunk, cpu, i) = page; } + + BUG_ON(i < PFN_UP(pcpu_static_size)); + + if (nr_pages < 0) + nr_pages = i; + else + BUG_ON(nr_pages != i); } - err = pcpu_map(static_chunk, 0, nr_cpu_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", err); + /* map them */ + if (populate_pte_fn) { + for_each_possible_cpu(cpu) + for (i = 0; i < nr_pages; i++) + populate_pte_fn(pcpu_chunk_addr(static_chunk, + cpu, i)); + + err = pcpu_map(static_chunk, 0, nr_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", + err); + } /* link static_chunk in */ pcpu_chunk_relocate(static_chunk, -1); -- cgit v1.2.3-71-gd317 From d060ffc1840e37100628f520e66600c5ae483b44 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Tue, 24 Feb 2009 15:23:58 +0100 Subject: netfilter: install missing headers iptables imports headers from (the unifdefed headers of a) kernel tree, but some headers happened to not be installed. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/Kbuild | 6 ++++++ include/linux/netfilter_ipv6/Kbuild | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index deeaee5c83f2..947b47d7f6c0 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -14,8 +14,11 @@ header-y += xt_NFQUEUE.h header-y += xt_RATEEST.h header-y += xt_SECMARK.h header-y += xt_TCPMSS.h +header-y += xt_TCPOPTSTRIP.h +header-y += xt_TPROXY.h header-y += xt_comment.h header-y += xt_connbytes.h +header-y += xt_connlimit.h header-y += xt_connmark.h header-y += xt_conntrack.h header-y += xt_dccp.h @@ -31,6 +34,7 @@ header-y += xt_mark.h header-y += xt_multiport.h header-y += xt_owner.h header-y += xt_pkttype.h +header-y += xt_quota.h header-y += xt_rateest.h header-y += xt_realm.h header-y += xt_recent.h @@ -40,6 +44,8 @@ header-y += xt_statistic.h header-y += xt_string.h header-y += xt_tcpmss.h header-y += xt_tcpudp.h +header-y += xt_time.h +header-y += xt_u32.h unifdef-y += nf_conntrack_common.h unifdef-y += nf_conntrack_ftp.h diff --git a/include/linux/netfilter_ipv6/Kbuild b/include/linux/netfilter_ipv6/Kbuild index 8887a5fcd1d0..aca4bd1f6d7c 100644 --- a/include/linux/netfilter_ipv6/Kbuild +++ b/include/linux/netfilter_ipv6/Kbuild @@ -11,6 +11,7 @@ header-y += ip6t_length.h header-y += ip6t_limit.h header-y += ip6t_mac.h header-y += ip6t_mark.h +header-y += ip6t_mh.h header-y += ip6t_multiport.h header-y += ip6t_opts.h header-y += ip6t_owner.h -- cgit v1.2.3-71-gd317 From 7c37730cd31ddb2d3a1da142af9b18c29b8c433b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 12:07:53 -0500 Subject: tracing: add DEFINE_TRACE_FMT to tracepoint.h This patch creates a DEFINE_TRACE_FMT to map to DECLARE_TRACE. This allows for the developers to place format strings and args in with their tracepoint declaration. A tracer may now override the DEFINE_TRACE_FMT macro and use it to record a default format. 
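For illustration only, a tracepoint declared together with its default format
might look like the sketch below (the event name, prototype and format string
are made up for the example; with the mapping added in this patch the fmt
argument is simply dropped, and only a tracer that overrides DEFINE_TRACE_FMT
makes use of it):

------
#include <linux/tracepoint.h>

/*
 * Expands to DECLARE_TRACE(demo_free, ...) under the default definition;
 * the format string is carried along for tracers that want it.
 */
DEFINE_TRACE_FMT(demo_free,
	TPPROTO(unsigned long call_site, const void *ptr),
	TPARGS(call_site, ptr),
	"call_site=%lx ptr=%p");
------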
Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 757005458366..34ae464effff 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,4 +153,7 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } +#define DEFINE_TRACE_FMT(name, proto, args, fmt) \ + DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) + #endif -- cgit v1.2.3-71-gd317
From 1ce85fe402137824246bad03ff85f3913d565c17 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Feb 2009 23:18:28 -0800 Subject: netlink: change nlmsg_notify() return value logic This patch changes the return value of nlmsg_notify() as follows: If NETLINK_BROADCAST_ERROR is set by any of the listeners and an error in the delivery happened, return the broadcast error; else if there are no listeners apart from the socket that requested a change with the echo flag, return the result of the unicast notification. Thus, with this patch, the unicast notification is handled in the same way as a broadcast listener that has set the NETLINK_BROADCAST_ERROR socket flag. This patch is useful when the caller of nlmsg_notify() wants to know the result of the delivery of a netlink notification (including the broadcast delivery) and take action if the delivery failed. For example, ctnetlink can drop packets when the event delivery fails, providing reliable logging and state-synchronization at the cost of dropping packets. This patch also modifies the rtnetlink code to ignore the return value of rtnl_notify() in all callers. The function rtnl_notify() (before this patch) returned the error of the unicast notification, which made rtnl_set_sk_err() report errors to all listeners. This is not of any help since the origin of the change (the socket that requested the echoing) notices the ENOBUFS error if the notification fails and should resync itself. Signed-off-by: Pablo Neira Ayuso Acked-by: Patrick McHardy Signed-off-by: David S.
Miller --- include/linux/rtnetlink.h | 4 ++-- net/bridge/br_netlink.c | 3 ++- net/core/fib_rules.c | 3 ++- net/core/neighbour.c | 3 ++- net/core/rtnetlink.c | 9 +++++---- net/decnet/dn_dev.c | 3 ++- net/decnet/dn_table.c | 3 ++- net/ipv4/devinet.c | 3 ++- net/ipv4/fib_semantics.c | 5 +++-- net/ipv6/addrconf.c | 9 ++++++--- net/ipv6/ndisc.c | 6 +----- net/ipv6/route.c | 5 +++-- net/netlink/af_netlink.c | 14 ++++++++++---- net/phonet/pn_netlink.c | 5 +++-- 14 files changed, 45 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 1e5f6730ff31..35a07c830f79 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -622,8 +622,8 @@ static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str) extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); -extern int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags); +extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, + u32 group, struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index ba7be195803c..fcffb3fb1177 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -98,7 +98,8 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 32b3a0152d7a..98691e1466b8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -588,7 +588,8 @@ static void notify_rule_change(int event, struct fib_rule *rule, goto errout; } - err = rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, ops->nlgroup, err); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 278a142d1047..e1144cb94b99 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2534,7 +2534,8 @@ static void __neigh_notify(struct neighbour *n, int type, int flags) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 790dd205bb5d..d78030f88bd0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -455,8 +455,8 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) return nlmsg_unicast(rtnl, skb, pid); } -int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) +void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; int report = 0; @@ -464,7 +464,7 @@ int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, if (nlh) report = nlmsg_report(nlh); - return nlmsg_notify(rtnl, skb, pid, group, 
report, flags); + nlmsg_notify(rtnl, skb, pid, group, report, flags); } void rtnl_set_sk_err(struct net *net, u32 group, int error) @@ -1246,7 +1246,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index daf2b98b15fe..e457769bf7a7 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -769,7 +769,8 @@ static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err); diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 69ad9280c693..67054b0d550f 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -375,7 +375,8 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d519a6a66726..126bb911880f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1216,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4817dea3bc73..f831df500907 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, - info->nlh, GFP_KERNEL); + rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, + info->nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 03e2a1ad71e9..f8f76d6e21cb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3638,7 +3638,8 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); @@ -3849,7 +3850,8 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); @@ -3919,7 +3921,8 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + 
rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3cd83b85e9ef..9f061d1adbc2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1095,11 +1095,7 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) &ipv6_hdr(ra)->saddr); nlmsg_end(skb, nlh); - err = rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, - GFP_ATOMIC); - if (err < 0) - goto errout; - + rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); return; nla_put_failure: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c3d486a3edad..1394ddb6e35c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2400,8 +2400,9 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, - info->nlh, gfp_any()); + rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ed587be1e1c2..2760b62dc2c1 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1760,12 +1760,18 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, exclude_pid = pid; } - /* errors reported via destination sk->sk_err */ - nlmsg_multicast(sk, skb, exclude_pid, group, flags); + /* errors reported via destination sk->sk_err, but propagate + * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ + err = nlmsg_multicast(sk, skb, exclude_pid, group, flags); } - if (report) - err = nlmsg_unicast(sk, skb, pid); + if (report) { + int err2; + + err2 = nlmsg_unicast(sk, skb, pid); + if (!err || err == -ESRCH) + err = err2; + } return err; } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 1ceea1f92413..cec4e5951681 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -47,8 +47,9 @@ static void rtmsg_notify(int event, struct net_device *dev, u8 addr) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, dev_net(dev), 0, - RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + rtnl_notify(skb, dev_net(dev), 0, + RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); -- cgit v1.2.3-71-gd317 From 6e2756376c706e4da3454a272947983f92e80a7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 13:59:48 +0100 Subject: generic-ipi: remove CSD_FLAG_WAIT Oleg noticed that we don't strictly need CSD_FLAG_WAIT, rework the code so that we can use CSD_FLAG_LOCK for both purposes. Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Nick Piggin Cc: Jens Axboe Cc: "Paul E. 
McKenney" Cc: Rusty Russell Signed-off-by: Ingo Molnar --- block/blk-softirq.c | 2 +- include/linux/smp.h | 3 +- kernel/sched.c | 2 +- kernel/smp.c | 90 ++++++++++++++--------------------------------------- kernel/softirq.c | 2 +- 5 files changed, 28 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ce0efc6b26dc..ee9c21602228 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -64,7 +64,7 @@ static int raise_blk_irq(int cpu, struct request *rq) data->info = rq; data->flags = 0; - __smp_call_function_single(cpu, data); + __smp_call_function_single(cpu, data, 0); return 0; } diff --git a/include/linux/smp.h b/include/linux/smp.h index 715196b09d67..00866d7fdf34 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -82,7 +82,8 @@ smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, return 0; } -void __smp_call_function_single(int cpuid, struct call_single_data *data); +void __smp_call_function_single(int cpuid, struct call_single_data *data, + int wait); /* * Generic and arch helpers diff --git a/kernel/sched.c b/kernel/sched.c index 410eec404133..d4c2749a2998 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1093,7 +1093,7 @@ static void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { hrtimer_restart(timer); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; } } diff --git a/kernel/smp.c b/kernel/smp.c index 7a0ce25829dc..f5308258891a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,8 +23,7 @@ static struct { }; enum { - CSD_FLAG_WAIT = 0x01, - CSD_FLAG_LOCK = 0x02, + CSD_FLAG_LOCK = 0x01, }; struct call_function_data { @@ -94,31 +93,6 @@ static int __cpuinit init_call_single_data(void) } early_initcall(init_call_single_data); -/* - * csd_wait/csd_complete are used for synchronous ipi calls - */ -static void csd_wait_prepare(struct call_single_data *data) -{ - data->flags |= CSD_FLAG_WAIT; -} - -static void csd_complete(struct call_single_data *data) -{ - if (data->flags & CSD_FLAG_WAIT) { - /* - * ensure we're all done before saying we are - */ - smp_mb(); - data->flags &= ~CSD_FLAG_WAIT; - } -} - -static void csd_wait(struct call_single_data *data) -{ - while (data->flags & CSD_FLAG_WAIT) - cpu_relax(); -} - /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -126,10 +100,15 @@ static void csd_wait(struct call_single_data *data) * function call. For multi-cpu calls its even more interesting as we'll have * to ensure no other cpu is observing our csd. */ -static void csd_lock(struct call_single_data *data) +static void csd_lock_wait(struct call_single_data *data) { while (data->flags & CSD_FLAG_LOCK) cpu_relax(); +} + +static void csd_lock(struct call_single_data *data) +{ + csd_lock_wait(data); data->flags = CSD_FLAG_LOCK; /* @@ -155,11 +134,12 @@ static void csd_unlock(struct call_single_data *data) * Insert a previously allocated call_single_data element for execution * on the given CPU. data must already have ->func, ->info, and ->flags set. 
*/ -static void generic_exec_single(int cpu, struct call_single_data *data) +static +void generic_exec_single(int cpu, struct call_single_data *data, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - int wait = data->flags & CSD_FLAG_WAIT, ipi; unsigned long flags; + int ipi; spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); @@ -182,7 +162,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data) arch_send_call_function_single_ipi(cpu); if (wait) - csd_wait(data); + csd_lock_wait(data); } /* @@ -232,7 +212,6 @@ void generic_smp_call_function_interrupt(void) if (refs) continue; - csd_complete(&data->csd); csd_unlock(&data->csd); } @@ -270,9 +249,6 @@ void generic_smp_call_function_single_interrupt(void) data->func(data->info); - if (data_flags & CSD_FLAG_WAIT) - csd_complete(data); - /* * Unlocked CSDs are valid through generic_exec_single() */ @@ -313,36 +289,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, func(info); local_irq_restore(flags); } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data; + struct call_single_data *data = &d; - if (!wait) { - /* - * We are calling a function on a single CPU - * and we are not going to wait for it to finish. - * We use a per cpu data to pass the information to - * that CPU. Since all callers of this code will - * use the same data, we must synchronize the - * callers to prevent a new caller from corrupting - * the data before the callee can access it. - * - * The CSD_FLAG_LOCK is used to let us know when - * the IPI handler is done with the data. - * The first caller will set it, and the callee - * will clear it. The next caller must wait for - * it to clear before we set it again. This - * will make sure the callee is done with the - * data before a new caller will use it. - */ + if (!wait) data = &__get_cpu_var(csd_data); - csd_lock(data); - } else { - data = &d; - csd_wait_prepare(data); - } + + csd_lock(data); data->func = func; data->info = info; - generic_exec_single(cpu, data); + generic_exec_single(cpu, data, wait); } else { err = -ENXIO; /* CPU not online */ } @@ -362,12 +318,15 @@ EXPORT_SYMBOL(smp_call_function_single); * instance. * */ -void __smp_call_function_single(int cpu, struct call_single_data *data) +void __smp_call_function_single(int cpu, struct call_single_data *data, + int wait) { + csd_lock(data); + /* Can deadlock when called with interrupts disabled */ - WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); + WARN_ON(wait && irqs_disabled()); - generic_exec_single(cpu, data); + generic_exec_single(cpu, data, wait); } /* FIXME: Shim for archs using old arch_send_call_function_ipi API. 
*/ @@ -425,9 +384,6 @@ void smp_call_function_many(const struct cpumask *mask, csd_lock(&data->csd); spin_lock_irqsave(&data->lock, flags); - if (wait) - csd_wait_prepare(&data->csd); - data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); @@ -456,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask, /* optionally wait for the CPUs to complete */ if (wait) - csd_wait(&data->csd); + csd_lock_wait(&data->csd); } EXPORT_SYMBOL(smp_call_function_many); diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..48c3d5d627a8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir cp->flags = 0; cp->priv = softirq; - __smp_call_function_single(cpu, cp); + __smp_call_function_single(cpu, cp, 0); return 0; } return 1; -- cgit v1.2.3-71-gd317 From d2b0261506602bd969164879206027b30358ffdf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 14:36:45 +0100 Subject: alloc_percpu: fix UP build Impact: build fix the !SMP branch had a 'gfp' leftover: include/linux/percpu.h: In function '__alloc_percpu': include/linux/percpu.h:160: error: 'gfp' undeclared (first use in this function) include/linux/percpu.h:160: error: (Each undeclared identifier is reported only once include/linux/percpu.h:160: error: for each function it appears in.) Use GFP_KERNEL like the SMP version does. Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Ingo Molnar --- include/linux/percpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 910beb0abea2..d8e5a9abbce0 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -157,7 +157,7 @@ static inline void *__alloc_percpu(size_t size, size_t align) * percpu sections on SMP for which this path isn't used. */ WARN_ON_ONCE(align > __alignof__(unsigned long long)); - return kzalloc(size, gfp); + return kzalloc(size, GFP_KERNEL); } static inline void free_percpu(void *p) -- cgit v1.2.3-71-gd317 From 2b9d1496e7835a603c340e8f0dd81f4b74d5f248 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 15:48:43 +0100 Subject: time: ntp: make 64-bit constants more robust Impact: cleanup, no functionality changed - make PPM_SCALE an explicit s64 constant, to remove (s64) casts from usage sites. kernel/time/ntp.o: text data bss dec hex filename 2536 114 136 2786 ae2 ntp.o.before 2536 114 136 2786 ae2 ntp.o.after md5: 40a7728d1188aa18e83e21a81fa7b150 ntp.o.before.asm 40a7728d1188aa18e83e21a81fa7b150 ntp.o.after.asm Signed-off-by: Ingo Molnar --- include/linux/timex.h | 2 +- kernel/time/ntp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 998a55d80acf..aa3475fcff64 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -190,7 +190,7 @@ struct timex { * offset and maximum frequency tolerance. 
*/ #define SHIFT_USEC 16 /* frequency offset scale (shift) */ -#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) +#define PPM_SCALE ((s64)NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) #define PPM_SCALE_INV_SHIFT 19 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ PPM_SCALE + 1) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4346ed6e623f..7447d57e021a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -408,7 +408,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { - time_freq = (s64)txc->freq * PPM_SCALE; + time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); } @@ -505,7 +505,7 @@ int do_adjtimex(struct timex *txc) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); + PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; -- cgit v1.2.3-71-gd317
From e317603694bfd17b28a40de9d65e1a4ec12f816e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Feb 2009 10:54:17 +0900 Subject: percpu: fix too low alignment restriction on UP UP __alloc_percpu() triggered WARN_ON_ONCE() if the requested alignment is larger than that of unsigned long long, which is too small for all the cacheline aligned allocations. Bump it up to SMP_CACHE_BYTES, which kmalloc allocations generally guarantee. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- include/linux/percpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d8e5a9abbce0..545b068bcb70 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -156,7 +156,7 @@ static inline void *__alloc_percpu(size_t size, size_t align) * on it. Larger alignment should only be used for module * percpu sections on SMP for which this path isn't used. */ - WARN_ON_ONCE(align > __alignof__(unsigned long long)); + WARN_ON_ONCE(align > SMP_CACHE_BYTES); return kzalloc(size, GFP_KERNEL); } -- cgit v1.2.3-71-gd317
From eef62a6826b8ab530cefff5aa55c1661a209c803 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Feb 2009 15:49:52 -0500 Subject: tracing: rename DEFINE_TRACE_FMT to just TRACE_FORMAT There's been a bit of confusion as to whether DEFINE/DECLARE_TRACE_FMT should be a DEFINE or a DECLARE. Ingo Molnar suggested simply calling it TRACE_FORMAT.
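For illustration only (not part of the patch), the override pattern available to tracers is unchanged by the rename; the sketch below mirrors the kernel/trace/trace_events.h hunk further down, which redefines TRACE_FORMAT so that each declared tracepoint gets a printk-style event handler:

	/* Illustrative sketch: a tracer overrides TRACE_FORMAT before pulling
	 * in the event definitions, turning each declaration into a handler
	 * that records the default format string. */
	#undef TRACE_FORMAT
	#define TRACE_FORMAT(call, proto, args, fmt)			\
	static void ftrace_event_##call(proto)				\
	{								\
		event_trace_printk(_RET_IP_, "(" #call ") " fmt);	\
	}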
Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- include/trace/sched_event_types.h | 26 +++++++++++++------------- kernel/trace/trace_events.h | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 34ae464effff..3de09fa8e01d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,7 +153,7 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } -#define DEFINE_TRACE_FMT(name, proto, args, fmt) \ +#define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a4f662940f4e..a3d3d66a51c8 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,72 +1,72 @@ /* use instead */ -#ifndef DEFINE_TRACE_FMT +#ifndef TRACE_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. #endif -DEFINE_TRACE_FMT(sched_kthread_stop, +TRACE_FORMAT(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t), TPFMT("task %s:%d", t->comm, t->pid)); -DEFINE_TRACE_FMT(sched_kthread_stop_ret, +TRACE_FORMAT(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret), TPFMT("ret=%d", ret)); -DEFINE_TRACE_FMT(sched_wait_task, +TRACE_FORMAT(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_wakeup, +TRACE_FORMAT(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d %s", p->comm, p->pid, success?"succeeded":"failed")); -DEFINE_TRACE_FMT(sched_wakeup_new, +TRACE_FORMAT(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d", p->comm, p->pid, success?"succeeded":"failed")); -DEFINE_TRACE_FMT(sched_switch, +TRACE_FORMAT(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next), TPFMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid)); -DEFINE_TRACE_FMT(sched_migrate_task, +TRACE_FORMAT(sched_migrate_task, TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), TPARGS(p, orig_cpu, dest_cpu), TPFMT("task %s:%d from: %d to: %d", p->comm, p->pid, orig_cpu, dest_cpu)); -DEFINE_TRACE_FMT(sched_process_free, +TRACE_FORMAT(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_process_exit, +TRACE_FORMAT(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_process_wait, +TRACE_FORMAT(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid), TPFMT("pid %d", pid)); -DEFINE_TRACE_FMT(sched_process_fork, +TRACE_FORMAT(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child), TPFMT("parent %s:%d child %s:%d", parent->comm, parent->pid, child->comm, child->pid)); -DEFINE_TRACE_FMT(sched_signal_send, +TRACE_FORMAT(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p), TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h index cb8455b3ac9a..deb95e5006c8 100644 --- a/kernel/trace/trace_events.h +++ b/kernel/trace/trace_events.h @@ -17,8 +17,8 @@ struct ftrace_event_call { 
#undef TPFMT #define TPFMT(fmt, args...) fmt "\n", ##args -#undef DEFINE_TRACE_FMT -#define DEFINE_TRACE_FMT(call, proto, args, fmt) \ +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ -- cgit v1.2.3-71-gd317 From 3cdfdf91fcc77cfc82592e2b5c2ab35abe819c41 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Feb 2009 15:54:30 -0500 Subject: tracing: wrap arguments with PARAMS Peter Zijlstra warned that TPPROTO and TPARGS might become something other than a simple copy of itself. To prevent this from having side effects in the TRACE_FORMAT macro in tracepoint.h, we add a PARAMS() macro to be defined as just a wrapper. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 3de09fa8e01d..62d13391a240 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,7 +153,8 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } +#define PARAMS(args...) args #define TRACE_FORMAT(name, proto, args, fmt) \ - DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #endif -- cgit v1.2.3-71-gd317 From 14131f2f98ac350ee9e73faed916d2238a8b6a0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 18:47:11 +0100 Subject: tracing: implement trace_clock_*() APIs Impact: implement new tracing timestamp APIs Add three trace clock variants, with differing scalability/precision tradeoffs: - local: CPU-local trace clock - medium: scalable global clock with some jitter - global: globally monotonic, serialized clock Make the ring-buffer use the local trace clock internally. 
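For illustration only (not part of this patch), a tracer plugin could select among the new variants to trade timestamp coherency against overhead; the helper below is hypothetical, and only the three trace_clock_*() declarations come from the new include/linux/trace_clock.h:

	#include <linux/trace_clock.h>

	/* Hypothetical helper: pick a timestamp source to suit the tracer. */
	static u64 example_tracer_timestamp(int need_global_ordering)
	{
		if (need_global_ordering)
			return trace_clock_global(); /* serialized, highest overhead */

		return trace_clock_local();          /* per-CPU, fastest, may drift */
	}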
Acked-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/trace_clock.h | 19 +++++++++ kernel/trace/Makefile | 1 + kernel/trace/ring_buffer.c | 5 +-- kernel/trace/trace_clock.c | 101 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 include/linux/trace_clock.h create mode 100644 kernel/trace/trace_clock.c (limited to 'include/linux') diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h new file mode 100644 index 000000000000..7a8130384087 --- /dev/null +++ b/include/linux/trace_clock.h @@ -0,0 +1,19 @@ +#ifndef _LINUX_TRACE_CLOCK_H +#define _LINUX_TRACE_CLOCK_H + +/* + * 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + */ +#include +#include + +extern u64 notrace trace_clock_local(void); +extern u64 notrace trace_clock(void); +extern u64 notrace trace_clock_global(void); + +#endif /* _LINUX_TRACE_CLOCK_H */ diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 664b6c0dc75a..c931fe0560cb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8f19f1aa42b0..a8c275c01e83 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -12,7 +13,6 @@ #include #include #include -#include /* used for sched_clock() (for now) */ #include #include #include @@ -112,14 +112,13 @@ EXPORT_SYMBOL_GPL(tracing_is_on); /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -/* FIXME!!! */ u64 ring_buffer_time_stamp(int cpu) { u64 time; preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ - time = sched_clock() << DEBUG_SHIFT; + time = trace_clock_local() << DEBUG_SHIFT; preempt_enable_no_resched_notrace(); return time; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c new file mode 100644 index 000000000000..2d4953f93560 --- /dev/null +++ b/kernel/trace/trace_clock.c @@ -0,0 +1,101 @@ +/* + * tracing clocks + * + * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar + * + * Implements 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + * + * Tracer plugins will chose a default from these clocks. + */ +#include +#include +#include +#include +#include +#include + +/* + * trace_clock_local(): the simplest and least coherent tracing clock. + * + * Useful for tracing that does not cross to other CPUs nor + * does it go through idle events. + */ +u64 notrace trace_clock_local(void) +{ + /* + * sched_clock() is an architecture implemented, fast, scalable, + * lockless clock. It is not guaranteed to be coherent across + * CPUs, nor across CPU idle events. + */ + return sched_clock(); +} + +/* + * trace_clock(): 'inbetween' trace clock. 
Not completely serialized, + * but not completely incorrect when crossing CPUs either. + * + * This is based on cpu_clock(), which will allow at most ~1 jiffy of + * jitter between CPUs. So it's a pretty scalable clock, but there + * can be offsets in the trace data. + */ +u64 notrace trace_clock(void) +{ + return cpu_clock(raw_smp_processor_id()); +} + + +/* + * trace_clock_global(): special globally coherent trace clock + * + * It has higher overhead than the other trace clocks but is still + * an order of magnitude faster than GTOD derived hardware clocks. + * + * Used by plugins that need globally coherent timestamps. + */ + +static u64 prev_trace_clock_time; + +static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +u64 notrace trace_clock_global(void) +{ + unsigned long flags; + int this_cpu; + u64 now; + + raw_local_irq_save(flags); + + this_cpu = raw_smp_processor_id(); + now = cpu_clock(this_cpu); + /* + * If in an NMI context then dont risk lockups and return the + * cpu_clock() time: + */ + if (unlikely(in_nmi())) + goto out; + + __raw_spin_lock(&trace_clock_lock); + + /* + * TODO: if this happens often then maybe we should reset + * my_scd->clock to prev_trace_clock_time+1, to make sure + * we start ticking with the local clock from now on? + */ + if ((s64)(now - prev_trace_clock_time) < 0) + now = prev_trace_clock_time + 1; + + prev_trace_clock_time = now; + + __raw_spin_unlock(&trace_clock_lock); + + out: + raw_local_irq_restore(flags); + + return now; +} -- cgit v1.2.3-71-gd317 From b342501cd31e5546d0c9ca8ceff5ded1832f9e5b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 20:20:29 +0100 Subject: sched: allow architectures to specify sched_clock_stable Allow CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures to still specify that their sched_clock() implementation is reliable. This will be used by x86 to switch on a faster sched_clock_cpu() implementation on certain CPU types. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 ++++++++++ kernel/sched_clock.c | 45 ++++++++++++++++++++------------------------- 2 files changed, 30 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8981e52c714f..a063d19b7a7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1670,6 +1670,16 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) return set_cpus_allowed_ptr(p, &new_mask); } +/* + * Architectures can set this to 1 if they have specified + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, + * but then during bootup it turns out that sched_clock() + * is reliable after all: + */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +extern int sched_clock_stable; +#endif + extern unsigned long long sched_clock(void); extern void sched_clock_init(void); diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index a0b0852414cc..a755d023805a 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -24,11 +24,11 @@ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat * consistent between cpus (never more than 2 jiffies difference). */ -#include -#include #include -#include #include +#include +#include +#include /* * Scheduler clock - returns current time in nanosec units. 
@@ -43,6 +43,10 @@ unsigned long long __attribute__((weak)) sched_clock(void) static __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__read_mostly int sched_clock_stable; +#else +static const int sched_clock_stable = 1; +#endif struct sched_clock_data { /* @@ -87,7 +91,7 @@ void sched_clock_init(void) } /* - * min,max except they take wrapping into account + * min, max except they take wrapping into account */ static inline u64 wrap_min(u64 x, u64 y) @@ -116,10 +120,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(delta < 0)) delta = 0; + if (unlikely(!sched_clock_running)) + return 0ull; + /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); */ clock = scd->tick_gtod + delta; @@ -148,12 +155,13 @@ static void lock_double_clock(struct sched_clock_data *data1, u64 sched_clock_cpu(int cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); u64 now, clock, this_clock, remote_clock; + struct sched_clock_data *scd; - if (unlikely(!sched_clock_running)) - return 0ull; + if (sched_clock_stable) + return sched_clock(); + scd = cpu_sdc(cpu); WARN_ON_ONCE(!irqs_disabled()); now = sched_clock(); @@ -193,6 +201,8 @@ u64 sched_clock_cpu(int cpu) return clock; } +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); @@ -235,22 +245,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - -u64 sched_clock_cpu(int cpu) -{ - if (unlikely(!sched_clock_running)) - return 0; - - return sched_clock(); -} - -#endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) { -- cgit v1.2.3-71-gd317
From b233b28eac0cc37d07c2d007ea08c86c778c5af4 Mon Sep 17 00:00:00 2001 From: Adrian McMenamin Date: Fri, 27 Feb 2009 16:07:32 +0900 Subject: sh: maple: Support block reads and writes. This patch updates the maple bus to support asynchronous block reads and writes as well as generally improving the quality of the code and supporting concurrency (all needed to support the Dreamcast visual memory unit - a driver will also be posted for that). Changes in the bus driver necessitate some changes in the two maple bus input drivers that are currently in mainline. As well as supporting block reads and writes, this code cleanup removes some poor handling of locks, uses an atomic status variable to serialise access to devices and more robustly handles the general performance problems of the bus.
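For illustration only (not part of the patch), the sketch below shows how a client driver might submit a block read and block until the bus has answered under the new scheme; the function is hypothetical, MAPLE_COMMAND_BREAD is assumed to be the block-read opcode from include/linux/maple.h, and the completion test is inferred from the atomic busy flag and the wake_up() calls added to maple_dma_handler() in this patch:

	/* Hypothetical client-side sketch of a blocking block read. */
	static int example_block_read(struct maple_device *mdev, u32 function,
				      void *sendbuf, size_t words)
	{
		int error;

		/* Mark the device busy, as the bus core does before DEVINFO. */
		atomic_set(&mdev->busy, 1);

		error = maple_add_packet(mdev, function, MAPLE_COMMAND_BREAD,
					 words, sendbuf);
		if (error) {
			atomic_set(&mdev->busy, 0);
			return error;
		}

		/* The DMA bottom half clears mdev->busy and wakes maple_wait
		 * once MAPLE_RESPONSE_DATATRF (or an error) comes back. */
		wait_event_interruptible_timeout(mdev->maple_wait,
					atomic_read(&mdev->busy) == 0, HZ);
		return 0;
	}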
Signed-off-by: Adrian McMenamin Signed-off-by: Paul Mundt --- drivers/input/joystick/maplecontrol.c | 4 +- drivers/input/keyboard/maple_keyb.c | 37 +-- drivers/sh/maple/maple.c | 463 +++++++++++++++++----------------- include/linux/maple.h | 62 +++-- 4 files changed, 300 insertions(+), 266 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/joystick/maplecontrol.c b/drivers/input/joystick/maplecontrol.c index e50047bfe938..77cfde571bd9 100644 --- a/drivers/input/joystick/maplecontrol.c +++ b/drivers/input/joystick/maplecontrol.c @@ -3,7 +3,7 @@ * Based on drivers/usb/iforce.c * * Copyright Yaegashi Takeshi, 2001 - * Adrian McMenamin, 2008 + * Adrian McMenamin, 2008 - 2009 */ #include @@ -29,7 +29,7 @@ static void dc_pad_callback(struct mapleq *mq) struct maple_device *mapledev = mq->dev; struct dc_pad *pad = maple_get_drvdata(mapledev); struct input_dev *dev = pad->dev; - unsigned char *res = mq->recvbuf; + unsigned char *res = mq->recvbuf->buf; buttons = ~le16_to_cpup((__le16 *)(res + 8)); diff --git a/drivers/input/keyboard/maple_keyb.c b/drivers/input/keyboard/maple_keyb.c index 22f17a593be7..5aa2361aef95 100644 --- a/drivers/input/keyboard/maple_keyb.c +++ b/drivers/input/keyboard/maple_keyb.c @@ -1,8 +1,8 @@ /* * SEGA Dreamcast keyboard driver * Based on drivers/usb/usbkbd.c - * Copyright YAEGASHI Takeshi, 2001 - * Porting to 2.6 Copyright Adrian McMenamin, 2007, 2008 + * Copyright (c) YAEGASHI Takeshi, 2001 + * Porting to 2.6 Copyright (c) Adrian McMenamin, 2007 - 2009 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,7 +33,7 @@ static DEFINE_MUTEX(maple_keyb_mutex); #define NR_SCANCODES 256 -MODULE_AUTHOR("YAEGASHI Takeshi, Adrian McMenamin"); +MODULE_AUTHOR("Adrian McMenamin dev, "Unknown key (scancode %#x) released.", code); } @@ -127,7 +127,7 @@ static void dc_scan_kbd(struct dc_kbd *kbd) input_event(dev, EV_MSC, MSC_SCAN, code); input_report_key(dev, keycode, 1); } else - printk(KERN_DEBUG "maple_keyb: " + dev_dbg(&dev->dev, "Unknown key (scancode %#x) pressed.", code); } @@ -140,7 +140,7 @@ static void dc_kbd_callback(struct mapleq *mq) { struct maple_device *mapledev = mq->dev; struct dc_kbd *kbd = maple_get_drvdata(mapledev); - unsigned long *buf = mq->recvbuf; + unsigned long *buf = (unsigned long *)(mq->recvbuf->buf); /* * We should always get the lock because the only @@ -159,22 +159,27 @@ static void dc_kbd_callback(struct mapleq *mq) static int probe_maple_kbd(struct device *dev) { - struct maple_device *mdev = to_maple_dev(dev); - struct maple_driver *mdrv = to_maple_driver(dev->driver); + struct maple_device *mdev; + struct maple_driver *mdrv; int i, error; struct dc_kbd *kbd; struct input_dev *idev; - if (!(mdev->function & MAPLE_FUNC_KEYBOARD)) - return -EINVAL; + mdev = to_maple_dev(dev); + mdrv = to_maple_driver(dev->driver); kbd = kzalloc(sizeof(struct dc_kbd), GFP_KERNEL); - idev = input_allocate_device(); - if (!kbd || !idev) { + if (!kbd) { error = -ENOMEM; goto fail; } + idev = input_allocate_device(); + if (!idev) { + error = -ENOMEM; + goto fail_idev_alloc; + } + kbd->dev = idev; memcpy(kbd->keycode, dc_kbd_keycode, sizeof(kbd->keycode)); @@ -195,7 +200,7 @@ static int probe_maple_kbd(struct device *dev) error = input_register_device(idev); if (error) - goto fail; + goto fail_register; /* Maple polling is locked to VBLANK - which may be just 50/s */ maple_getcond_callback(mdev, dc_kbd_callback, HZ/50, @@ -207,10 +212,12 @@ static int 
probe_maple_kbd(struct device *dev) return error; -fail: +fail_register: + maple_set_drvdata(mdev, NULL); input_free_device(idev); +fail_idev_alloc: kfree(kbd); - maple_set_drvdata(mdev, NULL); +fail: return error; } diff --git a/drivers/sh/maple/maple.c b/drivers/sh/maple/maple.c index 63f0de29aa14..4054fe93d6e4 100644 --- a/drivers/sh/maple/maple.c +++ b/drivers/sh/maple/maple.c @@ -1,16 +1,10 @@ /* * Core maple bus functionality * - * Copyright (C) 2007, 2008 Adrian McMenamin + * Copyright (C) 2007 - 2009 Adrian McMenamin * Copyright (C) 2001 - 2008 Paul Mundt - * - * Based on 2.4 code by: - * - * Copyright (C) 2000-2001 YAEGASHI Takeshi + * Copyright (C) 2000 - 2001 YAEGASHI Takeshi * Copyright (C) 2001 M. R. Brown - * Copyright (C) 2001 Paul Mundt - * - * and others. * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -32,7 +26,7 @@ #include #include -MODULE_AUTHOR("Yaegashi Takeshi, Paul Mundt, M. R. Brown, Adrian McMenamin"); +MODULE_AUTHOR("Adrian McMenamin "); MODULE_DESCRIPTION("Maple bus driver for Dreamcast"); MODULE_LICENSE("GPL v2"); MODULE_SUPPORTED_DEVICE("{{SEGA, Dreamcast/Maple}}"); @@ -49,7 +43,7 @@ static LIST_HEAD(maple_sentq); /* mutex to protect queue of waiting packets */ static DEFINE_MUTEX(maple_wlist_lock); -static struct maple_driver maple_dummy_driver; +static struct maple_driver maple_unsupported_device; static struct device maple_bus; static int subdevice_map[MAPLE_PORTS]; static unsigned long *maple_sendbuf, *maple_sendptr, *maple_lastptr; @@ -62,8 +56,9 @@ struct maple_device_specify { int unit; }; -static bool checked[4]; -static struct maple_device *baseunits[4]; +static bool checked[MAPLE_PORTS]; +static bool empty[MAPLE_PORTS]; +static struct maple_device *baseunits[MAPLE_PORTS]; /** * maple_driver_register - register a maple driver @@ -97,12 +92,20 @@ void maple_driver_unregister(struct maple_driver *drv) EXPORT_SYMBOL_GPL(maple_driver_unregister); /* set hardware registers to enable next round of dma */ -static void maplebus_dma_reset(void) +static void maple_dma_reset(void) { ctrl_outl(MAPLE_MAGIC, MAPLE_RESET); /* set trig type to 0 for software trigger, 1 for hardware (VBLANK) */ ctrl_outl(1, MAPLE_TRIGTYPE); - ctrl_outl(MAPLE_2MBPS | MAPLE_TIMEOUT(50000), MAPLE_SPEED); + /* + * Maple system register + * bits 31 - 16 timeout in units of 20nsec + * bit 12 hard trigger - set 0 to keep responding to VBLANK + * bits 9 - 8 set 00 for 2 Mbps, 01 for 1 Mbps + * bits 3 - 0 delay (in 1.3ms) between VBLANK and start of DMA + * max delay is 11 + */ + ctrl_outl(MAPLE_2MBPS | MAPLE_TIMEOUT(0xFFFF), MAPLE_SPEED); ctrl_outl(PHYSADDR(maple_sendbuf), MAPLE_DMAADDR); ctrl_outl(1, MAPLE_ENABLE); } @@ -134,21 +137,16 @@ static void maple_release_device(struct device *dev) { struct maple_device *mdev; struct mapleq *mq; - if (!dev) - return; + mdev = to_maple_dev(dev); mq = mdev->mq; - if (mq) { - if (mq->recvbufdcsp) - kmem_cache_free(maple_queue_cache, mq->recvbufdcsp); - kfree(mq); - mq = NULL; - } + kmem_cache_free(maple_queue_cache, mq->recvbuf); + kfree(mq); kfree(mdev); } /** - * maple_add_packet - add a single instruction to the queue + * maple_add_packet - add a single instruction to the maple bus queue * @mdev: maple device * @function: function on device being queried * @command: maple command to add @@ -158,68 +156,12 @@ static void maple_release_device(struct device *dev) int maple_add_packet(struct maple_device *mdev, u32 function, u32 command, size_t 
length, void *data) { - int locking, ret = 0; + int ret = 0; void *sendbuf = NULL; - mutex_lock(&maple_wlist_lock); - /* bounce if device already locked */ - locking = mutex_is_locked(&mdev->mq->mutex); - if (locking) { - ret = -EBUSY; - goto out; - } - - mutex_lock(&mdev->mq->mutex); - if (length) { - sendbuf = kmalloc(length * 4, GFP_KERNEL); + sendbuf = kzalloc(length * 4, GFP_KERNEL); if (!sendbuf) { - mutex_unlock(&mdev->mq->mutex); - ret = -ENOMEM; - goto out; - } - ((__be32 *)sendbuf)[0] = cpu_to_be32(function); - } - - mdev->mq->command = command; - mdev->mq->length = length; - if (length > 1) - memcpy(sendbuf + 4, data, (length - 1) * 4); - mdev->mq->sendbuf = sendbuf; - - list_add(&mdev->mq->list, &maple_waitq); -out: - mutex_unlock(&maple_wlist_lock); - return ret; -} -EXPORT_SYMBOL_GPL(maple_add_packet); - -/** - * maple_add_packet_sleeps - add a single instruction to the queue - * @mdev: maple device - * @function: function on device being queried - * @command: maple command to add - * @length: length of command string (in 32 bit words) - * @data: remainder of command string - * - * Same as maple_add_packet(), but waits for the lock to become free. - */ -int maple_add_packet_sleeps(struct maple_device *mdev, u32 function, - u32 command, size_t length, void *data) -{ - int locking, ret = 0; - void *sendbuf = NULL; - - locking = mutex_lock_interruptible(&mdev->mq->mutex); - if (locking) { - ret = -EIO; - goto out; - } - - if (length) { - sendbuf = kmalloc(length * 4, GFP_KERNEL); - if (!sendbuf) { - mutex_unlock(&mdev->mq->mutex); ret = -ENOMEM; goto out; } @@ -233,38 +175,35 @@ int maple_add_packet_sleeps(struct maple_device *mdev, u32 function, mdev->mq->sendbuf = sendbuf; mutex_lock(&maple_wlist_lock); - list_add(&mdev->mq->list, &maple_waitq); + list_add_tail(&mdev->mq->list, &maple_waitq); mutex_unlock(&maple_wlist_lock); out: return ret; } -EXPORT_SYMBOL_GPL(maple_add_packet_sleeps); +EXPORT_SYMBOL_GPL(maple_add_packet); static struct mapleq *maple_allocq(struct maple_device *mdev) { struct mapleq *mq; - mq = kmalloc(sizeof(*mq), GFP_KERNEL); + mq = kzalloc(sizeof(*mq), GFP_KERNEL); if (!mq) goto failed_nomem; + INIT_LIST_HEAD(&mq->list); mq->dev = mdev; - mq->recvbufdcsp = kmem_cache_zalloc(maple_queue_cache, GFP_KERNEL); - mq->recvbuf = (void *) P2SEGADDR(mq->recvbufdcsp); + mq->recvbuf = kmem_cache_zalloc(maple_queue_cache, GFP_KERNEL); if (!mq->recvbuf) goto failed_p2; - /* - * most devices do not need the mutex - but - * anything that injects block reads or writes - * will rely on it - */ - mutex_init(&mq->mutex); + mq->recvbuf->buf = &((mq->recvbuf->bufx)[0]); return mq; failed_p2: kfree(mq); failed_nomem: + dev_err(&mdev->dev, "could not allocate memory for device (%d, %d)\n", + mdev->port, mdev->unit); return NULL; } @@ -272,12 +211,16 @@ static struct maple_device *maple_alloc_dev(int port, int unit) { struct maple_device *mdev; + /* zero this out to avoid kobj subsystem + * thinking it has already been registered */ + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return NULL; mdev->port = port; mdev->unit = unit; + mdev->mq = maple_allocq(mdev); if (!mdev->mq) { @@ -286,19 +229,14 @@ static struct maple_device *maple_alloc_dev(int port, int unit) } mdev->dev.bus = &maple_bus_type; mdev->dev.parent = &maple_bus; + init_waitqueue_head(&mdev->maple_wait); return mdev; } static void maple_free_dev(struct maple_device *mdev) { - if (!mdev) - return; - if (mdev->mq) { - if (mdev->mq->recvbufdcsp) - kmem_cache_free(maple_queue_cache, - mdev->mq->recvbufdcsp); - 
kfree(mdev->mq); - } + kmem_cache_free(maple_queue_cache, mdev->mq->recvbuf); + kfree(mdev->mq); kfree(mdev); } @@ -320,7 +258,7 @@ static void maple_build_block(struct mapleq *mq) maple_lastptr = maple_sendptr; *maple_sendptr++ = (port << 16) | len | 0x80000000; - *maple_sendptr++ = PHYSADDR(mq->recvbuf); + *maple_sendptr++ = PHYSADDR(mq->recvbuf->buf); *maple_sendptr++ = mq->command | (to << 8) | (from << 16) | (len << 24); while (len-- > 0) @@ -333,20 +271,28 @@ static void maple_send(void) int i, maple_packets = 0; struct mapleq *mq, *nmq; - if (!list_empty(&maple_sentq)) + if (!maple_dma_done()) return; + + /* disable DMA */ + ctrl_outl(0, MAPLE_ENABLE); + + if (!list_empty(&maple_sentq)) + goto finish; + mutex_lock(&maple_wlist_lock); - if (list_empty(&maple_waitq) || !maple_dma_done()) { + if (list_empty(&maple_waitq)) { mutex_unlock(&maple_wlist_lock); - return; + goto finish; } - mutex_unlock(&maple_wlist_lock); + maple_lastptr = maple_sendbuf; maple_sendptr = maple_sendbuf; - mutex_lock(&maple_wlist_lock); + list_for_each_entry_safe(mq, nmq, &maple_waitq, list) { maple_build_block(mq); - list_move(&mq->list, &maple_sentq); + list_del_init(&mq->list); + list_add_tail(&mq->list, &maple_sentq); if (maple_packets++ > MAPLE_MAXPACKETS) break; } @@ -356,10 +302,13 @@ static void maple_send(void) dma_cache_sync(0, maple_sendbuf + i * PAGE_SIZE, PAGE_SIZE, DMA_BIDIRECTIONAL); } + +finish: + maple_dma_reset(); } /* check if there is a driver registered likely to match this device */ -static int check_matching_maple_driver(struct device_driver *driver, +static int maple_check_matching_driver(struct device_driver *driver, void *devptr) { struct maple_driver *maple_drv; @@ -374,10 +323,7 @@ static int check_matching_maple_driver(struct device_driver *driver, static void maple_detach_driver(struct maple_device *mdev) { - if (!mdev) - return; device_unregister(&mdev->dev); - mdev = NULL; } /* process initial MAPLE_COMMAND_DEVINFO for each device or port */ @@ -385,9 +331,9 @@ static void maple_attach_driver(struct maple_device *mdev) { char *p, *recvbuf; unsigned long function; - int matched, retval; + int matched, error; - recvbuf = mdev->mq->recvbuf; + recvbuf = mdev->mq->recvbuf->buf; /* copy the data as individual elements in * case of memory optimisation */ memcpy(&mdev->devinfo.function, recvbuf + 4, 4); @@ -395,7 +341,6 @@ static void maple_attach_driver(struct maple_device *mdev) memcpy(&mdev->devinfo.area_code, recvbuf + 20, 1); memcpy(&mdev->devinfo.connector_direction, recvbuf + 21, 1); memcpy(&mdev->devinfo.product_name[0], recvbuf + 22, 30); - memcpy(&mdev->devinfo.product_licence[0], recvbuf + 52, 60); memcpy(&mdev->devinfo.standby_power, recvbuf + 112, 2); memcpy(&mdev->devinfo.max_power, recvbuf + 114, 2); memcpy(mdev->product_name, mdev->devinfo.product_name, 30); @@ -414,43 +359,41 @@ static void maple_attach_driver(struct maple_device *mdev) else break; - printk(KERN_INFO "Maple device detected: %s\n", - mdev->product_name); - printk(KERN_INFO "Maple device: %s\n", mdev->product_licence); - function = be32_to_cpu(mdev->devinfo.function); + dev_info(&mdev->dev, "detected %s: function 0x%lX: at (%d, %d)\n", + mdev->product_name, function, mdev->port, mdev->unit); + if (function > 0x200) { /* Do this silently - as not a real device */ function = 0; - mdev->driver = &maple_dummy_driver; + mdev->driver = &maple_unsupported_device; sprintf(mdev->dev.bus_id, "%d:0.port", mdev->port); + } else { - printk(KERN_INFO - "Maple bus at (%d, %d): Function 0x%lX\n", - mdev->port, 
mdev->unit, function); matched = bus_for_each_drv(&maple_bus_type, NULL, mdev, - check_matching_maple_driver); + maple_check_matching_driver); if (matched == 0) { /* Driver does not exist yet */ - printk(KERN_INFO - "No maple driver found.\n"); - mdev->driver = &maple_dummy_driver; + dev_info(&mdev->dev, "no driver found\n"); + mdev->driver = &maple_unsupported_device; } sprintf(mdev->dev.bus_id, "%d:0%d.%lX", mdev->port, mdev->unit, function); } + mdev->function = function; mdev->dev.release = &maple_release_device; - retval = device_register(&mdev->dev); - if (retval) { - printk(KERN_INFO - "Maple bus: Attempt to register device" - " (%x, %x) failed.\n", - mdev->port, mdev->unit); + + atomic_set(&mdev->busy, 0); + error = device_register(&mdev->dev); + if (error) { + dev_warn(&mdev->dev, "could not register device at" + " (%d, %d), with error 0x%X\n", mdev->unit, + mdev->port, error); maple_free_dev(mdev); mdev = NULL; return; @@ -462,7 +405,7 @@ static void maple_attach_driver(struct maple_device *mdev) * port and unit then return 1 - allows identification * of which devices need to be attached or detached */ -static int detach_maple_device(struct device *device, void *portptr) +static int check_maple_device(struct device *device, void *portptr) { struct maple_device_specify *ds; struct maple_device *mdev; @@ -477,21 +420,25 @@ static int detach_maple_device(struct device *device, void *portptr) static int setup_maple_commands(struct device *device, void *ignored) { int add; - struct maple_device *maple_dev = to_maple_dev(device); - - if ((maple_dev->interval > 0) - && time_after(jiffies, maple_dev->when)) { - /* bounce if we cannot lock */ - add = maple_add_packet(maple_dev, - be32_to_cpu(maple_dev->devinfo.function), + struct maple_device *mdev = to_maple_dev(device); + if (mdev->interval > 0 && atomic_read(&mdev->busy) == 0 && + time_after(jiffies, mdev->when)) { + /* bounce if we cannot add */ + add = maple_add_packet(mdev, + be32_to_cpu(mdev->devinfo.function), MAPLE_COMMAND_GETCOND, 1, NULL); if (!add) - maple_dev->when = jiffies + maple_dev->interval; + mdev->when = jiffies + mdev->interval; } else { if (time_after(jiffies, maple_pnp_time)) - /* This will also bounce */ - maple_add_packet(maple_dev, 0, - MAPLE_COMMAND_DEVINFO, 0, NULL); + /* Ensure we don't have block reads and devinfo + * calls interfering with one another - so flag the + * device as busy */ + if (atomic_read(&mdev->busy) == 0) { + atomic_set(&mdev->busy, 1); + maple_add_packet(mdev, 0, + MAPLE_COMMAND_DEVINFO, 0, NULL); + } } return 0; } @@ -499,29 +446,50 @@ static int setup_maple_commands(struct device *device, void *ignored) /* VBLANK bottom half - implemented via workqueue */ static void maple_vblank_handler(struct work_struct *work) { - if (!list_empty(&maple_sentq) || !maple_dma_done()) + int x, locking; + struct maple_device *mdev; + + if (!maple_dma_done()) return; ctrl_outl(0, MAPLE_ENABLE); + if (!list_empty(&maple_sentq)) + goto finish; + + /* + * Set up essential commands - to fetch data and + * check devices are still present + */ bus_for_each_dev(&maple_bus_type, NULL, NULL, - setup_maple_commands); + setup_maple_commands); + + if (time_after(jiffies, maple_pnp_time)) { + /* + * Scan the empty ports - bus is flakey and may have + * mis-reported emptyness + */ + for (x = 0; x < MAPLE_PORTS; x++) { + if (checked[x] && empty[x]) { + mdev = baseunits[x]; + if (!mdev) + break; + atomic_set(&mdev->busy, 1); + locking = maple_add_packet(mdev, 0, + MAPLE_COMMAND_DEVINFO, 0, NULL); + if (!locking) + break; 
+ } + } - if (time_after(jiffies, maple_pnp_time)) maple_pnp_time = jiffies + MAPLE_PNP_INTERVAL; - - mutex_lock(&maple_wlist_lock); - if (!list_empty(&maple_waitq) && list_empty(&maple_sentq)) { - mutex_unlock(&maple_wlist_lock); - maple_send(); - } else { - mutex_unlock(&maple_wlist_lock); } - maplebus_dma_reset(); +finish: + maple_send(); } -/* handle devices added via hotplugs - placing them on queue for DEVINFO*/ +/* handle devices added via hotplugs - placing them on queue for DEVINFO */ static void maple_map_subunits(struct maple_device *mdev, int submask) { int retval, k, devcheck; @@ -533,7 +501,7 @@ static void maple_map_subunits(struct maple_device *mdev, int submask) ds.unit = k + 1; retval = bus_for_each_dev(&maple_bus_type, NULL, &ds, - detach_maple_device); + check_maple_device); if (retval) { submask = submask >> 1; continue; @@ -543,6 +511,7 @@ static void maple_map_subunits(struct maple_device *mdev, int submask) mdev_add = maple_alloc_dev(mdev->port, k + 1); if (!mdev_add) return; + atomic_set(&mdev_add->busy, 1); maple_add_packet(mdev_add, 0, MAPLE_COMMAND_DEVINFO, 0, NULL); /* mark that we are checking sub devices */ @@ -564,27 +533,45 @@ static void maple_clean_submap(struct maple_device *mdev) } /* handle empty port or hotplug removal */ -static void maple_response_none(struct maple_device *mdev, - struct mapleq *mq) -{ - if (mdev->unit != 0) { - list_del(&mq->list); - maple_clean_submap(mdev); - printk(KERN_INFO - "Maple bus device detaching at (%d, %d)\n", - mdev->port, mdev->unit); +static void maple_response_none(struct maple_device *mdev) +{ + maple_clean_submap(mdev); + + if (likely(mdev->unit != 0)) { + /* + * Block devices play up + * and give the impression they have + * been removed even when still in place or + * trip the mtd layer when they have + * really gone - this code traps that eventuality + * and ensures we aren't overloaded with useless + * error messages + */ + if (mdev->can_unload) { + if (!mdev->can_unload(mdev)) { + atomic_set(&mdev->busy, 2); + wake_up(&mdev->maple_wait); + return; + } + } + + dev_info(&mdev->dev, "detaching device at (%d, %d)\n", + mdev->port, mdev->unit); maple_detach_driver(mdev); return; - } - if (!started || !fullscan) { - if (checked[mdev->port] == false) { - checked[mdev->port] = true; - printk(KERN_INFO "No maple devices attached" - " to port %d\n", mdev->port); + } else { + if (!started || !fullscan) { + if (checked[mdev->port] == false) { + checked[mdev->port] = true; + empty[mdev->port] = true; + dev_info(&mdev->dev, "no devices" + " to port %d\n", mdev->port); + } + return; } - return; } - maple_clean_submap(mdev); + /* Some hardware devices generate false detach messages on unit 0 */ + atomic_set(&mdev->busy, 0); } /* preprocess hotplugs or scans */ @@ -599,8 +586,11 @@ static void maple_response_devinfo(struct maple_device *mdev, } else { if (mdev->unit != 0) maple_attach_driver(mdev); + if (mdev->unit == 0) { + empty[mdev->port] = false; + maple_attach_driver(mdev); + } } - return; } if (mdev->unit == 0) { submask = recvbuf[2] & 0x1F; @@ -611,6 +601,17 @@ static void maple_response_devinfo(struct maple_device *mdev, } } +static void maple_response_fileerr(struct maple_device *mdev, void *recvbuf) +{ + if (mdev->fileerr_handler) { + mdev->fileerr_handler(mdev, recvbuf); + return; + } else + dev_warn(&mdev->dev, "device at (%d, %d) reports" + "file error 0x%X\n", mdev->port, mdev->unit, + ((int *)recvbuf)[1]); +} + static void maple_port_rescan(void) { int i; @@ -621,12 +622,6 @@ static void 
maple_port_rescan(void) if (checked[i] == false) { fullscan = 0; mdev = baseunits[i]; - /* - * test lock in case scan has failed - * but device is still locked - */ - if (mutex_is_locked(&mdev->mq->mutex)) - mutex_unlock(&mdev->mq->mutex); maple_add_packet(mdev, 0, MAPLE_COMMAND_DEVINFO, 0, NULL); } @@ -637,7 +632,7 @@ static void maple_port_rescan(void) static void maple_dma_handler(struct work_struct *work) { struct mapleq *mq, *nmq; - struct maple_device *dev; + struct maple_device *mdev; char *recvbuf; enum maple_code code; @@ -646,43 +641,56 @@ static void maple_dma_handler(struct work_struct *work) ctrl_outl(0, MAPLE_ENABLE); if (!list_empty(&maple_sentq)) { list_for_each_entry_safe(mq, nmq, &maple_sentq, list) { - recvbuf = mq->recvbuf; + mdev = mq->dev; + recvbuf = mq->recvbuf->buf; + dma_cache_sync(&mdev->dev, recvbuf, 0x400, + DMA_FROM_DEVICE); code = recvbuf[0]; - dev = mq->dev; kfree(mq->sendbuf); - mutex_unlock(&mq->mutex); list_del_init(&mq->list); - switch (code) { case MAPLE_RESPONSE_NONE: - maple_response_none(dev, mq); + maple_response_none(mdev); break; case MAPLE_RESPONSE_DEVINFO: - maple_response_devinfo(dev, recvbuf); + maple_response_devinfo(mdev, recvbuf); + atomic_set(&mdev->busy, 0); break; case MAPLE_RESPONSE_DATATRF: - if (dev->callback) - dev->callback(mq); + if (mdev->callback) + mdev->callback(mq); + atomic_set(&mdev->busy, 0); + wake_up(&mdev->maple_wait); break; case MAPLE_RESPONSE_FILEERR: + maple_response_fileerr(mdev, recvbuf); + atomic_set(&mdev->busy, 0); + wake_up(&mdev->maple_wait); + break; + case MAPLE_RESPONSE_AGAIN: case MAPLE_RESPONSE_BADCMD: case MAPLE_RESPONSE_BADFUNC: - printk(KERN_DEBUG - "Maple non-fatal error 0x%X\n", - code); + dev_warn(&mdev->dev, "non-fatal error" + " 0x%X at (%d, %d)\n", code, + mdev->port, mdev->unit); + atomic_set(&mdev->busy, 0); break; case MAPLE_RESPONSE_ALLINFO: - printk(KERN_DEBUG - "Maple - extended device information" - " not supported\n"); + dev_notice(&mdev->dev, "extended" + " device information request for (%d, %d)" + " but call is not supported\n", mdev->port, + mdev->unit); + atomic_set(&mdev->busy, 0); break; case MAPLE_RESPONSE_OK: + atomic_set(&mdev->busy, 0); + wake_up(&mdev->maple_wait); break; default: @@ -699,20 +707,19 @@ static void maple_dma_handler(struct work_struct *work) if (!fullscan) maple_port_rescan(); /* mark that we have been through the first scan */ - if (started == 0) - started = 1; + started = 1; } - maplebus_dma_reset(); + maple_send(); } -static irqreturn_t maplebus_dma_interrupt(int irq, void *dev_id) +static irqreturn_t maple_dma_interrupt(int irq, void *dev_id) { /* Load everything into the bottom half */ schedule_work(&maple_dma_process); return IRQ_HANDLED; } -static irqreturn_t maplebus_vblank_interrupt(int irq, void *dev_id) +static irqreturn_t maple_vblank_interrupt(int irq, void *dev_id) { schedule_work(&maple_vblank_process); return IRQ_HANDLED; @@ -720,14 +727,14 @@ static irqreturn_t maplebus_vblank_interrupt(int irq, void *dev_id) static int maple_set_dma_interrupt_handler(void) { - return request_irq(HW_EVENT_MAPLE_DMA, maplebus_dma_interrupt, - IRQF_SHARED, "maple bus DMA", &maple_dummy_driver); + return request_irq(HW_EVENT_MAPLE_DMA, maple_dma_interrupt, + IRQF_SHARED, "maple bus DMA", &maple_unsupported_device); } static int maple_set_vblank_interrupt_handler(void) { - return request_irq(HW_EVENT_VSYNC, maplebus_vblank_interrupt, - IRQF_SHARED, "maple bus VBLANK", &maple_dummy_driver); + return request_irq(HW_EVENT_VSYNC, maple_vblank_interrupt, + IRQF_SHARED, 
"maple bus VBLANK", &maple_unsupported_device); } static int maple_get_dma_buffer(void) @@ -740,7 +747,7 @@ static int maple_get_dma_buffer(void) return 0; } -static int match_maple_bus_driver(struct device *devptr, +static int maple_match_bus_driver(struct device *devptr, struct device_driver *drvptr) { struct maple_driver *maple_drv = to_maple_driver(drvptr); @@ -765,16 +772,18 @@ static void maple_bus_release(struct device *dev) { } -static struct maple_driver maple_dummy_driver = { +static struct maple_driver maple_unsupported_device = { .drv = { - .name = "maple_dummy_driver", + .name = "maple_unsupported_device", .bus = &maple_bus_type, }, }; - +/** + * maple_bus_type - core maple bus structure + */ struct bus_type maple_bus_type = { .name = "maple", - .match = match_maple_bus_driver, + .match = maple_match_bus_driver, .uevent = maple_bus_uevent, }; EXPORT_SYMBOL_GPL(maple_bus_type); @@ -788,7 +797,8 @@ static int __init maple_bus_init(void) { int retval, i; struct maple_device *mdev[MAPLE_PORTS]; - ctrl_outl(0, MAPLE_STATE); + + ctrl_outl(0, MAPLE_ENABLE); retval = device_register(&maple_bus); if (retval) @@ -798,36 +808,33 @@ static int __init maple_bus_init(void) if (retval) goto cleanup_device; - retval = driver_register(&maple_dummy_driver.drv); + retval = driver_register(&maple_unsupported_device.drv); if (retval) goto cleanup_bus; /* allocate memory for maple bus dma */ retval = maple_get_dma_buffer(); if (retval) { - printk(KERN_INFO - "Maple bus: Failed to allocate Maple DMA buffers\n"); + dev_err(&maple_bus, "failed to allocate DMA buffers\n"); goto cleanup_basic; } /* set up DMA interrupt handler */ retval = maple_set_dma_interrupt_handler(); if (retval) { - printk(KERN_INFO - "Maple bus: Failed to grab maple DMA IRQ\n"); + dev_err(&maple_bus, "bus failed to grab maple " + "DMA IRQ\n"); goto cleanup_dma; } /* set up VBLANK interrupt handler */ retval = maple_set_vblank_interrupt_handler(); if (retval) { - printk(KERN_INFO "Maple bus: Failed to grab VBLANK IRQ\n"); + dev_err(&maple_bus, "bus failed to grab VBLANK IRQ\n"); goto cleanup_irq; } - maple_queue_cache = - kmem_cache_create("maple_queue_cache", 0x400, 0, - SLAB_HWCACHE_ALIGN, NULL); + maple_queue_cache = KMEM_CACHE(maple_buffer, SLAB_HWCACHE_ALIGN); if (!maple_queue_cache) goto cleanup_bothirqs; @@ -838,23 +845,23 @@ static int __init maple_bus_init(void) /* setup maple ports */ for (i = 0; i < MAPLE_PORTS; i++) { checked[i] = false; + empty[i] = false; mdev[i] = maple_alloc_dev(i, 0); - baseunits[i] = mdev[i]; if (!mdev[i]) { while (i-- > 0) maple_free_dev(mdev[i]); goto cleanup_cache; } + baseunits[i] = mdev[i]; + atomic_set(&mdev[i]->busy, 1); maple_add_packet(mdev[i], 0, MAPLE_COMMAND_DEVINFO, 0, NULL); subdevice_map[i] = 0; } - /* setup maplebus hardware */ - maplebus_dma_reset(); - /* initial detection */ + maple_pnp_time = jiffies + HZ; + /* prepare initial queue */ maple_send(); - maple_pnp_time = jiffies; - printk(KERN_INFO "Maple bus core now registered.\n"); + dev_info(&maple_bus, "bus core now registered\n"); return 0; @@ -871,7 +878,7 @@ cleanup_dma: free_pages((unsigned long) maple_sendbuf, MAPLE_DMA_PAGES); cleanup_basic: - driver_unregister(&maple_dummy_driver.drv); + driver_unregister(&maple_unsupported_device.drv); cleanup_bus: bus_unregister(&maple_bus_type); @@ -880,7 +887,7 @@ cleanup_device: device_unregister(&maple_bus); cleanup: - printk(KERN_INFO "Maple bus registration failed\n"); + printk(KERN_ERR "Maple bus registration failed\n"); return retval; } /* Push init to later to ensure 
hardware gets detected */ diff --git a/include/linux/maple.h b/include/linux/maple.h index c23d3f51ba40..d9a51b9b3300 100644 --- a/include/linux/maple.h +++ b/include/linux/maple.h @@ -8,33 +8,49 @@ extern struct bus_type maple_bus_type; /* Maple Bus command and response codes */ enum maple_code { - MAPLE_RESPONSE_FILEERR = -5, - MAPLE_RESPONSE_AGAIN = -4, /* request should be retransmitted */ - MAPLE_RESPONSE_BADCMD = -3, - MAPLE_RESPONSE_BADFUNC = -2, - MAPLE_RESPONSE_NONE = -1, /* unit didn't respond at all */ - MAPLE_COMMAND_DEVINFO = 1, - MAPLE_COMMAND_ALLINFO = 2, - MAPLE_COMMAND_RESET = 3, - MAPLE_COMMAND_KILL = 4, - MAPLE_RESPONSE_DEVINFO = 5, - MAPLE_RESPONSE_ALLINFO = 6, - MAPLE_RESPONSE_OK = 7, - MAPLE_RESPONSE_DATATRF = 8, - MAPLE_COMMAND_GETCOND = 9, - MAPLE_COMMAND_GETMINFO = 10, - MAPLE_COMMAND_BREAD = 11, - MAPLE_COMMAND_BWRITE = 12, - MAPLE_COMMAND_SETCOND = 14 + MAPLE_RESPONSE_FILEERR = -5, + MAPLE_RESPONSE_AGAIN, /* retransmit */ + MAPLE_RESPONSE_BADCMD, + MAPLE_RESPONSE_BADFUNC, + MAPLE_RESPONSE_NONE, /* unit didn't respond*/ + MAPLE_COMMAND_DEVINFO = 1, + MAPLE_COMMAND_ALLINFO, + MAPLE_COMMAND_RESET, + MAPLE_COMMAND_KILL, + MAPLE_RESPONSE_DEVINFO, + MAPLE_RESPONSE_ALLINFO, + MAPLE_RESPONSE_OK, + MAPLE_RESPONSE_DATATRF, + MAPLE_COMMAND_GETCOND, + MAPLE_COMMAND_GETMINFO, + MAPLE_COMMAND_BREAD, + MAPLE_COMMAND_BWRITE, + MAPLE_COMMAND_BSYNC, + MAPLE_COMMAND_SETCOND, + MAPLE_COMMAND_MICCONTROL +}; + +enum maple_file_errors { + MAPLE_FILEERR_INVALID_PARTITION = 0x01000000, + MAPLE_FILEERR_PHASE_ERROR = 0x02000000, + MAPLE_FILEERR_INVALID_BLOCK = 0x04000000, + MAPLE_FILEERR_WRITE_ERROR = 0x08000000, + MAPLE_FILEERR_INVALID_WRITE_LENGTH = 0x10000000, + MAPLE_FILEERR_BAD_CRC = 0x20000000 +}; + +struct maple_buffer { + char bufx[0x400]; + void *buf; }; struct mapleq { struct list_head list; struct maple_device *dev; - void *sendbuf, *recvbuf, *recvbufdcsp; + struct maple_buffer *recvbuf; + void *sendbuf, *recvbuf_p2; unsigned char length; enum maple_code command; - struct mutex mutex; }; struct maple_devinfo { @@ -52,11 +68,15 @@ struct maple_device { struct maple_driver *driver; struct mapleq *mq; void (*callback) (struct mapleq * mq); + void (*fileerr_handler)(struct maple_device *mdev, void *recvbuf); + int (*can_unload)(struct maple_device *mdev); unsigned long when, interval, function; struct maple_devinfo devinfo; unsigned char port, unit; char product_name[32]; char product_licence[64]; + atomic_t busy; + wait_queue_head_t maple_wait; struct device dev; }; @@ -72,7 +92,7 @@ void maple_getcond_callback(struct maple_device *dev, int maple_driver_register(struct maple_driver *); void maple_driver_unregister(struct maple_driver *); -int maple_add_packet_sleeps(struct maple_device *mdev, u32 function, +int maple_add_packet(struct maple_device *mdev, u32 function, u32 command, u32 length, void *data); void maple_clear_dev(struct maple_device *mdev); -- cgit v1.2.3-71-gd317 From 8a7c4c77267b1c77296cd03e6704813cb70706d1 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Thu, 26 Feb 2009 23:41:38 -0800 Subject: RDS: Add AF and PF #defines for RDS sockets RDS is a reliable datagram protocol used for IPC on Oracle database clusters. This adds address and protocol family numbers for it. Signed-off-by: Andy Grover Signed-off-by: David S. 
Miller --- include/linux/socket.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 20fc4bbfca42..9c90dc403efc 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -179,6 +179,7 @@ struct ucred { #define AF_ASH 18 /* Ash */ #define AF_ECONET 19 /* Acorn Econet */ #define AF_ATMSVC 20 /* ATM SVCs */ +#define AF_RDS 21 /* RDS sockets */ #define AF_SNA 22 /* Linux SNA Project (nutters!) */ #define AF_IRDA 23 /* IRDA sockets */ #define AF_PPPOX 24 /* PPPoX sockets */ @@ -217,6 +218,7 @@ struct ucred { #define PF_ASH AF_ASH #define PF_ECONET AF_ECONET #define PF_ATMSVC AF_ATMSVC +#define PF_RDS AF_RDS #define PF_SNA AF_SNA #define PF_IRDA AF_IRDA #define PF_PPPOX AF_PPPOX @@ -298,6 +300,7 @@ struct ucred { #define SOL_PPPOL2TP 273 #define SOL_BLUETOOTH 274 #define SOL_PNPIPE 275 +#define SOL_RDS 276 /* IPX options */ #define IPX_TYPE 1 -- cgit v1.2.3-71-gd317 From db49b9d26c1966c683efced9e1c37f391d8f8182 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 24 Feb 2009 15:30:42 +0000 Subject: RDS: Add userspace header Applications include this header in order to use RDS sockets. Signed-off-by: Andy Grover Signed-off-by: David S. Miller --- include/linux/rds.h | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 include/linux/rds.h (limited to 'include/linux') diff --git a/include/linux/rds.h b/include/linux/rds.h new file mode 100644 index 000000000000..d91dc91f5443 --- /dev/null +++ b/include/linux/rds.h @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2008 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _LINUX_RDS_H +#define _LINUX_RDS_H + +#include + +/* These sparse annotated types shouldn't be in any user + * visible header file. We should clean this up rather + * than kludging around them. 
*/ +#ifndef __KERNEL__ +#define __be16 u_int16_t +#define __be32 u_int32_t +#define __be64 u_int64_t +#endif + +#define RDS_IB_ABI_VERSION 0x301 + +/* + * setsockopt/getsockopt for SOL_RDS + */ +#define RDS_CANCEL_SENT_TO 1 +#define RDS_GET_MR 2 +#define RDS_FREE_MR 3 +/* deprecated: RDS_BARRIER 4 */ +#define RDS_RECVERR 5 +#define RDS_CONG_MONITOR 6 + +/* + * Control message types for SOL_RDS. + * + * CMSG_RDMA_ARGS (sendmsg) + * Request a RDMA transfer to/from the specified + * memory ranges. + * The cmsg_data is a struct rds_rdma_args. + * RDS_CMSG_RDMA_DEST (recvmsg, sendmsg) + * Kernel informs application about intended + * source/destination of a RDMA transfer + * RDS_CMSG_RDMA_MAP (sendmsg) + * Application asks kernel to map the given + * memory range into a IB MR, and send the + * R_Key along in an RDS extension header. + * The cmsg_data is a struct rds_get_mr_args, + * the same as for the GET_MR setsockopt. + * RDS_CMSG_RDMA_STATUS (recvmsg) + * Returns the status of a completed RDMA operation. + */ +#define RDS_CMSG_RDMA_ARGS 1 +#define RDS_CMSG_RDMA_DEST 2 +#define RDS_CMSG_RDMA_MAP 3 +#define RDS_CMSG_RDMA_STATUS 4 +#define RDS_CMSG_CONG_UPDATE 5 + +#define RDS_INFO_FIRST 10000 +#define RDS_INFO_COUNTERS 10000 +#define RDS_INFO_CONNECTIONS 10001 +/* 10002 aka RDS_INFO_FLOWS is deprecated */ +#define RDS_INFO_SEND_MESSAGES 10003 +#define RDS_INFO_RETRANS_MESSAGES 10004 +#define RDS_INFO_RECV_MESSAGES 10005 +#define RDS_INFO_SOCKETS 10006 +#define RDS_INFO_TCP_SOCKETS 10007 +#define RDS_INFO_IB_CONNECTIONS 10008 +#define RDS_INFO_CONNECTION_STATS 10009 +#define RDS_INFO_IWARP_CONNECTIONS 10010 +#define RDS_INFO_LAST 10010 + +struct rds_info_counter { + u_int8_t name[32]; + u_int64_t value; +} __attribute__((packed)); + +#define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 +#define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 +#define RDS_INFO_CONNECTION_FLAG_CONNECTED 0x04 + +#define TRANSNAMSIZ 16 + +struct rds_info_connection { + u_int64_t next_tx_seq; + u_int64_t next_rx_seq; + __be32 laddr; + __be32 faddr; + u_int8_t transport[TRANSNAMSIZ]; /* null term ascii */ + u_int8_t flags; +} __attribute__((packed)); + +struct rds_info_flow { + __be32 laddr; + __be32 faddr; + u_int32_t bytes; + __be16 lport; + __be16 fport; +} __attribute__((packed)); + +#define RDS_INFO_MESSAGE_FLAG_ACK 0x01 +#define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 + +struct rds_info_message { + u_int64_t seq; + u_int32_t len; + __be32 laddr; + __be32 faddr; + __be16 lport; + __be16 fport; + u_int8_t flags; +} __attribute__((packed)); + +struct rds_info_socket { + u_int32_t sndbuf; + __be32 bound_addr; + __be32 connected_addr; + __be16 bound_port; + __be16 connected_port; + u_int32_t rcvbuf; + u_int64_t inum; +} __attribute__((packed)); + +#define RDS_IB_GID_LEN 16 +struct rds_info_rdma_connection { + __be32 src_addr; + __be32 dst_addr; + uint8_t src_gid[RDS_IB_GID_LEN]; + uint8_t dst_gid[RDS_IB_GID_LEN]; + + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t rdma_mr_max; + uint32_t rdma_mr_size; +}; + +/* + * Congestion monitoring. + * Congestion control in RDS happens at the host connection + * level by exchanging a bitmap marking congested ports. + * By default, a process sleeping in poll() is always woken + * up when the congestion map is updated. + * With explicit monitoring, an application can have more + * fine-grained control. + * The application installs a 64bit mask value in the socket, + * where each bit corresponds to a group of ports. 
+ * When a congestion update arrives, RDS checks the set of + * ports that are now uncongested against the list bit mask + * installed in the socket, and if they overlap, we queue a + * cong_notification on the socket. + * + * To install the congestion monitor bitmask, use RDS_CONG_MONITOR + * with the 64bit mask. + * Congestion updates are received via RDS_CMSG_CONG_UPDATE + * control messages. + * + * The correspondence between bits and ports is + * 1 << (portnum % 64) + */ +#define RDS_CONG_MONITOR_SIZE 64 +#define RDS_CONG_MONITOR_BIT(port) (((unsigned int) port) % RDS_CONG_MONITOR_SIZE) +#define RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port)) + +/* + * RDMA related types + */ + +/* + * This encapsulates a remote memory location. + * In the current implementation, it contains the R_Key + * of the remote memory region, and the offset into it + * (so that the application does not have to worry about + * alignment). + */ +typedef u_int64_t rds_rdma_cookie_t; + +struct rds_iovec { + u_int64_t addr; + u_int64_t bytes; +}; + +struct rds_get_mr_args { + struct rds_iovec vec; + u_int64_t cookie_addr; + uint64_t flags; +}; + +struct rds_free_mr_args { + rds_rdma_cookie_t cookie; + u_int64_t flags; +}; + +struct rds_rdma_args { + rds_rdma_cookie_t cookie; + struct rds_iovec remote_vec; + u_int64_t local_vec_addr; + u_int64_t nr_local; + u_int64_t flags; + u_int64_t user_token; +}; + +struct rds_rdma_notify { + u_int64_t user_token; + int32_t status; +}; + +#define RDS_RDMA_SUCCESS 0 +#define RDS_RDMA_REMOTE_ERROR 1 +#define RDS_RDMA_CANCELED 2 +#define RDS_RDMA_DROPPED 3 +#define RDS_RDMA_OTHER_ERROR 4 + +/* + * Common set of flags for all RDMA related structs + */ +#define RDS_RDMA_READWRITE 0x0001 +#define RDS_RDMA_FENCE 0x0002 /* use FENCE for immediate send */ +#define RDS_RDMA_INVALIDATE 0x0004 /* invalidate R_Key after freeing MR */ +#define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */ +#define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */ +#define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */ + +#endif /* IB_RDS_H */ -- cgit v1.2.3-71-gd317 From bdaa6e8062d7f8085d8ed94ff88c99406ad53d79 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 24 Feb 2009 22:58:57 +0900 Subject: sh: multiple vectors per irq - base Instead of keeping the single vector -> single linux irq mapping we extend the intc code to support merging of vectors to a single linux irq. This helps processors such as sh7750, sh7780 and sh7785 which have more vectors than masking ability. With this patch in place we can modify the intc tables to use one irq per maskable irq source. Please note the following: - If multiple vectors share the same enum then only the first vector will be available as a linux irq. - Drivers may need to be rewritten to get pending irq source from the hardware block instead of irq number. This patch together with the sh7785 specific intc tables solves DMA controller irq issues related to buggy interrupt masking. 
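For illustration only, a minimal standalone sketch of the vector-merging idea (not kernel code; the evt2irq() formula here is a toy stand-in for the one in the arch/sh headers). Duplicate enum_ids are folded onto the first vector's irq through a small remap table, mirroring the intc_evt2irq_table handling in the drivers/sh/intc.c hunk below:

#include <stdio.h>

#define NR_IRQS 256

struct vect { unsigned int vect; unsigned int enum_id; };

/* toy evt2irq() used only for this sketch */
static unsigned int evt2irq(unsigned int evt) { return (evt >> 5) - 16; }

/* 0 means "no remap", same convention as intc_evt2irq_table */
static unsigned char evt2irq_table[NR_IRQS];

static unsigned int intc_evt2irq(unsigned int vector)
{
	unsigned int irq = evt2irq(vector);

	if (evt2irq_table[irq])
		irq = evt2irq_table[irq];
	return irq;
}

int main(void)
{
	/* hypothetical vectors: the first two share one enum (one mask bit) */
	struct vect vectors[] = { { 0x6c0, 10 }, { 0x6e0, 10 }, { 0x700, 11 } };
	int i, k, n = sizeof(vectors) / sizeof(vectors[0]);

	/* keep the first vector only if the same enum is used multiple times */
	for (i = 0; i < n; i++) {
		if (!vectors[i].enum_id)
			continue;
		for (k = i + 1; k < n; k++) {
			if (vectors[i].enum_id != vectors[k].enum_id)
				continue;
			vectors[k].enum_id = 0;
			evt2irq_table[evt2irq(vectors[k].vect)] =
				evt2irq(vectors[i].vect);
		}
	}

	/* the first two vectors now report the same linux irq */
	for (i = 0; i < n; i++)
		printf("vector 0x%03x -> linux irq %u\n",
		       vectors[i].vect, intc_evt2irq(vectors[i].vect));
	return 0;
}

With the remap in place, vectors that share an enum all land on one maskable linux irq, which is the property the sh7785-specific tables rely on.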
Reported-by: Yoshihiro Shimoda Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/irq.c | 2 +- drivers/sh/intc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sh_intc.h | 1 + 3 files changed, 49 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 64b7690c664c..90d63aefd275 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -106,7 +106,7 @@ asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs) } #endif - irq = irq_demux(evt2irq(irq)); + irq = irq_demux(intc_evt2irq(irq)); #ifdef CONFIG_IRQSTACKS curctx = (union irq_ctx *)current_thread_info(); diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c index 58d24c5a76ce..d7b8959d9d92 100644 --- a/drivers/sh/intc.c +++ b/drivers/sh/intc.c @@ -568,6 +568,10 @@ static void __init intc_register_irq(struct intc_desc *desc, if (!data[0] && data[1]) primary = 1; + if (!data[0] && !data[1]) + pr_warning("intc: missing unique irq mask for 0x%04x\n", + irq2evt(irq)); + data[0] = data[0] ? data[0] : intc_mask_data(desc, d, enum_id, 1); data[1] = data[1] ? data[1] : intc_prio_data(desc, d, enum_id, 1); @@ -641,6 +645,17 @@ static unsigned int __init save_reg(struct intc_desc_int *d, return 0; } +static unsigned char *intc_evt2irq_table; + +unsigned int intc_evt2irq(unsigned int vector) +{ + unsigned int irq = evt2irq(vector); + + if (intc_evt2irq_table && intc_evt2irq_table[irq]) + irq = intc_evt2irq_table[irq]; + + return irq; +} void __init register_intc_controller(struct intc_desc *desc) { @@ -705,9 +720,41 @@ void __init register_intc_controller(struct intc_desc *desc) BUG_ON(k > 256); /* _INTC_ADDR_E() and _INTC_ADDR_D() are 8 bits */ + /* keep the first vector only if same enum is used multiple times */ + for (i = 0; i < desc->nr_vectors; i++) { + struct intc_vect *vect = desc->vectors + i; + int first_irq = evt2irq(vect->vect); + + if (!vect->enum_id) + continue; + + for (k = i + 1; k < desc->nr_vectors; k++) { + struct intc_vect *vect2 = desc->vectors + k; + + if (vect->enum_id != vect2->enum_id) + continue; + + vect2->enum_id = 0; + + if (!intc_evt2irq_table) + intc_evt2irq_table = alloc_bootmem(NR_IRQS); + + if (!intc_evt2irq_table) { + pr_warning("intc: cannot allocate evt2irq!\n"); + continue; + } + + intc_evt2irq_table[evt2irq(vect2->vect)] = first_irq; + } + } + + /* register the vectors one by one */ for (i = 0; i < desc->nr_vectors; i++) { struct intc_vect *vect = desc->vectors + i; + if (!vect->enum_id) + continue; + intc_register_irq(desc, d, vect->enum_id, evt2irq(vect->vect)); } } diff --git a/include/linux/sh_intc.h b/include/linux/sh_intc.h index 68e212ff9dde..eb1423a0078d 100644 --- a/include/linux/sh_intc.h +++ b/include/linux/sh_intc.h @@ -85,6 +85,7 @@ struct intc_desc symbol __initdata = { \ } #endif +unsigned int intc_evt2irq(unsigned int vector); void __init register_intc_controller(struct intc_desc *desc); int intc_set_priority(unsigned int irq, unsigned int prio); -- cgit v1.2.3-71-gd317 From 98c8a60a04316e94ccea8221cf16768ce91bd214 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 17 Feb 2009 13:24:57 +0200 Subject: nl80211: Provide access to STA TX/RX packet counters The TX/RX packet counters are needed to fill in RADIUS Accounting attributes Acct-Output-Packets and Acct-Input-Packets. We already collect the needed information, but only the TX/RX bytes were previously exposed through nl80211. 
Allow applications to fetch the packet counters, too, to provide more complete support for accounting. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 5 +++++ include/net/cfg80211.h | 8 ++++++++ net/mac80211/cfg.c | 4 ++++ net/wireless/nl80211.c | 6 ++++++ 4 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 8802d1bda382..f6e56370ea65 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -526,6 +526,9 @@ enum nl80211_rate_info { * @NL80211_STA_INFO_SIGNAL: signal strength of last received PPDU (u8, dBm) * @NL80211_STA_INFO_TX_BITRATE: current unicast tx rate, nested attribute * containing info as possible, see &enum nl80211_sta_info_txrate. + * @NL80211_STA_INFO_RX_PACKETS: total received packet (u32, from this station) + * @NL80211_STA_INFO_TX_PACKETS: total transmitted packets (u32, to this + * station) */ enum nl80211_sta_info { __NL80211_STA_INFO_INVALID, @@ -537,6 +540,8 @@ enum nl80211_sta_info { NL80211_STA_INFO_PLINK_STATE, NL80211_STA_INFO_SIGNAL, NL80211_STA_INFO_TX_BITRATE, + NL80211_STA_INFO_RX_PACKETS, + NL80211_STA_INFO_TX_PACKETS, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 33f43b0d08fb..8dcc46444037 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -178,6 +178,8 @@ struct station_parameters { * @STATION_INFO_SIGNAL: @signal filled * @STATION_INFO_TX_BITRATE: @tx_bitrate fields are filled * (tx_bitrate, tx_bitrate_flags and tx_bitrate_mcs) + * @STATION_INFO_RX_PACKETS: @rx_packets filled + * @STATION_INFO_TX_PACKETS: @tx_packets filled */ enum station_info_flags { STATION_INFO_INACTIVE_TIME = 1<<0, @@ -188,6 +190,8 @@ enum station_info_flags { STATION_INFO_PLINK_STATE = 1<<5, STATION_INFO_SIGNAL = 1<<6, STATION_INFO_TX_BITRATE = 1<<7, + STATION_INFO_RX_PACKETS = 1<<8, + STATION_INFO_TX_PACKETS = 1<<9, }; /** @@ -235,6 +239,8 @@ struct rate_info { * @plink_state: mesh peer link state * @signal: signal strength of last received packet in dBm * @txrate: current unicast bitrate to this station + * @rx_packets: packets received from this station + * @tx_packets: packets transmitted to this station */ struct station_info { u32 filled; @@ -246,6 +252,8 @@ struct station_info { u8 plink_state; s8 signal; struct rate_info txrate; + u32 rx_packets; + u32 tx_packets; }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f453bb7c564b..c43129efc3bf 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -341,11 +341,15 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->filled = STATION_INFO_INACTIVE_TIME | STATION_INFO_RX_BYTES | STATION_INFO_TX_BYTES | + STATION_INFO_RX_PACKETS | + STATION_INFO_TX_PACKETS | STATION_INFO_TX_BITRATE; sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); sinfo->rx_bytes = sta->rx_bytes; sinfo->tx_bytes = sta->tx_bytes; + sinfo->rx_packets = sta->rx_packets; + sinfo->tx_packets = sta->tx_packets; if (sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { sinfo->filled |= STATION_INFO_SIGNAL; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 67b18b3a93a0..badccf98074e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1206,6 +1206,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq, nla_nest_end(msg, txrate); } + if (sinfo->filled & STATION_INFO_RX_PACKETS) + NLA_PUT_U32(msg, 
NL80211_STA_INFO_RX_PACKETS, + sinfo->rx_packets); + if (sinfo->filled & STATION_INFO_TX_PACKETS) + NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS, + sinfo->tx_packets); nla_nest_end(msg, sinfoattr); return genlmsg_end(msg, hdr); -- cgit v1.2.3-71-gd317 From 629928041c53771f9902753d50fef6b35f36d33d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 28 Feb 2009 02:47:59 -0500 Subject: tracing: create the C style tracing for the sched subsystem This patch utilizes the TRACE_EVENT_FORMAT macro to enable the C style faster tracing for the sched subsystem trace points. Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 + include/trace/sched_event_types.h | 119 ++++++++++++++++++++++++++++++-------- 2 files changed, 97 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 62d13391a240..152b2f03fb86 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,4 +157,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ + TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) + #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 2ada206565a3..ba059c10b58a 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,6 +1,6 @@ /* use instead */ -#ifndef TRACE_FORMAT +#ifndef TRACE_EVENT_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. #endif @@ -8,70 +8,139 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM sched -TRACE_FORMAT(sched_kthread_stop, +TRACE_EVENT_FORMAT(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t), - TPFMT("task %s:%d", t->comm, t->pid)); + TPFMT("task %s:%d", t->comm, t->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, t->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_kthread_stop_ret, +TRACE_EVENT_FORMAT(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret), - TPFMT("ret=%d", ret)); + TPFMT("ret=%d", ret), + TRACE_STRUCT( + TRACE_FIELD(int, ret, ret) + ), + TPRAWFMT("ret=%d") + ); -TRACE_FORMAT(sched_wait_task, +TRACE_EVENT_FORMAT(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p), - TPFMT("task %s:%d", p->comm, p->pid)); + TPFMT("task %s:%d", p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_wakeup, +TRACE_EVENT_FORMAT(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d %s", - p->comm, p->pid, success?"succeeded":"failed")); + p->comm, p->pid, success ? "succeeded" : "failed"), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + TRACE_FIELD(int, success, success) + ), + TPRAWFMT("task %d success=%d") + ); -TRACE_FORMAT(sched_wakeup_new, +TRACE_EVENT_FORMAT(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d", - p->comm, p->pid, success?"succeeded":"failed")); + p->comm, p->pid, success ? 
"succeeded" : "failed"), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + TRACE_FIELD(int, success, success) + ), + TPRAWFMT("task %d success=%d") + ); -TRACE_FORMAT(sched_switch, +TRACE_EVENT_FORMAT(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next), TPFMT("task %s:%d ==> %s:%d", - prev->comm, prev->pid, next->comm, next->pid)); + prev->comm, prev->pid, next->comm, next->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, prev_pid, prev->pid) + TRACE_FIELD(int, prev_prio, prev->prio) + TRACE_FIELD(pid_t, next_pid, next->pid) + TRACE_FIELD(int, next_prio, next->prio) + ), + TPRAWFMT("prev %d:%d ==> next %d:%d") + ); -TRACE_FORMAT(sched_migrate_task, +TRACE_EVENT_FORMAT(sched_migrate_task, TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), TPARGS(p, orig_cpu, dest_cpu), TPFMT("task %s:%d from: %d to: %d", - p->comm, p->pid, orig_cpu, dest_cpu)); + p->comm, p->pid, orig_cpu, dest_cpu), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + TRACE_FIELD(int, orig_cpu, orig_cpu) + TRACE_FIELD(int, dest_cpu, dest_cpu) + ), + TPRAWFMT("task %d from: %d to: %d") + ); -TRACE_FORMAT(sched_process_free, +TRACE_EVENT_FORMAT(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid)); + TPFMT("task %s:%d", p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_process_exit, +TRACE_EVENT_FORMAT(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid)); + TPFMT("task %s:%d", p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_process_wait, +TRACE_EVENT_FORMAT(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid), - TPFMT("pid %d", pid)); + TPFMT("pid %d", pid_nr(pid)), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, pid_nr(pid)) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_process_fork, +TRACE_EVENT_FORMAT(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child), TPFMT("parent %s:%d child %s:%d", - parent->comm, parent->pid, child->comm, child->pid)); + parent->comm, parent->pid, child->comm, child->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, parent, parent->pid) + TRACE_FIELD(pid_t, child, child->pid) + ), + TPRAWFMT("parent %d child %d") + ); -TRACE_FORMAT(sched_signal_send, +TRACE_EVENT_FORMAT(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p), - TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); + TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(int, sig, sig) + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("sig: %d task %d") + ); #undef TRACE_SYSTEM -- cgit v1.2.3-71-gd317 From f5c1aa1537be39d8b9bb5279b5881d81898fd3cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 1 Mar 2009 12:32:08 +0100 Subject: Revert "gpu/drm, x86, PAT: PAT support for io_mapping_*" This reverts commit 17581ad812a9abb0182260374ef2e52d4a808a64. Sitsofe Wheeler reported that /dev/dri/card0 is MIA on his EeePC 900 and bisected it to this commit. Graphics card is an i915 in an EeePC 900: 00:02.0 VGA compatible controller [0300]: Intel Corporation Mobile 915GM/GMS/910GML Express Graphics Controller [8086:2592] (rev 04) ( Most likely the ioremap() of the driver failed and hence the card did not initialize. 
) Reported-by: Sitsofe Wheeler Bisected-by: Sitsofe Wheeler Cc: Venkatesh Pallipadi Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iomap.h | 5 +---- arch/x86/mm/iomap_32.c | 44 ++------------------------------------------ include/linux/io-mapping.h | 6 ++---- 3 files changed, 5 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index bd46495ff7de..86af26091d6c 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -24,10 +24,7 @@ #include int -reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot); - -void -free_io_memtype(u64 base, unsigned long size); +is_io_mapping_possible(resource_size_t base, unsigned long size); void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index d5e28424622c..6c2b1af16926 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,13 +21,13 @@ #include #ifdef CONFIG_X86_PAE -static int +int is_io_mapping_possible(resource_size_t base, unsigned long size) { return 1; } #else -static int +int is_io_mapping_possible(resource_size_t base, unsigned long size) { /* There is no way to map greater than 1 << 32 address without PAE */ @@ -38,46 +38,6 @@ is_io_mapping_possible(resource_size_t base, unsigned long size) } #endif -int -reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot) -{ - unsigned long ret_flag; - - if (!is_io_mapping_possible(base, size)) - goto out_err; - - if (!pat_enabled) { - *prot = pgprot_noncached(PAGE_KERNEL); - return 0; - } - - if (reserve_memtype(base, base + size, _PAGE_CACHE_WC, &ret_flag)) - goto out_err; - - if (ret_flag == _PAGE_CACHE_WB) - goto out_free; - - if (kernel_map_sync_memtype(base, size, ret_flag)) - goto out_free; - - *prot = __pgprot(__PAGE_KERNEL | ret_flag); - return 0; - -out_free: - free_memtype(base, base + size); -out_err: - return -EINVAL; -} -EXPORT_SYMBOL_GPL(reserve_io_memtype_wc); - -void -free_io_memtype(u64 base, unsigned long size) -{ - if (pat_enabled) - free_memtype(base, base + size); -} -EXPORT_SYMBOL_GPL(free_io_memtype); - /* Map 'pfn' using fixed map 'type' and protections 'prot' */ void * diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index f1ed66c43787..cbc2f0cd631b 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -49,9 +49,8 @@ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *iomap; - pgprot_t prot; - if (!reserve_io_memtype_wc(base, size, &prot)) + if (!is_io_mapping_possible(base, size)) return NULL; iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); @@ -60,14 +59,13 @@ io_mapping_create_wc(resource_size_t base, unsigned long size) iomap->base = base; iomap->size = size; - iomap->prot = prot; + iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); return iomap; } static inline void io_mapping_free(struct io_mapping *mapping) { - free_io_memtype(mapping->base, mapping->size); kfree(mapping); } -- cgit v1.2.3-71-gd317 From f180053694b43d5714bf56cb95499a3c32ff155c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 2 Mar 2009 11:00:57 +0100 Subject: x86, mm: dont use non-temporal stores in pagecache accesses Impact: standardize IO on cached ops On modern CPUs it is almost always a bad idea to use non-temporal stores, as the regression in this commit has shown it: 30d697f: x86: fix performance regression in write() syscall The 
kernel simply has no good information about whether using non-temporal stores is a good idea or not - and trying to add heuristics only increases complexity and inserts fragility. The regression on cached write()s took very long to be found - over two years. So dont take any chances and let the hardware decide how it makes use of its caches. The only exception is drivers/gpu/drm/i915/i915_gem.c: there were we are absolutely sure that another entity (the GPU) will pick up the dirty data immediately and that the CPU will not touch that data before the GPU will. Also, keep the _nocache() primitives to make it easier for people to experiment with these details. There may be more clear-cut cases where non-cached copies can be used, outside of filemap.c. Cc: Salman Qazi Cc: Nick Piggin Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_32.h | 4 ++-- arch/x86/include/asm/uaccess_64.h | 25 +++++++------------------ drivers/gpu/drm/i915/i915_gem.c | 2 +- include/linux/uaccess.h | 4 ++-- mm/filemap.c | 11 ++++------- mm/filemap_xip.c | 2 +- 6 files changed, 17 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index a0ba61386972..5e06259e90e5 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -157,7 +157,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) } static __always_inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n, unsigned long total) + const void __user *from, unsigned long n) { might_fault(); if (__builtin_constant_p(n)) { @@ -180,7 +180,7 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, - unsigned long n, unsigned long total) + unsigned long n) { return __copy_from_user_ll_nocache_nozero(to, from, n); } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index dcaa0404cf7b..8cc687326eb8 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -188,29 +188,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); -static inline int __copy_from_user_nocache(void *dst, const void __user *src, - unsigned size, unsigned long total) +static inline int +__copy_from_user_nocache(void *dst, const void __user *src, unsigned size) { might_sleep(); - /* - * In practice this limit means that large file write()s - * which get chunked to 4K copies get handled via - * non-temporal stores here. 
Smaller writes get handled - * via regular __copy_from_user(): - */ - if (likely(total >= PAGE_SIZE)) - return __copy_user_nocache(dst, src, size, 1); - else - return __copy_from_user(dst, src, size); + return __copy_user_nocache(dst, src, size, 1); } -static inline int __copy_from_user_inatomic_nocache(void *dst, - const void __user *src, unsigned size, unsigned total) +static inline int +__copy_from_user_inatomic_nocache(void *dst, const void __user *src, + unsigned size) { - if (likely(total >= PAGE_SIZE)) - return __copy_user_nocache(dst, src, size, 0); - else - return __copy_from_user_inatomic(dst, src, size); + return __copy_user_nocache(dst, src, size, 0); } unsigned long diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 6b209db8370d..818576654092 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -215,7 +215,7 @@ fast_user_write(struct io_mapping *mapping, vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base); unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + page_offset, - user_data, length, length); + user_data, length); io_mapping_unmap_atomic(vaddr_atomic); if (unwritten) return -EFAULT; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 6f3c603b0d67..6b58367d145e 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -41,13 +41,13 @@ static inline void pagefault_enable(void) #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, - const void __user *from, unsigned long n, unsigned long total) + const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } static inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n, unsigned long total) + const void __user *from, unsigned long n) { return __copy_from_user(to, from, n); } diff --git a/mm/filemap.c b/mm/filemap.c index 60fd56772cc6..126d3973b3d1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1816,14 +1816,14 @@ EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { - size_t copied = 0, left = 0, total = bytes; + size_t copied = 0, left = 0; while (bytes) { char __user *buf = iov->iov_base + base; int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy, total); + left = __copy_from_user_inatomic(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; @@ -1851,9 +1851,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - - left = __copy_from_user_inatomic_nocache(kaddr + offset, - buf, bytes, bytes); + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, @@ -1881,8 +1879,7 @@ size_t iov_iter_copy_from_user(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - - left = __copy_from_user_nocache(kaddr + offset, buf, bytes, bytes); + left = __copy_from_user(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index bf54f8a2cf1d..0c04615651b7 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -354,7 +354,7 @@ __xip_file_write(struct file *filp, const char __user 
*buf, break; copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes, bytes); + __copy_from_user_nocache(xip_mem + offset, buf, bytes); if (likely(copied > 0)) { status = copied; -- cgit v1.2.3-71-gd317 From cabeccbd172cc305f4383f5a4808ae254745275f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 28 Feb 2009 04:44:38 +0000 Subject: tcp: kill eff_sacks "cache", the sole user can calculate itself MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also fixes insignificant bug that would cause sending of stale SACK block (would occur in some corner cases). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 15 ++------------- net/ipv4/tcp_minisocks.c | 3 +-- net/ipv4/tcp_output.c | 12 ++++++------ 5 files changed, 9 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0cd99e6baca5..4b86ad71e054 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -218,7 +218,6 @@ struct tcp_options_received { snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ /* SACKs data */ - u8 eff_sacks; /* Size of SACK array to send with next packet */ u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 0366a559afec..055e4946d4c8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -926,7 +926,6 @@ extern void tcp_done(struct sock *sk); static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; - rx_opt->eff_sacks = 0; rx_opt->num_sacks = 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 03f5ede87224..e4442a293eb0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4099,7 +4099,6 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1; } } @@ -4154,8 +4153,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) * Decrease num_sacks. */ tp->rx_opt.num_sacks--; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + - tp->rx_opt.dsack; for (i = this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i + 1]; continue; @@ -4218,7 +4215,6 @@ new_sack: sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; } /* RCV.NXT advances, some SACKs should be eaten. */ @@ -4232,7 +4228,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) /* Empty ofo queue, hence, all the SACKs are eaten. Clear. 
*/ if (skb_queue_empty(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.dsack; return; } @@ -4253,11 +4248,8 @@ static void tcp_sack_remove(struct tcp_sock *tp) this_sack++; sp++; } - if (num_sacks != tp->rx_opt.num_sacks) { + if (num_sacks != tp->rx_opt.num_sacks) tp->rx_opt.num_sacks = num_sacks; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + - tp->rx_opt.dsack; - } } /* This one checks to see if we can put data from the @@ -4333,10 +4325,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) TCP_ECN_accept_cwr(tp, skb); - if (tp->rx_opt.dsack) { + if (tp->rx_opt.dsack) tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; - } /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. @@ -4456,7 +4446,6 @@ drop: if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = 1; tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f67effbb102b..bb3d8b35f19a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -434,9 +434,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.saw_tstamp = 0; newtp->rx_opt.dsack = 0; - newtp->rx_opt.eff_sacks = 0; - newtp->rx_opt.num_sacks = 0; + newtp->urg_data = 0; if (sock_flag(newsk, SOCK_KEEPOPEN)) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 61445b57610c..1555bb73b638 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -441,10 +441,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(sp[this_sack].end_seq); } - if (tp->rx_opt.dsack) { + if (tp->rx_opt.dsack) tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; - } } } @@ -550,6 +548,7 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned size = 0; + unsigned int eff_sacks; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); @@ -568,10 +567,11 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, size += TCPOLEN_TSTAMP_ALIGNED; } - if (unlikely(tp->rx_opt.eff_sacks)) { + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; + if (unlikely(eff_sacks)) { const unsigned remaining = MAX_TCP_OPTION_SPACE - size; opts->num_sack_blocks = - min_t(unsigned, tp->rx_opt.eff_sacks, + min_t(unsigned, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + @@ -1418,7 +1418,7 @@ static int tcp_mtu_probe(struct sock *sk) icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || tp->snd_cwnd < 11 || - tp->rx_opt.eff_sacks) + tp->rx_opt.num_sacks || tp->rx_opt.dsack) return -1; /* Very simple search strategy: just double the MSS. */ -- cgit v1.2.3-71-gd317 From 8987691a4aa6622a1b58bb12c56abaf3d2098fad Mon Sep 17 00:00:00 2001 From: Inaky Perez-Gonzalez Date: Sat, 28 Feb 2009 23:42:50 +0000 Subject: wimax/i2400m: allow control of the base-station idle mode timeout For power saving reasons, WiMAX links can be put in idle mode while connected after a certain time of the link not being used for tx or rx. In this mode, the device pages the base-station regularly and when data is ready to be transmitted, the link is revived. 
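As a usage illustration only (not part of the patch), a hedged userspace sketch that drives the timeout knob introduced below. The attribute name i2400m_idle_timeout and the accepted values (0 to disable, otherwise 100-300000 ms in steps of 100) come from sysfs.c in this patch; the exact sysfs path is an assumption, since the attribute hangs off the WiMAX network interface's device directory and the interface name will vary.

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* assumed path; substitute your WiMAX interface name for wmx0 */
	const char *path = "/sys/class/net/wmx0/i2400m_idle_timeout";
	unsigned int msecs = 30000;	/* 0 disables idle mode negotiation */
	FILE *f = fopen(path, "w");

	if (f == NULL) {
		fprintf(stderr, "open %s: %s\n", path, strerror(errno));
		return 1;
	}
	fprintf(f, "%u\n", msecs);
	/* the store can fail (e.g. -ENOSYS on pre-1.4 firmware); with stdio
	 * buffering the error typically surfaces at fclose() */
	if (fclose(f) != 0) {
		fprintf(stderr, "write %s: %s\n", path, strerror(errno));
		return 1;
	}
	return 0;
}

Writing 0 disables idle mode, which is also what the driver does at init time when idle_mode_disabled is set.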
This patch allows the user to control the time the device has to be idle before it decides to go to idle mode from a sysfs interace. It also updates the initialization code to acknowledge the module variable 'idle_mode_disabled' when the firmware is a newer version (upcoming 1.4 vs 2.6.29's v1.3). The method for setting the idle mode timeout in the older firmwares is much more limited and can be only done at initialization time. Thus, the sysfs file will return -ENOSYS on older ones. Signed-off-by: Inaky Perez-Gonzalez Signed-off-by: David S. Miller --- drivers/net/wimax/i2400m/Makefile | 1 + drivers/net/wimax/i2400m/control.c | 98 ++++++++++++++++++++++++++++++--- drivers/net/wimax/i2400m/debug-levels.h | 1 + drivers/net/wimax/i2400m/driver.c | 10 ++++ drivers/net/wimax/i2400m/i2400m.h | 29 ++++++++++ drivers/net/wimax/i2400m/sysfs.c | 80 +++++++++++++++++++++++++++ include/linux/wimax/i2400m.h | 10 ++++ 7 files changed, 221 insertions(+), 8 deletions(-) create mode 100644 drivers/net/wimax/i2400m/sysfs.c (limited to 'include/linux') diff --git a/drivers/net/wimax/i2400m/Makefile b/drivers/net/wimax/i2400m/Makefile index 1696e936cf5a..5d9e018d31af 100644 --- a/drivers/net/wimax/i2400m/Makefile +++ b/drivers/net/wimax/i2400m/Makefile @@ -8,6 +8,7 @@ i2400m-y := \ driver.o \ fw.o \ op-rfkill.o \ + sysfs.o \ netdev.o \ tx.o \ rx.o diff --git a/drivers/net/wimax/i2400m/control.c b/drivers/net/wimax/i2400m/control.c index c8b3a68b72b8..c3968b240d69 100644 --- a/drivers/net/wimax/i2400m/control.c +++ b/drivers/net/wimax/i2400m/control.c @@ -1221,6 +1221,77 @@ none: EXPORT_SYMBOL_GPL(i2400m_set_init_config); +/** + * i2400m_set_idle_timeout - Set the device's idle mode timeout + * + * @i2400m: i2400m device descriptor + * + * @msecs: milliseconds for the timeout to enter idle mode. Between + * 100 to 300000 (5m); 0 to disable. In increments of 100. + * + * After this @msecs of the link being idle (no data being sent or + * received), the device will negotiate with the basestation entering + * idle mode for saving power. The connection is maintained, but + * getting out of it (done in tx.c) will require some negotiation, + * possible crypto re-handshake and a possible DHCP re-lease. + * + * Only available if fw_version >= 0x00090002. + * + * Returns: 0 if ok, < 0 errno code on error. 
+ */ +int i2400m_set_idle_timeout(struct i2400m *i2400m, unsigned msecs) +{ + int result; + struct device *dev = i2400m_dev(i2400m); + struct sk_buff *ack_skb; + struct { + struct i2400m_l3l4_hdr hdr; + struct i2400m_tlv_config_idle_timeout cit; + } *cmd; + const struct i2400m_l3l4_hdr *ack; + size_t ack_len; + char strerr[32]; + + result = -ENOSYS; + if (i2400m_le_v1_3(i2400m)) + goto error_alloc; + result = -ENOMEM; + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (cmd == NULL) + goto error_alloc; + cmd->hdr.type = cpu_to_le16(I2400M_MT_GET_STATE); + cmd->hdr.length = cpu_to_le16(sizeof(*cmd) - sizeof(cmd->hdr)); + cmd->hdr.version = cpu_to_le16(I2400M_L3L4_VERSION); + + cmd->cit.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_IDLE_TIMEOUT); + cmd->cit.hdr.length = cpu_to_le16(sizeof(cmd->cit.timeout)); + cmd->cit.timeout = cpu_to_le32(msecs); + + ack_skb = i2400m_msg_to_dev(i2400m, cmd, sizeof(*cmd)); + if (IS_ERR(ack_skb)) { + dev_err(dev, "Failed to issue 'set idle timeout' command: " + "%ld\n", PTR_ERR(ack_skb)); + result = PTR_ERR(ack_skb); + goto error_msg_to_dev; + } + ack = wimax_msg_data_len(ack_skb, &ack_len); + result = i2400m_msg_check_status(ack, strerr, sizeof(strerr)); + if (result < 0) { + dev_err(dev, "'set idle timeout' (0x%04x) command failed: " + "%d - %s\n", I2400M_MT_GET_STATE, result, strerr); + goto error_cmd_failed; + } + result = 0; + kfree_skb(ack_skb); +error_cmd_failed: +error_msg_to_dev: + kfree(cmd); +error_alloc: + return result; +} + + /** * i2400m_dev_initialize - Initialize the device once communications are ready * @@ -1239,19 +1310,28 @@ int i2400m_dev_initialize(struct i2400m *i2400m) int result; struct device *dev = i2400m_dev(i2400m); struct i2400m_tlv_config_idle_parameters idle_params; + struct i2400m_tlv_config_idle_timeout idle_timeout; const struct i2400m_tlv_hdr *args[9]; unsigned argc = 0; d_fnstart(3, dev, "(i2400m %p)\n", i2400m); - /* Useless for now...might change */ if (i2400m_idle_mode_disabled) { - idle_params.hdr.type = - cpu_to_le16(I2400M_TLV_CONFIG_IDLE_PARAMETERS); - idle_params.hdr.length = cpu_to_le16( - sizeof(idle_params) - sizeof(idle_params.hdr)); - idle_params.idle_timeout = 0; - idle_params.idle_paging_interval = 0; - args[argc++] = &idle_params.hdr; + if (i2400m_le_v1_3(i2400m)) { + idle_params.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_IDLE_PARAMETERS); + idle_params.hdr.length = cpu_to_le16( + sizeof(idle_params) - sizeof(idle_params.hdr)); + idle_params.idle_timeout = 0; + idle_params.idle_paging_interval = 0; + args[argc++] = &idle_params.hdr; + } else { + idle_timeout.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_IDLE_TIMEOUT); + idle_timeout.hdr.length = cpu_to_le16( + sizeof(idle_timeout) - sizeof(idle_timeout.hdr)); + idle_timeout.timeout = 0; + args[argc++] = &idle_timeout.hdr; + } } result = i2400m_set_init_config(i2400m, args, argc); if (result < 0) @@ -1264,6 +1344,8 @@ int i2400m_dev_initialize(struct i2400m *i2400m) */ result = i2400m_cmd_get_state(i2400m); error: + if (result < 0) + dev_err(dev, "failed to initialize the device: %d\n", result); d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); return result; } diff --git a/drivers/net/wimax/i2400m/debug-levels.h b/drivers/net/wimax/i2400m/debug-levels.h index 3183baa16a52..48fbfaa0d403 100644 --- a/drivers/net/wimax/i2400m/debug-levels.h +++ b/drivers/net/wimax/i2400m/debug-levels.h @@ -38,6 +38,7 @@ enum d_module { D_SUBMODULE_DECLARE(netdev), D_SUBMODULE_DECLARE(rfkill), D_SUBMODULE_DECLARE(rx), + D_SUBMODULE_DECLARE(sysfs), D_SUBMODULE_DECLARE(tx), }; diff 
--git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index 69a816e7c5db..f988771bfae0 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c @@ -662,6 +662,11 @@ int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags) wimax_state_change(wimax_dev, WIMAX_ST_UNINITIALIZED); /* Now setup all that requires a registered net and wimax device. */ + result = sysfs_create_group(&net_dev->dev.kobj, &i2400m_dev_attr_group); + if (result < 0) { + dev_err(dev, "cannot setup i2400m's sysfs: %d\n", result); + goto error_sysfs_setup; + } result = i2400m_debugfs_add(i2400m); if (result < 0) { dev_err(dev, "cannot setup i2400m's debugfs: %d\n", result); @@ -671,6 +676,9 @@ int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags) return result; error_debugfs_setup: + sysfs_remove_group(&i2400m->wimax_dev.net_dev->dev.kobj, + &i2400m_dev_attr_group); +error_sysfs_setup: wimax_dev_rm(&i2400m->wimax_dev); error_wimax_dev_add: i2400m_dev_stop(i2400m); @@ -702,6 +710,8 @@ void i2400m_release(struct i2400m *i2400m) netif_stop_queue(i2400m->wimax_dev.net_dev); i2400m_debugfs_rm(i2400m); + sysfs_remove_group(&i2400m->wimax_dev.net_dev->dev.kobj, + &i2400m_dev_attr_group); wimax_dev_rm(&i2400m->wimax_dev); i2400m_dev_stop(i2400m); unregister_netdev(i2400m->wimax_dev.net_dev); diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h index 5008cdb12b42..0c60d5c43007 100644 --- a/drivers/net/wimax/i2400m/i2400m.h +++ b/drivers/net/wimax/i2400m/i2400m.h @@ -585,6 +585,8 @@ unsigned i2400m_brh_get_signature(const struct i2400m_bootrom_header *hdr) * Driver / device setup and internal functions */ extern void i2400m_netdev_setup(struct net_device *net_dev); +extern int i2400m_sysfs_setup(struct device_driver *); +extern void i2400m_sysfs_release(struct device_driver *); extern int i2400m_tx_setup(struct i2400m *); extern void i2400m_wake_tx_work(struct work_struct *); extern void i2400m_tx_release(struct i2400m *); @@ -728,6 +730,7 @@ extern struct sk_buff *i2400m_get_device_info(struct i2400m *); extern int i2400m_firmware_check(struct i2400m *); extern int i2400m_set_init_config(struct i2400m *, const struct i2400m_tlv_hdr **, size_t); +extern int i2400m_set_idle_timeout(struct i2400m *, unsigned); static inline struct usb_endpoint_descriptor *usb_get_epd(struct usb_interface *iface, int ep) @@ -740,6 +743,32 @@ extern int i2400m_op_rfkill_sw_toggle(struct wimax_dev *, extern void i2400m_report_tlv_rf_switches_status( struct i2400m *, const struct i2400m_tlv_rf_switches_status *); +/* + * Helpers for firmware backwards compability + * + * As we aim to support at least the firmware version that was + * released with the previous kernel/driver release, some code will be + * conditionally executed depending on the firmware version. On each + * release, the code to support fw releases past the last two ones + * will be purged. + * + * By making it depend on this macros, it is easier to keep it a tab + * on what has to go and what not. 
+ */ +static inline +unsigned i2400m_le_v1_3(struct i2400m *i2400m) +{ + /* running fw is lower or v1.3 */ + return i2400m->fw_version <= 0x00090001; +} + +static inline +unsigned i2400m_ge_v1_4(struct i2400m *i2400m) +{ + /* running fw is higher or v1.4 */ + return i2400m->fw_version >= 0x00090002; +} + /* * Do a millisecond-sleep for allowing wireshark to dump all the data diff --git a/drivers/net/wimax/i2400m/sysfs.c b/drivers/net/wimax/i2400m/sysfs.c new file mode 100644 index 000000000000..1237109f251a --- /dev/null +++ b/drivers/net/wimax/i2400m/sysfs.c @@ -0,0 +1,80 @@ +/* + * Intel Wireless WiMAX Connection 2400m + * Sysfs interfaces to show driver and device information + * + * + * Copyright (C) 2007 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include +#include +#include +#include +#include "i2400m.h" + + +#define D_SUBMODULE sysfs +#include "debug-levels.h" + + +/* + * Set the idle timeout (msecs) + * + * FIXME: eventually this should be a common WiMAX stack method, but + * would like to wait to see how other devices manage it. + */ +static +ssize_t i2400m_idle_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + ssize_t result; + struct i2400m *i2400m = net_dev_to_i2400m(to_net_dev(dev)); + unsigned val; + + result = -EINVAL; + if (sscanf(buf, "%u\n", &val) != 1) + goto error_no_unsigned; + if (val != 0 && (val < 100 || val > 300000 || val % 100 != 0)) { + dev_err(dev, "idle_timeout: %u: invalid msecs specification; " + "valid values are 0, 100-300000 in 100 increments\n", + val); + goto error_bad_value; + } + result = i2400m_set_idle_timeout(i2400m, val); + if (result >= 0) + result = size; +error_no_unsigned: +error_bad_value: + return result; +} + +static +DEVICE_ATTR(i2400m_idle_timeout, S_IWUSR, + NULL, i2400m_idle_timeout_store); + +static +struct attribute *i2400m_dev_attrs[] = { + &dev_attr_i2400m_idle_timeout.attr, + NULL, +}; + +struct attribute_group i2400m_dev_attr_group = { + .name = NULL, /* we want them in the same directory */ + .attrs = i2400m_dev_attrs, +}; diff --git a/include/linux/wimax/i2400m.h b/include/linux/wimax/i2400m.h index 74198f5bb4dc..686eeb2b9704 100644 --- a/include/linux/wimax/i2400m.h +++ b/include/linux/wimax/i2400m.h @@ -381,6 +381,7 @@ enum i2400m_tlv { I2400M_TLV_RF_STATUS = 163, I2400M_TLV_DEVICE_RESET_TYPE = 132, I2400M_TLV_CONFIG_IDLE_PARAMETERS = 601, + I2400M_TLV_CONFIG_IDLE_TIMEOUT = 611, }; @@ -509,4 +510,13 @@ struct i2400m_tlv_media_status { __le32 media_status; } __attribute__((packed)); + +/* New in v1.4 */ +struct i2400m_tlv_config_idle_timeout { + struct i2400m_tlv_hdr hdr; + __le32 timeout; /* 100 to 300000 ms [5min], 100 increments + * 0 disabled */ +} __attribute__((packed)); + + #endif /* #ifndef __LINUX__WIMAX__I2400M_H__ */ -- cgit v1.2.3-71-gd317 From 347707baa77d273d79258303e00200d40cf3b323 Mon Sep 17 
00:00:00 2001 From: Kay Sievers Date: Sat, 28 Feb 2009 23:42:51 +0000 Subject: wimax: struct device - replace bus_id with dev_name(), dev_set_name() Cc: inaky.perez-gonzalez@intel.com Cc: linux-wimax@intel.com Acked-by: Greg Kroah-Hartman Signed-off-by: Kay Sievers Signed-off-by: Inaky Perez-Gonzalez Signed-off-by: David S. Miller --- drivers/net/wimax/i2400m/driver.c | 2 +- drivers/net/wimax/i2400m/usb-notif.c | 2 +- include/linux/wimax/debug.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index f988771bfae0..e4f1ce5bc294 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c @@ -618,7 +618,7 @@ int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags) d_fnstart(3, dev, "(i2400m %p)\n", i2400m); snprintf(wimax_dev->name, sizeof(wimax_dev->name), - "i2400m-%s:%s", dev->bus->name, dev->bus_id); + "i2400m-%s:%s", dev->bus->name, dev_name(dev)); i2400m->bm_cmd_buf = kzalloc(I2400M_BM_CMD_BUF_SIZE, GFP_KERNEL); if (i2400m->bm_cmd_buf == NULL) { diff --git a/drivers/net/wimax/i2400m/usb-notif.c b/drivers/net/wimax/i2400m/usb-notif.c index 9702c22b2497..6add27c3f35c 100644 --- a/drivers/net/wimax/i2400m/usb-notif.c +++ b/drivers/net/wimax/i2400m/usb-notif.c @@ -102,7 +102,7 @@ int i2400mu_notification_grok(struct i2400mu *i2400mu, const void *buf, dev_err(dev, "HW BUG? Unknown/unexpected data in notification " "message (%zu bytes)\n", buf_len); snprintf(prefix, sizeof(prefix), "%s %s: ", - dev_driver_string(dev) , dev->bus_id); + dev_driver_string(dev), dev_name(dev)); if (buf_len > 64) { print_hex_dump(KERN_ERR, prefix, DUMP_PREFIX_OFFSET, 8, 4, buf, 64, 0); diff --git a/include/linux/wimax/debug.h b/include/linux/wimax/debug.h index ba0c49399a83..c703e0340423 100644 --- a/include/linux/wimax/debug.h +++ b/include/linux/wimax/debug.h @@ -178,7 +178,7 @@ void __d_head(char *head, size_t head_size, WARN_ON(1); } else snprintf(head, head_size, "%s %s: ", - dev_driver_string(dev), dev->bus_id); + dev_driver_string(dev), dev_name(dev)); } -- cgit v1.2.3-71-gd317 From fd5c565c0c04d2716cfdac3f1de3c2261d6a457d Mon Sep 17 00:00:00 2001 From: Inaky Perez-Gonzalez Date: Sat, 28 Feb 2009 23:42:52 +0000 Subject: wimax/i2400m: support extended data RX protocol (no need to reallocate skbs) Newer i2400m firmwares (>= v1.4) extend the data RX protocol so that each packet has a 16 byte header. This header is mainly used to implement host reordeing (which is addressed in later commits). However, this header also allows us to overwrite it (once data has been extracted) with an Ethernet header and deliver to the networking stack without having to reallocate the skb (as it happened in fw <= v1.3) to make room for it. - control.c: indicate the device [dev_initialize()] that the driver wants to use the extended data RX protocol. Also involves adding the definition of the needed data types in include/linux/wimax/i2400m.h. - rx.c: handle the new payload type for the extended RX data protocol. Prepares the skb for delivery to netdev.c:i2400m_net_erx(). - netdev.c: Introduce i2400m_net_erx() that adds the fake ethernet address to a prepared skb and delivers it to the networking stack. - cleanup: in most instances in rx.c, the variable 'single' was renamed to 'single_last' for it better conveys its meaning. Signed-off-by: Inaky Perez-Gonzalez Signed-off-by: David S. 
Miller --- drivers/net/wimax/i2400m/control.c | 9 +++ drivers/net/wimax/i2400m/i2400m.h | 2 + drivers/net/wimax/i2400m/netdev.c | 104 +++++++++++++++++++++++++-------- drivers/net/wimax/i2400m/rx.c | 117 ++++++++++++++++++++++++++++++++++--- include/linux/wimax/i2400m.h | 35 +++++++++++ 5 files changed, 234 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wimax/i2400m/control.c b/drivers/net/wimax/i2400m/control.c index c3968b240d69..4073c3e93bd4 100644 --- a/drivers/net/wimax/i2400m/control.c +++ b/drivers/net/wimax/i2400m/control.c @@ -1311,6 +1311,7 @@ int i2400m_dev_initialize(struct i2400m *i2400m) struct device *dev = i2400m_dev(i2400m); struct i2400m_tlv_config_idle_parameters idle_params; struct i2400m_tlv_config_idle_timeout idle_timeout; + struct i2400m_tlv_config_d2h_data_format df; const struct i2400m_tlv_hdr *args[9]; unsigned argc = 0; @@ -1333,6 +1334,14 @@ int i2400m_dev_initialize(struct i2400m *i2400m) args[argc++] = &idle_timeout.hdr; } } + if (i2400m_ge_v1_4(i2400m)) { + df.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_D2H_DATA_FORMAT); + df.hdr.length = cpu_to_le16( + sizeof(df) - sizeof(df.hdr)); + df.format = 1; + args[argc++] = &df.hdr; + } result = i2400m_set_init_config(i2400m, args, argc); if (result < 0) goto error; diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h index 0c60d5c43007..125c30594e63 100644 --- a/drivers/net/wimax/i2400m/i2400m.h +++ b/drivers/net/wimax/i2400m/i2400m.h @@ -593,6 +593,8 @@ extern void i2400m_tx_release(struct i2400m *); extern void i2400m_net_rx(struct i2400m *, struct sk_buff *, unsigned, const void *, int); +extern void i2400m_net_erx(struct i2400m *, struct sk_buff *, + enum i2400m_cs); enum i2400m_pt; extern int i2400m_tx(struct i2400m *, const void *, size_t, enum i2400m_pt); diff --git a/drivers/net/wimax/i2400m/netdev.c b/drivers/net/wimax/i2400m/netdev.c index be8be4d0709c..2bdd0cdbb319 100644 --- a/drivers/net/wimax/i2400m/netdev.c +++ b/drivers/net/wimax/i2400m/netdev.c @@ -28,13 +28,12 @@ * space and from the other side. The world is (sadly) configured to * take in only Ethernet devices... * - * Because of this, currently there is an copy-each-rxed-packet - * overhead on the RX path. Each IP packet has to be reallocated to - * add an ethernet header (as there is no space in what we get from - * the device). This is a known drawback and coming versions of the - * device's firmware are being changed to add header space that can be - * used to insert the ethernet header without having to reallocate and - * copy. + * Because of this, when using firmwares <= v1.3, there is an + * copy-each-rxed-packet overhead on the RX path. Each IP packet has + * to be reallocated to add an ethernet header (as there is no space + * in what we get from the device). This is a known drawback and + * firmwares >= 1.4 add header space that can be used to insert the + * ethernet header without having to reallocate and copy. * * TX error handling is tricky; because we have to FIFO/queue the * buffers for transmission (as the hardware likes it aggregated), we @@ -67,7 +66,9 @@ * i2400m_tx_timeout Called when the device times out * * i2400m_net_rx Called by the RX code when a data frame is - * available. + * available (firmware <= 1.3) + * i2400m_net_erx Called by the RX code when a data frame is + * available (firmware >= 1.4). * i2400m_netdev_setup Called to setup all the netdev stuff from * alloc_netdev. 
*/ @@ -396,30 +397,18 @@ void i2400m_tx_timeout(struct net_device *net_dev) * Create a fake ethernet header * * For emulating an ethernet device, every received IP header has to - * be prefixed with an ethernet header. - * - * What we receive has (potentially) many IP packets concatenated with - * no ETH_HLEN bytes prefixed. Thus there is no space for an eth - * header. - * - * We would have to reallocate or do ugly fragment tricks in order to - * add it. - * - * But what we do is use the header space of the RX transaction - * (*msg_hdr) as we don't need it anymore; then we'll point all the - * data skbs there, as they share the same backing store. - * - * We only support IPv4 for v3 firmware. + * be prefixed with an ethernet header. Fake it with the given + * protocol. */ static void i2400m_rx_fake_eth_header(struct net_device *net_dev, - void *_eth_hdr) + void *_eth_hdr, int protocol) { struct ethhdr *eth_hdr = _eth_hdr; memcpy(eth_hdr->h_dest, net_dev->dev_addr, sizeof(eth_hdr->h_dest)); memset(eth_hdr->h_source, 0, sizeof(eth_hdr->h_dest)); - eth_hdr->h_proto = cpu_to_be16(ETH_P_IP); + eth_hdr->h_proto = cpu_to_be16(protocol); } @@ -432,6 +421,13 @@ void i2400m_rx_fake_eth_header(struct net_device *net_dev, * @buf: pointer to the buffer containing the data * @len: buffer's length * + * This is only used now for the v1.3 firmware. It will be deprecated + * in >= 2.6.31. + * + * Note that due to firmware limitations, we don't have space to add + * an ethernet header, so we need to copy each packet. Firmware + * versions >= v1.4 fix this [see i2400m_net_erx()]. + * * We just clone the skb and set it up so that it's skb->data pointer * points to "buf" and it's length. * @@ -478,7 +474,7 @@ void i2400m_net_rx(struct i2400m *i2400m, struct sk_buff *skb_rx, memcpy(skb_put(skb, buf_len), buf, buf_len); } i2400m_rx_fake_eth_header(i2400m->wimax_dev.net_dev, - skb->data - ETH_HLEN); + skb->data - ETH_HLEN, ETH_P_IP); skb_set_mac_header(skb, -ETH_HLEN); skb->dev = i2400m->wimax_dev.net_dev; skb->protocol = htons(ETH_P_IP); @@ -493,6 +489,64 @@ error_skb_realloc: i2400m, buf, buf_len); } + +/* + * i2400m_net_erx - pass a network packet to the stack (extended version) + * + * @i2400m: device descriptor + * @skb: the skb where the packet is - the skb should be set to point + * at the IP packet; this function will add ethernet headers if + * needed. + * @cs: packet type + * + * This is only used now for firmware >= v1.4. Note it is quite + * similar to i2400m_net_rx() (used only for v1.3 firmware). + * + * This function is normally run from a thread context. However, we + * still use netif_rx() instead of netif_receive_skb() as was + * recommended in the mailing list. Reason is in some stress tests + * when sending/receiving a lot of data we seem to hit a softlock in + * the kernel's TCP implementation [aroudn tcp_delay_timer()]. Using + * netif_rx() took care of the issue. + * + * This is, of course, still open to do more research on why running + * with netif_receive_skb() hits this softlock. FIXME. 
+ */ +void i2400m_net_erx(struct i2400m *i2400m, struct sk_buff *skb, + enum i2400m_cs cs) +{ + struct net_device *net_dev = i2400m->wimax_dev.net_dev; + struct device *dev = i2400m_dev(i2400m); + int protocol; + + d_fnstart(2, dev, "(i2400m %p skb %p [%zu] cs %d)\n", + i2400m, skb, skb->len, cs); + switch(cs) { + case I2400M_CS_IPV4_0: + case I2400M_CS_IPV4: + protocol = ETH_P_IP; + i2400m_rx_fake_eth_header(i2400m->wimax_dev.net_dev, + skb->data - ETH_HLEN, ETH_P_IP); + skb_set_mac_header(skb, -ETH_HLEN); + skb->dev = i2400m->wimax_dev.net_dev; + skb->protocol = htons(ETH_P_IP); + net_dev->stats.rx_packets++; + net_dev->stats.rx_bytes += skb->len; + break; + default: + dev_err(dev, "ERX: BUG? CS type %u unsupported\n", cs); + goto error; + + } + d_printf(3, dev, "ERX: receiving %d bytes to the network stack\n", + skb->len); + d_dump(4, dev, skb->data, skb->len); + netif_rx_ni(skb); /* see notes in function header */ +error: + d_fnend(2, dev, "(i2400m %p skb %p [%zu] cs %d) = void\n", + i2400m, skb, skb->len, cs); +} + static const struct net_device_ops i2400m_netdev_ops = { .ndo_open = i2400m_open, .ndo_stop = i2400m_stop, diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c index c62b8c564161..cd525066d4b7 100644 --- a/drivers/net/wimax/i2400m/rx.c +++ b/drivers/net/wimax/i2400m/rx.c @@ -69,6 +69,22 @@ * See tx.c for a deeper description on alignment requirements and * other fun facts of it. * + * DATA PACKETS + * + * In firmwares <= v1.3, data packets have no header for RX, but they + * do for TX (currently unused). + * + * In firmware >= 1.4, RX packets have an extended header (16 + * bytes). This header conveys information for management of host + * reordering of packets (the device offloads storage of the packets + * for reordering to the host). + * + * Currently this information is not used as the current code doesn't + * enable host reordering. + * + * The header is used as dummy space to emulate an ethernet header and + * thus be able to act as an ethernet device without having to reallocate. + * * ROADMAP * * i2400m_rx @@ -76,6 +92,8 @@ * i2400m_rx_pl_descr_check * i2400m_rx_payload * i2400m_net_rx + * i2400m_rx_edata + * i2400m_net_erx * i2400m_rx_ctl * i2400m_msg_size_check * i2400m_report_hook_work [in a workqueue] @@ -264,8 +282,6 @@ error_check: } - - /* * Receive and send up a trace * @@ -314,32 +330,112 @@ error_check: return; } +/* + * Receive and send up an extended data packet + * + * @i2400m: device descriptor + * @skb_rx: skb that contains the extended data packet + * @single_last: 1 if the payload is the only one or the last one of + * the skb. + * @payload: pointer to the packet's data inside the skb + * @size: size of the payload + * + * Starting in v1.4 of the i2400m's firmware, the device can send data + * packets to the host in an extended format that; this incudes a 16 + * byte header (struct i2400m_pl_edata_hdr). Using this header's space + * we can fake ethernet headers for ethernet device emulation without + * having to copy packets around. + * + * This function handles said path. 
+ */ +static +void i2400m_rx_edata(struct i2400m *i2400m, struct sk_buff *skb_rx, + unsigned single_last, const void *payload, size_t size) +{ + struct device *dev = i2400m_dev(i2400m); + const struct i2400m_pl_edata_hdr *hdr = payload; + struct net_device *net_dev = i2400m->wimax_dev.net_dev; + struct sk_buff *skb; + enum i2400m_cs cs; + unsigned reorder_needed; + + d_fnstart(4, dev, "(i2400m %p skb_rx %p single %u payload %p " + "size %zu)\n", i2400m, skb_rx, single_last, payload, size); + if (size < sizeof(*hdr)) { + dev_err(dev, "ERX: HW BUG? message with short header (%zu " + "vs %zu bytes expected)\n", size, sizeof(*hdr)); + goto error; + } + reorder_needed = le32_to_cpu(hdr->reorder & I2400M_REORDER_NEEDED); + cs = hdr->cs; + if (reorder_needed) { + dev_err(dev, "ERX: HW BUG? reorder needed, it was disabled\n"); + goto error; + } + /* ok, so now decide if we want to clone or reuse the skb, + * pull and trim it so the beginning is the space for the eth + * header and pass it to i2400m_net_erx() for the stack */ + if (single_last) { + skb = skb_get(skb_rx); + d_printf(3, dev, "ERX: reusing single payload skb %p\n", skb); + } else { + skb = skb_clone(skb_rx, GFP_KERNEL); + d_printf(3, dev, "ERX: cloning %p\n", skb); + if (skb == NULL) { + dev_err(dev, "ERX: no memory to clone skb\n"); + net_dev->stats.rx_dropped++; + goto error_skb_clone; + } + } + /* now we have to pull and trim so that the skb points to the + * beginning of the IP packet; the netdev part will add the + * ethernet header as needed. */ + BUILD_BUG_ON(ETH_HLEN > sizeof(*hdr)); + skb_pull(skb, payload + sizeof(*hdr) - (void *) skb->data); + skb_trim(skb, (void *) skb_end_pointer(skb) - payload + sizeof(*hdr)); + i2400m_net_erx(i2400m, skb, cs); +error_skb_clone: +error: + d_fnend(4, dev, "(i2400m %p skb_rx %p single %u payload %p " + "size %zu) = void\n", i2400m, skb_rx, single_last, payload, size); + return; +} + + + /* * Act on a received payload * * @i2400m: device instance * @skb_rx: skb where the transaction was received - * @single: 1 if there is only one payload, 0 otherwise + * @single_last: 1 this is the only payload or the last one (so the + * skb can be reused instead of cloned). * @pld: payload descriptor * @payload: payload data * * Upon reception of a payload, look at its guts in the payload - * descriptor and decide what to do with it. + * descriptor and decide what to do with it. If it is a single payload + * skb or if the last skb is a data packet, the skb will be referenced + * and modified (so it doesn't have to be cloned). 
*/ static void i2400m_rx_payload(struct i2400m *i2400m, struct sk_buff *skb_rx, - unsigned single, const struct i2400m_pld *pld, + unsigned single_last, const struct i2400m_pld *pld, const void *payload) { struct device *dev = i2400m_dev(i2400m); size_t pl_size = i2400m_pld_size(pld); enum i2400m_pt pl_type = i2400m_pld_type(pld); + d_printf(7, dev, "RX: received payload type %u, %zu bytes\n", + pl_type, pl_size); + d_dump(8, dev, payload, pl_size); + switch (pl_type) { case I2400M_PT_DATA: d_printf(3, dev, "RX: data payload %zu bytes\n", pl_size); - i2400m_net_rx(i2400m, skb_rx, single, payload, pl_size); + i2400m_net_rx(i2400m, skb_rx, single_last, payload, pl_size); break; case I2400M_PT_CTRL: i2400m_rx_ctl(i2400m, skb_rx, payload, pl_size); @@ -347,6 +443,10 @@ void i2400m_rx_payload(struct i2400m *i2400m, struct sk_buff *skb_rx, case I2400M_PT_TRACE: i2400m_rx_trace(i2400m, payload, pl_size); break; + case I2400M_PT_EDATA: + d_printf(3, dev, "ERX: data payload %zu bytes\n", pl_size); + i2400m_rx_edata(i2400m, skb_rx, single_last, payload, pl_size); + break; default: /* Anything else shouldn't come to the host */ if (printk_ratelimit()) dev_err(dev, "RX: HW BUG? unexpected payload type %u\n", @@ -474,7 +574,7 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) const struct i2400m_msg_hdr *msg_hdr; size_t pl_itr, pl_size, skb_len; unsigned long flags; - unsigned num_pls; + unsigned num_pls, single_last; skb_len = skb->len; d_fnstart(4, dev, "(i2400m %p skb %p [size %zu])\n", @@ -503,7 +603,8 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) pl_itr, skb->len); if (result < 0) goto error_pl_descr_check; - i2400m_rx_payload(i2400m, skb, num_pls == 1, &msg_hdr->pld[i], + single_last = num_pls == 1 || i == num_pls - 1; + i2400m_rx_payload(i2400m, skb, single_last, &msg_hdr->pld[i], skb->data + pl_itr); pl_itr += ALIGN(pl_size, I2400M_PL_PAD); cond_resched(); /* Don't monopolize */ diff --git a/include/linux/wimax/i2400m.h b/include/linux/wimax/i2400m.h index 686eeb2b9704..ad36e073a70c 100644 --- a/include/linux/wimax/i2400m.h +++ b/include/linux/wimax/i2400m.h @@ -207,6 +207,7 @@ enum i2400m_pt { I2400M_PT_TRACE, /* For device debug */ I2400M_PT_RESET_WARM, /* device reset */ I2400M_PT_RESET_COLD, /* USB[transport] reset, like reconnect */ + I2400M_PT_EDATA, /* Extended RX data */ I2400M_PT_ILLEGAL }; @@ -221,6 +222,32 @@ struct i2400m_pl_data_hdr { } __attribute__((packed)); +/* + * Payload for an extended data packet + * + * New in v1.4 + * + * @cs: the type of data in the packet, as defined per (802.16e + * T11.13.19.1). Currently only 2 (IPv4 packet) supported. + * + * This is prefixed to each and every INCOMING DATA packet. 
+ */ +struct i2400m_pl_edata_hdr { + __le32 reorder; + __u8 cs; + __u8 reserved[11]; +} __attribute__((packed)); + +enum i2400m_cs { + I2400M_CS_IPV4_0 = 0, + I2400M_CS_IPV4 = 2, +}; + +enum i2400m_reorder { + I2400M_REORDER_NEEDED = 0x01, +}; + + /* Misc constants */ enum { I2400M_PL_PAD = 16, /* Payload data size alignment */ @@ -382,6 +409,7 @@ enum i2400m_tlv { I2400M_TLV_DEVICE_RESET_TYPE = 132, I2400M_TLV_CONFIG_IDLE_PARAMETERS = 601, I2400M_TLV_CONFIG_IDLE_TIMEOUT = 611, + I2400M_TLV_CONFIG_D2H_DATA_FORMAT = 614, }; @@ -518,5 +546,12 @@ struct i2400m_tlv_config_idle_timeout { * 0 disabled */ } __attribute__((packed)); +/* New in v1.4 -- for backward compat, will be removed */ +struct i2400m_tlv_config_d2h_data_format { + struct i2400m_tlv_hdr hdr; + __u8 format; /* 0 old format, 1 enhanced */ + __u8 reserved[3]; +} __attribute__((packed)); + #endif /* #ifndef __LINUX__WIMAX__I2400M_H__ */ -- cgit v1.2.3-71-gd317 From c747583d19d5d5147a9f0eae480c1fdbc84c4252 Mon Sep 17 00:00:00 2001 From: Inaky Perez-Gonzalez Date: Sat, 28 Feb 2009 23:42:54 +0000 Subject: wimax/i2400m: implement RX reorder support Allow the device to give the driver RX data with reorder information. When that is done, the device will indicate the driver if a packet has to be held in a (sorted) queue. It will also tell the driver when held packets have to be released to the OS. This is done to improve the WiMAX-protocol level retransmission support when missing frames are detected. The code docs provide details about the implementation. In general, this just hooks into the RX path in rx.c; if a packet with the reorder bit in the RX header is detected, the reorder information in the header is extracted and one of the four main reorder operations are executed. In one case (queue) no packet will be delivered to the networking stack, just queued, whereas in the others (reset, update_ws and queue_update_ws), queued packet might be delivered depending on the window start for the specific queue. The modifications to files other than rx.c are: - control.c: during device initialization, enable reordering support if the rx_reorder_disabled module parameter is not enabled - driver.c: expose a rx_reorder_disable module parameter and call i2400m_rx_setup/release() to initialize/shutdown RX reorder support. - i2400m.h: introduce members in 'struct i2400m' needed for implementing reorder support. - linux/i2400m.h: introduce TLVs, commands and constant definitions related to RX reorder Last but not least, the rx reorder code includes an small circular log where the last N reorder operations are recorded to be displayed in case of inconsistency. Otherwise diagnosing issues would be almost impossible. Signed-off-by: Inaky Perez-Gonzalez Signed-off-by: David S. 
Miller --- drivers/net/wimax/i2400m/control.c | 14 + drivers/net/wimax/i2400m/driver.c | 11 + drivers/net/wimax/i2400m/i2400m.h | 19 +- drivers/net/wimax/i2400m/rx.c | 677 +++++++++++++++++++++++++++++++++++-- include/linux/wimax/i2400m.h | 32 +- 5 files changed, 723 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wimax/i2400m/control.c b/drivers/net/wimax/i2400m/control.c index 4073c3e93bd4..b3cadb626fe0 100644 --- a/drivers/net/wimax/i2400m/control.c +++ b/drivers/net/wimax/i2400m/control.c @@ -1312,10 +1312,12 @@ int i2400m_dev_initialize(struct i2400m *i2400m) struct i2400m_tlv_config_idle_parameters idle_params; struct i2400m_tlv_config_idle_timeout idle_timeout; struct i2400m_tlv_config_d2h_data_format df; + struct i2400m_tlv_config_dl_host_reorder dlhr; const struct i2400m_tlv_hdr *args[9]; unsigned argc = 0; d_fnstart(3, dev, "(i2400m %p)\n", i2400m); + /* Disable idle mode? (enabled by default) */ if (i2400m_idle_mode_disabled) { if (i2400m_le_v1_3(i2400m)) { idle_params.hdr.type = @@ -1335,12 +1337,24 @@ int i2400m_dev_initialize(struct i2400m *i2400m) } } if (i2400m_ge_v1_4(i2400m)) { + /* Enable extended RX data format? */ df.hdr.type = cpu_to_le16(I2400M_TLV_CONFIG_D2H_DATA_FORMAT); df.hdr.length = cpu_to_le16( sizeof(df) - sizeof(df.hdr)); df.format = 1; args[argc++] = &df.hdr; + + /* Enable RX data reordering? + * (switch flipped in rx.c:i2400m_rx_setup() after fw upload) */ + if (i2400m->rx_reorder) { + dlhr.hdr.type = + cpu_to_le16(I2400M_TLV_CONFIG_DL_HOST_REORDER); + dlhr.hdr.length = cpu_to_le16( + sizeof(dlhr) - sizeof(dlhr.hdr)); + dlhr.reorder = 1; + args[argc++] = &dlhr.hdr; + } } result = i2400m_set_init_config(i2400m, args, argc); if (result < 0) diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index e4f1ce5bc294..07a54bad237b 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c @@ -76,6 +76,11 @@ MODULE_PARM_DESC(idle_mode_disabled, "If true, the device will not enable idle mode negotiation " "with the base station (when connected) to save power."); +int i2400m_rx_reorder_disabled; /* 0 (rx reorder enabled) by default */ +module_param_named(rx_reorder_disabled, i2400m_rx_reorder_disabled, int, 0644); +MODULE_PARM_DESC(rx_reorder_disabled, + "If true, RX reordering will be disabled."); + /** * i2400m_queue_work - schedule work on a i2400m's queue * @@ -396,6 +401,9 @@ retry: result = i2400m_tx_setup(i2400m); if (result < 0) goto error_tx_setup; + result = i2400m_rx_setup(i2400m); + if (result < 0) + goto error_rx_setup; result = i2400m->bus_dev_start(i2400m); if (result < 0) goto error_bus_dev_start; @@ -430,6 +438,8 @@ error_fw_check: error_create_workqueue: i2400m->bus_dev_stop(i2400m); error_bus_dev_start: + i2400m_rx_release(i2400m); +error_rx_setup: i2400m_tx_release(i2400m); error_tx_setup: error_bootstrap: @@ -477,6 +487,7 @@ void __i2400m_dev_stop(struct i2400m *i2400m) i2400m->ready = 0; destroy_workqueue(i2400m->work_queue); i2400m->bus_dev_stop(i2400m); + i2400m_rx_release(i2400m); i2400m_tx_release(i2400m); wimax_state_change(wimax_dev, WIMAX_ST_DOWN); d_fnend(3, dev, "(i2400m %p) = 0\n", i2400m); diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h index 125c30594e63..3ae2df38b59a 100644 --- a/drivers/net/wimax/i2400m/i2400m.h +++ b/drivers/net/wimax/i2400m/i2400m.h @@ -174,6 +174,7 @@ enum i2400m_reset_type { }; struct i2400m_reset_ctx; +struct i2400m_roq; /** * struct i2400m - descriptor for an Intel 2400m @@ -257,6 
+258,9 @@ struct i2400m_reset_ctx; * force this to be the first field so that we can get from * netdev_priv() the right pointer. * + * @rx_reorder: 1 if RX reordering is enabled; this can only be + * set at probe time. + * * @state: device's state (as reported by it) * * @state_wq: waitqueue that is woken up whenever the state changes @@ -313,6 +317,12 @@ struct i2400m_reset_ctx; * * @rx_size_max: buggest RX message received. * + * @rx_roq: RX ReOrder queues. (fw >= v1.4) When packets are received + * out of order, the device will ask the driver to hold certain + * packets until the ones that are received out of order can be + * delivered. Then the driver can release them to the host. See + * drivers/net/i2400m/rx.c for details. + * * @init_mutex: Mutex used for serializing the device bringup * sequence; this way if the device reboots in the middle, we * don't try to do a bringup again while we are tearing down the @@ -377,6 +387,7 @@ struct i2400m { unsigned boot_mode:1; /* is the device in boot mode? */ unsigned sboot:1; /* signed or unsigned fw boot */ unsigned ready:1; /* all probing steps done */ + unsigned rx_reorder:1; /* RX reorder is enabled */ u8 trace_msg_from_user; /* echo rx msgs to 'trace' pipe */ /* typed u8 so debugfs/u8 can tweak */ enum i2400m_system_state state; @@ -405,10 +416,11 @@ struct i2400m { unsigned tx_pl_num, tx_pl_max, tx_pl_min, tx_num, tx_size_acc, tx_size_min, tx_size_max; - /* RX stats */ + /* RX stuff */ spinlock_t rx_lock; /* protect RX state */ unsigned rx_pl_num, rx_pl_max, rx_pl_min, rx_num, rx_size_acc, rx_size_min, rx_size_max; + struct i2400m_roq *rx_roq; /* not under rx_lock! */ struct mutex msg_mutex; /* serialize command execution */ struct completion msg_completion; @@ -442,6 +454,7 @@ void i2400m_init(struct i2400m *i2400m) wimax_dev_init(&i2400m->wimax_dev); i2400m->boot_mode = 1; + i2400m->rx_reorder = 1; init_waitqueue_head(&i2400m->state_wq); spin_lock_init(&i2400m->tx_lock); @@ -591,6 +604,9 @@ extern int i2400m_tx_setup(struct i2400m *); extern void i2400m_wake_tx_work(struct work_struct *); extern void i2400m_tx_release(struct i2400m *); +extern int i2400m_rx_setup(struct i2400m *); +extern void i2400m_rx_release(struct i2400m *); + extern void i2400m_net_rx(struct i2400m *, struct sk_buff *, unsigned, const void *, int); extern void i2400m_net_erx(struct i2400m *, struct sk_buff *, @@ -788,6 +804,7 @@ void __i2400m_msleep(unsigned ms) /* Module parameters */ extern int i2400m_idle_mode_disabled; +extern int i2400m_rx_reorder_disabled; #endif /* #ifndef __I2400M_H__ */ diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c index cd525066d4b7..02419bfd64b5 100644 --- a/drivers/net/wimax/i2400m/rx.c +++ b/drivers/net/wimax/i2400m/rx.c @@ -39,7 +39,7 @@ * - Use skb_clone(), break up processing in chunks * - Split transport/device specific * - Make buffer size dynamic to exert less memory pressure - * + * - RX reorder support * * This handles the RX path. * @@ -77,14 +77,42 @@ * In firmware >= 1.4, RX packets have an extended header (16 * bytes). This header conveys information for management of host * reordering of packets (the device offloads storage of the packets - * for reordering to the host). - * - * Currently this information is not used as the current code doesn't - * enable host reordering. + * for reordering to the host). Read below for more information. * * The header is used as dummy space to emulate an ethernet header and * thus be able to act as an ethernet device without having to reallocate. 
* + * DATA RX REORDERING + * + * Starting in firmware v1.4, the device can deliver packets for + * delivery with special reordering information; this allows it to + * more effectively do packet management when some frames were lost in + * the radio traffic. + * + * Thus, for RX packets that come out of order, the device gives the + * driver enough information to queue them properly and then at some + * point, the signal to deliver the whole (or part) of the queued + * packets to the networking stack. There are 16 such queues. + * + * This only happens when a packet comes in with the "need reorder" + * flag set in the RX header. When such bit is set, the following + * operations might be indicated: + * + * - reset queue: send all queued packets to the OS + * + * - queue: queue a packet + * + * - update ws: update the queue's window start and deliver queued + * packets that meet the criteria + * + * - queue & update ws: queue a packet, update the window start and + * deliver queued packets that meet the criteria + * + * (delivery criteria: the packet's [normalized] sequence number is + * lower than the new [normalized] window start). + * + * See the i2400m_roq_*() functions for details. + * * ROADMAP * * i2400m_rx @@ -94,6 +122,17 @@ * i2400m_net_rx * i2400m_rx_edata * i2400m_net_erx + * i2400m_roq_reset + * i2400m_net_erx + * i2400m_roq_queue + * __i2400m_roq_queue + * i2400m_roq_update_ws + * __i2400m_roq_update_ws + * i2400m_net_erx + * i2400m_roq_queue_update_ws + * __i2400m_roq_queue + * __i2400m_roq_update_ws + * i2400m_net_erx * i2400m_rx_ctl * i2400m_msg_size_check * i2400m_report_hook_work [in a workqueue] @@ -330,6 +369,469 @@ error_check: return; } + +/* + * Reorder queue data stored on skb->cb while the skb is queued in the + * reorder queues. + */ +struct i2400m_roq_data { + unsigned sn; /* Serial number for the skb */ + enum i2400m_cs cs; /* packet type for the skb */ +}; + + +/* + * ReOrder Queue + * + * @ws: Window Start; sequence number where the current window start + * is for this queue + * @queue: the skb queue itself + * @log: circular ring buffer used to log information about the + * reorder process in this queue that can be displayed in case of + * error to help diagnose it. + * + * This is the head for a list of skbs. In the skb->cb member of the + * skb when queued here contains a 'struct i2400m_roq_data' were we + * store the sequence number (sn) and the cs (packet type) coming from + * the RX payload header from the device. + */ +struct i2400m_roq +{ + unsigned ws; + struct sk_buff_head queue; + struct i2400m_roq_log *log; +}; + + +static +void __i2400m_roq_init(struct i2400m_roq *roq) +{ + roq->ws = 0; + skb_queue_head_init(&roq->queue); +} + + +static +unsigned __i2400m_roq_index(struct i2400m *i2400m, struct i2400m_roq *roq) +{ + return ((unsigned long) roq - (unsigned long) i2400m->rx_roq) + / sizeof(*roq); +} + + +/* + * Normalize a sequence number based on the queue's window start + * + * nsn = (sn - ws) % 2048 + * + * Note that if @sn < @roq->ws, we still need a positive number; %'s + * sign is implementation specific, so we normalize it by adding 2048 + * to bring it to be positive. + */ +static +unsigned __i2400m_roq_nsn(struct i2400m_roq *roq, unsigned sn) +{ + int r; + r = ((int) sn - (int) roq->ws) % 2048; + if (r < 0) + r += 2048; + return r; +} + + +/* + * Circular buffer to keep the last N reorder operations + * + * In case something fails, dumb then to try to come up with what + * happened. 
+ */ +enum { + I2400M_ROQ_LOG_LENGTH = 32, +}; + +struct i2400m_roq_log { + struct i2400m_roq_log_entry { + enum i2400m_ro_type type; + unsigned ws, count, sn, nsn, new_ws; + } entry[I2400M_ROQ_LOG_LENGTH]; + unsigned in, out; +}; + + +/* Print a log entry */ +static +void i2400m_roq_log_entry_print(struct i2400m *i2400m, unsigned index, + unsigned e_index, + struct i2400m_roq_log_entry *e) +{ + struct device *dev = i2400m_dev(i2400m); + + switch(e->type) { + case I2400M_RO_TYPE_RESET: + dev_err(dev, "q#%d reset ws %u cnt %u sn %u/%u" + " - new nws %u\n", + index, e->ws, e->count, e->sn, e->nsn, e->new_ws); + break; + case I2400M_RO_TYPE_PACKET: + dev_err(dev, "q#%d queue ws %u cnt %u sn %u/%u\n", + index, e->ws, e->count, e->sn, e->nsn); + break; + case I2400M_RO_TYPE_WS: + dev_err(dev, "q#%d update_ws ws %u cnt %u sn %u/%u" + " - new nws %u\n", + index, e->ws, e->count, e->sn, e->nsn, e->new_ws); + break; + case I2400M_RO_TYPE_PACKET_WS: + dev_err(dev, "q#%d queue_update_ws ws %u cnt %u sn %u/%u" + " - new nws %u\n", + index, e->ws, e->count, e->sn, e->nsn, e->new_ws); + break; + default: + dev_err(dev, "q#%d BUG? entry %u - unknown type %u\n", + index, e_index, e->type); + break; + } +} + + +static +void i2400m_roq_log_add(struct i2400m *i2400m, + struct i2400m_roq *roq, enum i2400m_ro_type type, + unsigned ws, unsigned count, unsigned sn, + unsigned nsn, unsigned new_ws) +{ + struct i2400m_roq_log_entry *e; + unsigned cnt_idx; + int index = __i2400m_roq_index(i2400m, roq); + + /* if we run out of space, we eat from the end */ + if (roq->log->in - roq->log->out == I2400M_ROQ_LOG_LENGTH) + roq->log->out++; + cnt_idx = roq->log->in++ % I2400M_ROQ_LOG_LENGTH; + e = &roq->log->entry[cnt_idx]; + + e->type = type; + e->ws = ws; + e->count = count; + e->sn = sn; + e->nsn = nsn; + e->new_ws = new_ws; + + if (d_test(1)) + i2400m_roq_log_entry_print(i2400m, index, cnt_idx, e); +} + + +/* Dump all the entries in the FIFO and reinitialize it */ +static +void i2400m_roq_log_dump(struct i2400m *i2400m, struct i2400m_roq *roq) +{ + unsigned cnt, cnt_idx; + struct i2400m_roq_log_entry *e; + int index = __i2400m_roq_index(i2400m, roq); + + BUG_ON(roq->log->out > roq->log->in); + for (cnt = roq->log->out; cnt < roq->log->in; cnt++) { + cnt_idx = cnt % I2400M_ROQ_LOG_LENGTH; + e = &roq->log->entry[cnt_idx]; + i2400m_roq_log_entry_print(i2400m, index, cnt_idx, e); + memset(e, 0, sizeof(*e)); + } + roq->log->in = roq->log->out = 0; +} + + +/* + * Backbone for the queuing of an skb (by normalized sequence number) + * + * @i2400m: device descriptor + * @roq: reorder queue where to add + * @skb: the skb to add + * @sn: the sequence number of the skb + * @nsn: the normalized sequence number of the skb (pre-computed by the + * caller from the @sn and @roq->ws). + * + * We try first a couple of quick cases: + * + * - the queue is empty + * - the skb would be appended to the queue + * + * These will be the most common operations. + * + * If these fail, then we have to do a sorted insertion in the queue, + * which is the slowest path. + * + * We don't have to acquire a reference count as we are going to own it. 
+ */ +static +void __i2400m_roq_queue(struct i2400m *i2400m, struct i2400m_roq *roq, + struct sk_buff *skb, unsigned sn, unsigned nsn) +{ + struct device *dev = i2400m_dev(i2400m); + struct sk_buff *skb_itr; + struct i2400m_roq_data *roq_data_itr, *roq_data; + unsigned nsn_itr; + + d_fnstart(4, dev, "(i2400m %p roq %p skb %p sn %u nsn %u)\n", + i2400m, roq, skb, sn, nsn); + + roq_data = (struct i2400m_roq_data *) &skb->cb; + BUILD_BUG_ON(sizeof(*roq_data) > sizeof(skb->cb)); + roq_data->sn = sn; + d_printf(3, dev, "ERX: roq %p [ws %u] nsn %d sn %u\n", + roq, roq->ws, nsn, roq_data->sn); + + /* Queues will be empty on not-so-bad environments, so try + * that first */ + if (skb_queue_empty(&roq->queue)) { + d_printf(2, dev, "ERX: roq %p - first one\n", roq); + __skb_queue_head(&roq->queue, skb); + goto out; + } + /* Now try append, as most of the operations will be that */ + skb_itr = skb_peek_tail(&roq->queue); + roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb; + nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn); + /* NSN bounds assumed correct (checked when it was queued) */ + if (nsn >= nsn_itr) { + d_printf(2, dev, "ERX: roq %p - appended after %p (nsn %d sn %u)\n", + roq, skb_itr, nsn_itr, roq_data_itr->sn); + __skb_queue_tail(&roq->queue, skb); + goto out; + } + /* None of the fast paths option worked. Iterate to find the + * right spot where to insert the packet; we know the queue is + * not empty, so we are not the first ones; we also know we + * are not going to be the last ones. The list is sorted, so + * we have to insert before the the first guy with an nsn_itr + * greater that our nsn. */ + skb_queue_walk(&roq->queue, skb_itr) { + roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb; + nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn); + /* NSN bounds assumed correct (checked when it was queued) */ + if (nsn_itr > nsn) { + d_printf(2, dev, "ERX: roq %p - queued before %p " + "(nsn %d sn %u)\n", roq, skb_itr, nsn_itr, + roq_data_itr->sn); + __skb_queue_before(&roq->queue, skb_itr, skb); + goto out; + } + } + /* If we get here, that is VERY bad -- print info to help + * diagnose and crash it */ + dev_err(dev, "SW BUG? failed to insert packet\n"); + dev_err(dev, "ERX: roq %p [ws %u] skb %p nsn %d sn %u\n", + roq, roq->ws, skb, nsn, roq_data->sn); + skb_queue_walk(&roq->queue, skb_itr) { + roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb; + nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn); + /* NSN bounds assumed correct (checked when it was queued) */ + dev_err(dev, "ERX: roq %p skb_itr %p nsn %d sn %u\n", + roq, skb_itr, nsn_itr, roq_data_itr->sn); + } + BUG(); +out: + d_fnend(4, dev, "(i2400m %p roq %p skb %p sn %u nsn %d) = void\n", + i2400m, roq, skb, sn, nsn); + return; +} + + +/* + * Backbone for the update window start operation + * + * @i2400m: device descriptor + * @roq: Reorder queue + * @sn: New sequence number + * + * Updates the window start of a queue; when doing so, it must deliver + * to the networking stack all the queued skb's whose normalized + * sequence number is lower than the new normalized window start. + */ +static +unsigned __i2400m_roq_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq, + unsigned sn) +{ + struct device *dev = i2400m_dev(i2400m); + struct sk_buff *skb_itr, *tmp_itr; + struct i2400m_roq_data *roq_data_itr; + unsigned new_nws, nsn_itr; + + new_nws = __i2400m_roq_nsn(roq, sn); + if (unlikely(new_nws >= 1024) && d_test(1)) { + dev_err(dev, "SW BUG? 
__update_ws new_nws %u (sn %u ws %u)\n", + new_nws, sn, roq->ws); + WARN_ON(1); + i2400m_roq_log_dump(i2400m, roq); + } + skb_queue_walk_safe(&roq->queue, skb_itr, tmp_itr) { + roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb; + nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn); + /* NSN bounds assumed correct (checked when it was queued) */ + if (nsn_itr < new_nws) { + d_printf(2, dev, "ERX: roq %p - release skb %p " + "(nsn %u/%u new nws %u)\n", + roq, skb_itr, nsn_itr, roq_data_itr->sn, + new_nws); + __skb_unlink(skb_itr, &roq->queue); + i2400m_net_erx(i2400m, skb_itr, roq_data_itr->cs); + } + else + break; /* rest of packets all nsn_itr > nws */ + } + roq->ws = sn; + return new_nws; +} + + +/* + * Reset a queue + * + * @i2400m: device descriptor + * @cin: Queue Index + * + * Deliver all the packets and reset the window-start to zero. Name is + * kind of misleading. + */ +static +void i2400m_roq_reset(struct i2400m *i2400m, struct i2400m_roq *roq) +{ + struct device *dev = i2400m_dev(i2400m); + struct sk_buff *skb_itr, *tmp_itr; + struct i2400m_roq_data *roq_data_itr; + + d_fnstart(2, dev, "(i2400m %p roq %p)\n", i2400m, roq); + i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_RESET, + roq->ws, skb_queue_len(&roq->queue), + ~0, ~0, 0); + skb_queue_walk_safe(&roq->queue, skb_itr, tmp_itr) { + roq_data_itr = (struct i2400m_roq_data *) &skb_itr->cb; + d_printf(2, dev, "ERX: roq %p - release skb %p (sn %u)\n", + roq, skb_itr, roq_data_itr->sn); + __skb_unlink(skb_itr, &roq->queue); + i2400m_net_erx(i2400m, skb_itr, roq_data_itr->cs); + } + roq->ws = 0; + d_fnend(2, dev, "(i2400m %p roq %p) = void\n", i2400m, roq); + return; +} + + +/* + * Queue a packet + * + * @i2400m: device descriptor + * @cin: Queue Index + * @skb: containing the packet data + * @fbn: First block number of the packet in @skb + * @lbn: Last block number of the packet in @skb + * + * The hardware is asking the driver to queue a packet for later + * delivery to the networking stack. + */ +static +void i2400m_roq_queue(struct i2400m *i2400m, struct i2400m_roq *roq, + struct sk_buff * skb, unsigned lbn) +{ + struct device *dev = i2400m_dev(i2400m); + unsigned nsn, len; + + d_fnstart(2, dev, "(i2400m %p roq %p skb %p lbn %u) = void\n", + i2400m, roq, skb, lbn); + len = skb_queue_len(&roq->queue); + nsn = __i2400m_roq_nsn(roq, lbn); + if (unlikely(nsn >= 1024)) { + dev_err(dev, "SW BUG? 
queue nsn %d (lbn %u ws %u)\n", + nsn, lbn, roq->ws); + i2400m_roq_log_dump(i2400m, roq); + i2400m->bus_reset(i2400m, I2400M_RT_WARM); + } else { + __i2400m_roq_queue(i2400m, roq, skb, lbn, nsn); + i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_PACKET, + roq->ws, len, lbn, nsn, ~0); + } + d_fnend(2, dev, "(i2400m %p roq %p skb %p lbn %u) = void\n", + i2400m, roq, skb, lbn); + return; +} + + +/* + * Update the window start in a reorder queue and deliver all skbs + * with a lower window start + * + * @i2400m: device descriptor + * @roq: Reorder queue + * @sn: New sequence number + */ +static +void i2400m_roq_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq, + unsigned sn) +{ + struct device *dev = i2400m_dev(i2400m); + unsigned old_ws, nsn, len; + + d_fnstart(2, dev, "(i2400m %p roq %p sn %u)\n", i2400m, roq, sn); + old_ws = roq->ws; + len = skb_queue_len(&roq->queue); + nsn = __i2400m_roq_update_ws(i2400m, roq, sn); + i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_WS, + old_ws, len, sn, nsn, roq->ws); + d_fnstart(2, dev, "(i2400m %p roq %p sn %u) = void\n", i2400m, roq, sn); + return; +} + + +/* + * Queue a packet and update the window start + * + * @i2400m: device descriptor + * @cin: Queue Index + * @skb: containing the packet data + * @fbn: First block number of the packet in @skb + * @sn: Last block number of the packet in @skb + * + * Note that unlike i2400m_roq_update_ws(), which sets the new window + * start to @sn, in here we'll set it to @sn + 1. + */ +static +void i2400m_roq_queue_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq, + struct sk_buff * skb, unsigned sn) +{ + struct device *dev = i2400m_dev(i2400m); + unsigned nsn, old_ws, len; + + d_fnstart(2, dev, "(i2400m %p roq %p skb %p sn %u)\n", + i2400m, roq, skb, sn); + len = skb_queue_len(&roq->queue); + nsn = __i2400m_roq_nsn(roq, sn); + old_ws = roq->ws; + if (unlikely(nsn >= 1024)) { + dev_err(dev, "SW BUG? queue_update_ws nsn %u (sn %u ws %u)\n", + nsn, sn, roq->ws); + i2400m_roq_log_dump(i2400m, roq); + i2400m->bus_reset(i2400m, I2400M_RT_WARM); + } else { + /* if the queue is empty, don't bother as we'd queue + * it and inmediately unqueue it -- just deliver it */ + if (len == 0) { + struct i2400m_roq_data *roq_data; + roq_data = (struct i2400m_roq_data *) &skb->cb; + i2400m_net_erx(i2400m, skb, roq_data->cs); + } + else { + __i2400m_roq_queue(i2400m, roq, skb, sn, nsn); + __i2400m_roq_update_ws(i2400m, roq, sn + 1); + } + i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_PACKET_WS, + old_ws, len, sn, nsn, roq->ws); + } + d_fnend(2, dev, "(i2400m %p roq %p skb %p sn %u) = void\n", + i2400m, roq, skb, sn); + return; +} + + /* * Receive and send up an extended data packet * @@ -347,6 +849,28 @@ error_check: * having to copy packets around. * * This function handles said path. + * + * + * Receive and send up an extended data packet that requires no reordering + * + * @i2400m: device descriptor + * @skb_rx: skb that contains the extended data packet + * @single_last: 1 if the payload is the only one or the last one of + * the skb. + * @payload: pointer to the packet's data (past the actual extended + * data payload header). + * @size: size of the payload + * + * Pass over to the networking stack a data packet that might have + * reordering requirements. + * + * This needs to the decide if the skb in which the packet is + * contained can be reused or if it needs to be cloned. 
Then it has to + * be trimmed in the edges so that the beginning is the space for eth + * header and then pass it to i2400m_net_erx() for the stack + * + * Assumes the caller has verified the sanity of the payload (size, + * etc) already. */ static void i2400m_rx_edata(struct i2400m *i2400m, struct sk_buff *skb_rx, @@ -357,53 +881,86 @@ void i2400m_rx_edata(struct i2400m *i2400m, struct sk_buff *skb_rx, struct net_device *net_dev = i2400m->wimax_dev.net_dev; struct sk_buff *skb; enum i2400m_cs cs; - unsigned reorder_needed; + u32 reorder; + unsigned ro_needed, ro_type, ro_cin, ro_sn; + struct i2400m_roq *roq; + struct i2400m_roq_data *roq_data; - d_fnstart(4, dev, "(i2400m %p skb_rx %p single %u payload %p " + BUILD_BUG_ON(ETH_HLEN > sizeof(*hdr)); + + d_fnstart(2, dev, "(i2400m %p skb_rx %p single %u payload %p " "size %zu)\n", i2400m, skb_rx, single_last, payload, size); if (size < sizeof(*hdr)) { dev_err(dev, "ERX: HW BUG? message with short header (%zu " "vs %zu bytes expected)\n", size, sizeof(*hdr)); goto error; } - reorder_needed = le32_to_cpu(hdr->reorder & I2400M_REORDER_NEEDED); - cs = hdr->cs; - if (reorder_needed) { - dev_err(dev, "ERX: HW BUG? reorder needed, it was disabled\n"); - goto error; - } - /* ok, so now decide if we want to clone or reuse the skb, - * pull and trim it so the beginning is the space for the eth - * header and pass it to i2400m_net_erx() for the stack */ + if (single_last) { skb = skb_get(skb_rx); - d_printf(3, dev, "ERX: reusing single payload skb %p\n", skb); + d_printf(3, dev, "ERX: skb %p reusing\n", skb); } else { skb = skb_clone(skb_rx, GFP_KERNEL); - d_printf(3, dev, "ERX: cloning %p\n", skb); if (skb == NULL) { dev_err(dev, "ERX: no memory to clone skb\n"); net_dev->stats.rx_dropped++; goto error_skb_clone; } + d_printf(3, dev, "ERX: skb %p cloned from %p\n", skb, skb_rx); } /* now we have to pull and trim so that the skb points to the * beginning of the IP packet; the netdev part will add the - * ethernet header as needed. */ - BUILD_BUG_ON(ETH_HLEN > sizeof(*hdr)); + * ethernet header as needed - we know there is enough space + * because we checked in i2400m_rx_edata(). */ skb_pull(skb, payload + sizeof(*hdr) - (void *) skb->data); - skb_trim(skb, (void *) skb_end_pointer(skb) - payload + sizeof(*hdr)); - i2400m_net_erx(i2400m, skb, cs); + skb_trim(skb, (void *) skb_end_pointer(skb) - payload - sizeof(*hdr)); + + reorder = le32_to_cpu(hdr->reorder); + ro_needed = reorder & I2400M_RO_NEEDED; + cs = hdr->cs; + if (ro_needed) { + ro_type = (reorder >> I2400M_RO_TYPE_SHIFT) & I2400M_RO_TYPE; + ro_cin = (reorder >> I2400M_RO_CIN_SHIFT) & I2400M_RO_CIN; + ro_sn = (reorder >> I2400M_RO_SN_SHIFT) & I2400M_RO_SN; + + roq = &i2400m->rx_roq[ro_cin]; + roq_data = (struct i2400m_roq_data *) &skb->cb; + roq_data->sn = ro_sn; + roq_data->cs = cs; + d_printf(2, dev, "ERX: reorder needed: " + "type %u cin %u [ws %u] sn %u/%u len %zuB\n", + ro_type, ro_cin, roq->ws, ro_sn, + __i2400m_roq_nsn(roq, ro_sn), size); + d_dump(2, dev, payload, size); + switch(ro_type) { + case I2400M_RO_TYPE_RESET: + i2400m_roq_reset(i2400m, roq); + kfree_skb(skb); /* no data here */ + break; + case I2400M_RO_TYPE_PACKET: + i2400m_roq_queue(i2400m, roq, skb, ro_sn); + break; + case I2400M_RO_TYPE_WS: + i2400m_roq_update_ws(i2400m, roq, ro_sn); + kfree_skb(skb); /* no data here */ + break; + case I2400M_RO_TYPE_PACKET_WS: + i2400m_roq_queue_update_ws(i2400m, roq, skb, ro_sn); + break; + default: + dev_err(dev, "HW BUG? 
unknown reorder type %u\n", ro_type); + } + } + else + i2400m_net_erx(i2400m, skb, cs); error_skb_clone: error: - d_fnend(4, dev, "(i2400m %p skb_rx %p single %u payload %p " + d_fnend(2, dev, "(i2400m %p skb_rx %p single %u payload %p " "size %zu) = void\n", i2400m, skb_rx, single_last, payload, size); return; } - - /* * Act on a received payload * @@ -632,3 +1189,73 @@ error_msg_hdr_check: return result; } EXPORT_SYMBOL_GPL(i2400m_rx); + + +/* + * Initialize the RX queue and infrastructure + * + * This sets up all the RX reordering infrastructures, which will not + * be used if reordering is not enabled or if the firmware does not + * support it. The device is told to do reordering in + * i2400m_dev_initialize(), where it also looks at the value of the + * i2400m->rx_reorder switch before taking a decission. + * + * Note we allocate the roq queues in one chunk and the actual logging + * support for it (logging) in another one and then we setup the + * pointers from the first to the last. + */ +int i2400m_rx_setup(struct i2400m *i2400m) +{ + int result = 0; + struct device *dev = i2400m_dev(i2400m); + + i2400m->rx_reorder = i2400m_rx_reorder_disabled? 0 : 1; + if (i2400m->rx_reorder) { + unsigned itr; + size_t size; + struct i2400m_roq_log *rd; + + result = -ENOMEM; + + size = sizeof(i2400m->rx_roq[0]) * (I2400M_RO_CIN + 1); + i2400m->rx_roq = kzalloc(size, GFP_KERNEL); + if (i2400m->rx_roq == NULL) { + dev_err(dev, "RX: cannot allocate %zu bytes for " + "reorder queues\n", size); + goto error_roq_alloc; + } + + size = sizeof(*i2400m->rx_roq[0].log) * (I2400M_RO_CIN + 1); + rd = kzalloc(size, GFP_KERNEL); + if (rd == NULL) { + dev_err(dev, "RX: cannot allocate %zu bytes for " + "reorder queues log areas\n", size); + result = -ENOMEM; + goto error_roq_log_alloc; + } + + for(itr = 0; itr < I2400M_RO_CIN + 1; itr++) { + __i2400m_roq_init(&i2400m->rx_roq[itr]); + i2400m->rx_roq[itr].log = &rd[itr]; + } + } + return 0; + +error_roq_log_alloc: + kfree(i2400m->rx_roq); +error_roq_alloc: + return result; +} + + +/* Tear down the RX queue and infrastructure */ +void i2400m_rx_release(struct i2400m *i2400m) +{ + if (i2400m->rx_reorder) { + unsigned itr; + for(itr = 0; itr < I2400M_RO_CIN + 1; itr++) + __skb_queue_purge(&i2400m->rx_roq[itr].queue); + kfree(i2400m->rx_roq[0].log); + kfree(i2400m->rx_roq); + } +} diff --git a/include/linux/wimax/i2400m.h b/include/linux/wimax/i2400m.h index ad36e073a70c..d5148a7889a6 100644 --- a/include/linux/wimax/i2400m.h +++ b/include/linux/wimax/i2400m.h @@ -225,15 +225,16 @@ struct i2400m_pl_data_hdr { /* * Payload for an extended data packet * - * New in v1.4 + * New in fw v1.4 * + * @reorder: if this payload has to be reorder or not (and how) * @cs: the type of data in the packet, as defined per (802.16e * T11.13.19.1). Currently only 2 (IPv4 packet) supported. * * This is prefixed to each and every INCOMING DATA packet. 
*/ struct i2400m_pl_edata_hdr { - __le32 reorder; + __le32 reorder; /* bits defined in i2400m_ro */ __u8 cs; __u8 reserved[11]; } __attribute__((packed)); @@ -243,8 +244,23 @@ enum i2400m_cs { I2400M_CS_IPV4 = 2, }; -enum i2400m_reorder { - I2400M_REORDER_NEEDED = 0x01, +enum i2400m_ro { + I2400M_RO_NEEDED = 0x01, + I2400M_RO_TYPE = 0x03, + I2400M_RO_TYPE_SHIFT = 1, + I2400M_RO_CIN = 0x0f, + I2400M_RO_CIN_SHIFT = 4, + I2400M_RO_FBN = 0x07ff, + I2400M_RO_FBN_SHIFT = 8, + I2400M_RO_SN = 0x07ff, + I2400M_RO_SN_SHIFT = 21, +}; + +enum i2400m_ro_type { + I2400M_RO_TYPE_RESET = 0, + I2400M_RO_TYPE_PACKET, + I2400M_RO_TYPE_WS, + I2400M_RO_TYPE_PACKET_WS, }; @@ -410,6 +426,7 @@ enum i2400m_tlv { I2400M_TLV_CONFIG_IDLE_PARAMETERS = 601, I2400M_TLV_CONFIG_IDLE_TIMEOUT = 611, I2400M_TLV_CONFIG_D2H_DATA_FORMAT = 614, + I2400M_TLV_CONFIG_DL_HOST_REORDER = 615, }; @@ -553,5 +570,12 @@ struct i2400m_tlv_config_d2h_data_format { __u8 reserved[3]; } __attribute__((packed)); +/* New in v1.4 */ +struct i2400m_tlv_config_dl_host_reorder { + struct i2400m_tlv_hdr hdr; + __u8 reorder; /* 0 disabled, 1 enabled */ + __u8 reserved[3]; +} __attribute__((packed)); + #endif /* #ifndef __LINUX__WIMAX__I2400M_H__ */ -- cgit v1.2.3-71-gd317 From d3a21be86c178964167aa54c39a01260d33e7509 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Mar 2009 03:15:58 -0800 Subject: skbuff.h: fix timestamps kernel-doc Fix skbuff.h kernel-doc for timestamps: must include "struct" keyword, otherwise there are kernel-doc errors: Error(linux-next-20090227//include/linux/skbuff.h:161): cannot understand prototype: 'struct skb_shared_hwtstamps ' Error(linux-next-20090227//include/linux/skbuff.h:177): cannot understand prototype: 'union skb_shared_tx ' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 61ce97a8b868..1f659e8c2b88 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -135,8 +135,7 @@ struct skb_frag_struct { #define HAVE_HW_TIME_STAMP /** - * skb_shared_hwtstamps - hardware time stamps - * + * struct skb_shared_hwtstamps - hardware time stamps * @hwtstamp: hardware time stamp transformed into duration * since arbitrary point in time * @syststamp: hwtstamp transformed to system time base @@ -164,8 +163,7 @@ struct skb_shared_hwtstamps { }; /** - * skb_shared_tx - instructions for time stamping of outgoing packets - * + * struct skb_shared_tx - instructions for time stamping of outgoing packets * @hardware: generate hardware time stamp * @software: generate software time stamp * @in_progress: device driver is going to provide -- cgit v1.2.3-71-gd317 From c79a61f55773d2519fd0525bf58385f7d20752d3 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-Koenig Date: Fri, 27 Feb 2009 21:30:03 +0100 Subject: tracing: make CALLER_ADDRx overwriteable The current definition of CALLER_ADDRx isn't suitable for all platforms. E.g. for ARM __builtin_return_address(N) doesn't work for N > 0 and AFAIK for powerpc there are no frame pointers needed to have a working __builtin_return_address. This patch allows defining the CALLER_ADDRx macros in and let these take precedence. Because now is included unconditionally in all archs that don't already had this include get an empty one for free. 
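As a sketch of how an architecture might take advantage of this hook: the header below is hypothetical and simply mirrors the generic fallback that the ftrace.h hunk further down guards with HAVE_ARCH_CALLER_ADDR; it is not part of this patch.

/*
 * Hypothetical <asm/ftrace.h> for an architecture where
 * __builtin_return_address(N) is unreliable for N > 0: defining the
 * CALLER_ADDRx macros here, together with HAVE_ARCH_CALLER_ADDR,
 * makes them take precedence over the generic definitions.
 */
#ifndef _ASM_ARCH_FTRACE_H
#define _ASM_ARCH_FTRACE_H

#define HAVE_ARCH_CALLER_ADDR
#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
/* deeper frames cannot be recovered reliably on this (imaginary) arch */
#define CALLER_ADDR1 0UL
#define CALLER_ADDR2 0UL
#define CALLER_ADDR3 0UL
#define CALLER_ADDR4 0UL
#define CALLER_ADDR5 0UL
#define CALLER_ADDR6 0UL

#endif /* _ASM_ARCH_FTRACE_H */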
Signed-off-by: Uwe Kleine-Koenig Cc: Peter Zijlstra Cc: Ingo Molnar Reviewed-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/ftrace.h | 1 + arch/avr32/include/asm/ftrace.h | 1 + arch/blackfin/include/asm/ftrace.h | 1 + arch/cris/include/asm/ftrace.h | 1 + arch/h8300/include/asm/ftrace.h | 1 + arch/m68k/include/asm/ftrace.h | 1 + arch/mips/include/asm/ftrace.h | 1 + arch/parisc/include/asm/ftrace.h | 1 + arch/um/include/asm/ftrace.h | 1 + arch/xtensa/include/asm/ftrace.h | 1 + include/asm-frv/ftrace.h | 1 + include/asm-m32r/ftrace.h | 1 + include/asm-mn10300/ftrace.h | 1 + include/linux/ftrace.h | 41 +++++++++++++++++++------------------- 14 files changed, 34 insertions(+), 20 deletions(-) create mode 100644 arch/alpha/include/asm/ftrace.h create mode 100644 arch/avr32/include/asm/ftrace.h create mode 100644 arch/blackfin/include/asm/ftrace.h create mode 100644 arch/cris/include/asm/ftrace.h create mode 100644 arch/h8300/include/asm/ftrace.h create mode 100644 arch/m68k/include/asm/ftrace.h create mode 100644 arch/mips/include/asm/ftrace.h create mode 100644 arch/parisc/include/asm/ftrace.h create mode 100644 arch/um/include/asm/ftrace.h create mode 100644 arch/xtensa/include/asm/ftrace.h create mode 100644 include/asm-frv/ftrace.h create mode 100644 include/asm-m32r/ftrace.h create mode 100644 include/asm-mn10300/ftrace.h (limited to 'include/linux') diff --git a/arch/alpha/include/asm/ftrace.h b/arch/alpha/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/alpha/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/avr32/include/asm/ftrace.h b/arch/avr32/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/avr32/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/blackfin/include/asm/ftrace.h b/arch/blackfin/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/blackfin/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/cris/include/asm/ftrace.h b/arch/cris/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/cris/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/h8300/include/asm/ftrace.h b/arch/h8300/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/h8300/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/m68k/include/asm/ftrace.h b/arch/m68k/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/m68k/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/mips/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/parisc/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/um/include/asm/ftrace.h b/arch/um/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/um/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/xtensa/include/asm/ftrace.h b/arch/xtensa/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/xtensa/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git 
a/include/asm-frv/ftrace.h b/include/asm-frv/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-frv/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-m32r/ftrace.h b/include/asm-m32r/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-m32r/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-mn10300/ftrace.h b/include/asm-mn10300/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-mn10300/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 847bb3c48dd0..1f69ac7c1587 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -11,6 +11,8 @@ #include #include +#include + #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -103,8 +105,6 @@ struct ftrace_func_command { }; #ifdef CONFIG_DYNAMIC_FTRACE -/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ -#include int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); @@ -282,24 +282,25 @@ static inline void __ftrace_enabled_restore(int enabled) #endif } -#ifdef CONFIG_FRAME_POINTER -/* TODO: need to fix this for ARM */ -# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) -# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) -# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) -# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) -# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) -# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) -#else -# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -# define CALLER_ADDR1 0UL -# define CALLER_ADDR2 0UL -# define CALLER_ADDR3 0UL -# define CALLER_ADDR4 0UL -# define CALLER_ADDR5 0UL -# define CALLER_ADDR6 0UL -#endif +#ifndef HAVE_ARCH_CALLER_ADDR +# ifdef CONFIG_FRAME_POINTER +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) +# else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +# define CALLER_ADDR6 0UL +# endif +#endif /* ifndef HAVE_ARCH_CALLER_ADDR */ #ifdef CONFIG_IRQSOFF_TRACER extern void time_hardirqs_on(unsigned long a0, unsigned long a1); -- cgit v1.2.3-71-gd317 From ef7a4a161472b952941bf78855a9cd95703c024e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 00:27:49 -0500 Subject: ring-buffer: fix ring_buffer_read_page The ring_buffer_read_page was broken if it were to only copy part of the page. This patch fixes that up as well as adds a parameter to allow a length field, in order to only copy part of the buffer page. 
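As a usage sketch of the new length argument (mirroring the kerneldoc example in the diff below; buffer, len and cpu come from the caller, and process_page() is a placeholder for whatever the caller does with the copied data):

	void *rpage = ring_buffer_alloc_read_page(buffer);
	int ret;

	if (!rpage)
		return -ENOMEM;
	/* copy at most 'len' bytes; full == 0 accepts a partially filled page */
	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
	if (ret >= 0)
		process_page(rpage, ret);
	ring_buffer_free_read_page(buffer, rpage);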
Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 7 +++- kernel/trace/ring_buffer.c | 92 +++++++++++++++++++++++++++++---------------- 2 files changed, 64 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f5e793d69bd3..79fcbc4b09d6 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -121,6 +121,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +size_t ring_buffer_page_len(void *page); + + /* * The below functions are fine to use outside the tracing facility. */ @@ -138,8 +141,8 @@ static inline int tracing_is_on(void) { return 0; } void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); -int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full); +int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, + size_t len, int cpu, int full); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9baad7ee4b36..2ad6bae95a3d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -234,6 +234,11 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +size_t ring_buffer_page_len(void *page) +{ + return local_read(&((struct buffer_data_page *)page)->commit); +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -2378,8 +2383,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) { - unsigned long addr; struct buffer_data_page *bpage; + unsigned long addr; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -2387,6 +2392,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) bpage = (void *)addr; + rb_init_page(bpage); + return bpage; } @@ -2406,6 +2413,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. * @@ -2418,7 +2426,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * rpage = ring_buffer_alloc_read_page(buffer); * if (!rpage) * return error; - * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); * @@ -2435,71 +2443,89 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * <0 if no data has been transferred. 
*/ int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full) + void **data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; + struct buffer_page *reader; unsigned long flags; + unsigned int commit; unsigned int read; int ret = -1; if (!data_page) - return 0; + return -1; bpage = *data_page; if (!bpage) - return 0; + return -1; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - /* - * rb_buffer_peek will get the next ring buffer if - * the current reader page is empty. - */ - event = rb_buffer_peek(buffer, cpu, NULL); - if (!event) + reader = rb_get_reader_page(cpu_buffer); + if (!reader) goto out; - /* check for data */ - if (!local_read(&cpu_buffer->reader_page->page->commit)) - goto out; + event = rb_reader_event(cpu_buffer); + + read = reader->read; + commit = rb_page_commit(reader); - read = cpu_buffer->reader_page->read; /* - * If the writer is already off of the read page, then simply - * switch the read page with the given page. Otherwise - * we need to copy the data from the reader to the writer. + * If len > what's left on the page, and the writer is also off of + * the read page, then simply switch the read page with the given + * page. Otherwise we need to copy the data from the reader to the + * writer. */ - if (cpu_buffer->reader_page == cpu_buffer->commit_page) { - unsigned int commit = rb_page_commit(cpu_buffer->reader_page); + if ((len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; + unsigned int pos = read; + unsigned int size; if (full) goto out; - /* The writer is still on the reader page, we must copy */ - memcpy(bpage->data + read, rpage->data + read, commit - read); - /* consume what was read */ - cpu_buffer->reader_page->read = commit; + if (len > (commit - read)) + len = (commit - read); + + size = rb_event_length(event); + + if (len < size) + goto out; + + /* Need to copy one event at a time */ + do { + memcpy(bpage->data + pos, rpage->data + pos, size); + + len -= size; + + rb_advance_reader(cpu_buffer); + pos = reader->read; + + event = rb_reader_event(cpu_buffer); + size = rb_event_length(event); + } while (len > size); /* update bpage */ - local_set(&bpage->commit, commit); - if (!read) - bpage->time_stamp = rpage->time_stamp; + local_set(&bpage->commit, pos); + bpage->time_stamp = rpage->time_stamp; + } else { /* swap the pages */ rb_init_page(bpage); - bpage = cpu_buffer->reader_page->page; - cpu_buffer->reader_page->page = *data_page; - local_set(&cpu_buffer->reader_page->write, 0); - cpu_buffer->reader_page->read = 0; + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + reader->read = 0; *data_page = bpage; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, bpage, read); } ret = read; - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3-71-gd317 From a1d2f09544065b60598b8167d94a6371bff3e892 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 4 Mar 2009 15:05:33 +0800 Subject: crypto: compress - Add pcomp interface The current "comp" crypto interface supports one-shot (de)compression only, i.e. the whole data buffer to be (de)compressed must be passed at once, and the whole (de)compressed data buffer will be received at once. In several use-cases (e.g. 
compressed file systems that store files in big compressed blocks), this workflow is not suitable. Furthermore, the "comp" type doesn't provide for the configuration of (de)compression parameters, and always allocates workspace memory for both compression and decompression, which may waste memory. To solve this, add a "pcomp" partial (de)compression interface that provides the following operations: - crypto_compress_{init,update,final}() for compression, - crypto_decompress_{init,update,final}() for decompression, - crypto_{,de}compress_setup(), to configure (de)compression parameters (incl. allocating workspace memory). The (de)compression methods take a struct comp_request, which was mimicked after the z_stream object in zlib, and contains buffer pointer and length pairs for input and output. The setup methods take an opaque parameter pointer and length pair. Parameters are supposed to be encoded using netlink attributes, whose meanings depend on the actual (name of the) (de)compression algorithm. Signed-off-by: Geert Uytterhoeven Signed-off-by: Herbert Xu --- crypto/Kconfig | 4 ++ crypto/Makefile | 2 + crypto/pcompress.c | 97 ++++++++++++++++++++++++++++ include/crypto/compress.h | 125 +++++++++++++++++++++++++++++++++++++ include/crypto/internal/compress.h | 28 +++++++++ include/linux/crypto.h | 1 + 6 files changed, 257 insertions(+) create mode 100644 crypto/pcompress.c create mode 100644 include/crypto/compress.h create mode 100644 include/crypto/internal/compress.h (limited to 'include/linux') diff --git a/crypto/Kconfig b/crypto/Kconfig index 4a3e6b225189..1676f171c54b 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -76,6 +76,10 @@ config CRYPTO_RNG2 tristate select CRYPTO_ALGAPI2 +config CRYPTO_PCOMP + tristate + select CRYPTO_ALGAPI2 + config CRYPTO_MANAGER tristate "Cryptographic algorithm manager" select CRYPTO_MANAGER2 diff --git a/crypto/Makefile b/crypto/Makefile index e05a844e08d5..1132a678b253 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -27,6 +27,8 @@ crypto_hash-objs += ahash.o crypto_hash-objs += shash.o obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o +obj-$(CONFIG_CRYPTO_PCOMP) += pcompress.o + cryptomgr-objs := algboss.o testmgr.o obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o diff --git a/crypto/pcompress.c b/crypto/pcompress.c new file mode 100644 index 000000000000..ca9a4af91efe --- /dev/null +++ b/crypto/pcompress.c @@ -0,0 +1,97 @@ +/* + * Cryptographic API. + * + * Partial (de)compression operations. + * + * Copyright 2008 Sony Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * If not, see . 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + + +static int crypto_pcomp_init(struct crypto_tfm *tfm, u32 type, u32 mask) +{ + return 0; +} + +static unsigned int crypto_pcomp_extsize(struct crypto_alg *alg, + const struct crypto_type *frontend) +{ + return alg->cra_ctxsize; +} + +static int crypto_pcomp_init_tfm(struct crypto_tfm *tfm, + const struct crypto_type *frontend) +{ + return 0; +} + +static void crypto_pcomp_show(struct seq_file *m, struct crypto_alg *alg) + __attribute__ ((unused)); +static void crypto_pcomp_show(struct seq_file *m, struct crypto_alg *alg) +{ + seq_printf(m, "type : pcomp\n"); +} + +static const struct crypto_type crypto_pcomp_type = { + .extsize = crypto_pcomp_extsize, + .init = crypto_pcomp_init, + .init_tfm = crypto_pcomp_init_tfm, +#ifdef CONFIG_PROC_FS + .show = crypto_pcomp_show, +#endif + .maskclear = ~CRYPTO_ALG_TYPE_MASK, + .maskset = CRYPTO_ALG_TYPE_MASK, + .type = CRYPTO_ALG_TYPE_PCOMPRESS, + .tfmsize = offsetof(struct crypto_pcomp, base), +}; + +struct crypto_pcomp *crypto_alloc_pcomp(const char *alg_name, u32 type, + u32 mask) +{ + return crypto_alloc_tfm(alg_name, &crypto_pcomp_type, type, mask); +} +EXPORT_SYMBOL_GPL(crypto_alloc_pcomp); + +int crypto_register_pcomp(struct pcomp_alg *alg) +{ + struct crypto_alg *base = &alg->base; + + base->cra_type = &crypto_pcomp_type; + base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; + base->cra_flags |= CRYPTO_ALG_TYPE_PCOMPRESS; + + return crypto_register_alg(base); +} +EXPORT_SYMBOL_GPL(crypto_register_pcomp); + +int crypto_unregister_pcomp(struct pcomp_alg *alg) +{ + return crypto_unregister_alg(&alg->base); +} +EXPORT_SYMBOL_GPL(crypto_unregister_pcomp); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Partial (de)compression type"); +MODULE_AUTHOR("Sony Corporation"); diff --git a/include/crypto/compress.h b/include/crypto/compress.h new file mode 100644 index 000000000000..b7d228708d6b --- /dev/null +++ b/include/crypto/compress.h @@ -0,0 +1,125 @@ +/* + * Compress: Compression algorithms under the cryptographic API. + * + * Copyright 2008 Sony Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * If not, see . 
+ */ + +#ifndef _CRYPTO_COMPRESS_H +#define _CRYPTO_COMPRESS_H + +#include + + +struct comp_request { + const void *next_in; /* next input byte */ + void *next_out; /* next output byte */ + unsigned int avail_in; /* bytes available at next_in */ + unsigned int avail_out; /* bytes available at next_out */ +}; + +struct crypto_pcomp { + struct crypto_tfm base; +}; + +struct pcomp_alg { + int (*compress_setup)(struct crypto_pcomp *tfm, void *params, + unsigned int len); + int (*compress_init)(struct crypto_pcomp *tfm); + int (*compress_update)(struct crypto_pcomp *tfm, + struct comp_request *req); + int (*compress_final)(struct crypto_pcomp *tfm, + struct comp_request *req); + int (*decompress_setup)(struct crypto_pcomp *tfm, void *params, + unsigned int len); + int (*decompress_init)(struct crypto_pcomp *tfm); + int (*decompress_update)(struct crypto_pcomp *tfm, + struct comp_request *req); + int (*decompress_final)(struct crypto_pcomp *tfm, + struct comp_request *req); + + struct crypto_alg base; +}; + +extern struct crypto_pcomp *crypto_alloc_pcomp(const char *alg_name, u32 type, + u32 mask); + +static inline struct crypto_tfm *crypto_pcomp_tfm(struct crypto_pcomp *tfm) +{ + return &tfm->base; +} + +static inline void crypto_free_pcomp(struct crypto_pcomp *tfm) +{ + crypto_destroy_tfm(tfm, crypto_pcomp_tfm(tfm)); +} + +static inline struct pcomp_alg *__crypto_pcomp_alg(struct crypto_alg *alg) +{ + return container_of(alg, struct pcomp_alg, base); +} + +static inline struct pcomp_alg *crypto_pcomp_alg(struct crypto_pcomp *tfm) +{ + return __crypto_pcomp_alg(crypto_pcomp_tfm(tfm)->__crt_alg); +} + +static inline int crypto_compress_setup(struct crypto_pcomp *tfm, + void *params, unsigned int len) +{ + return crypto_pcomp_alg(tfm)->compress_setup(tfm, params, len); +} + +static inline int crypto_compress_init(struct crypto_pcomp *tfm) +{ + return crypto_pcomp_alg(tfm)->compress_init(tfm); +} + +static inline int crypto_compress_update(struct crypto_pcomp *tfm, + struct comp_request *req) +{ + return crypto_pcomp_alg(tfm)->compress_update(tfm, req); +} + +static inline int crypto_compress_final(struct crypto_pcomp *tfm, + struct comp_request *req) +{ + return crypto_pcomp_alg(tfm)->compress_final(tfm, req); +} + +static inline int crypto_decompress_setup(struct crypto_pcomp *tfm, + void *params, unsigned int len) +{ + return crypto_pcomp_alg(tfm)->decompress_setup(tfm, params, len); +} + +static inline int crypto_decompress_init(struct crypto_pcomp *tfm) +{ + return crypto_pcomp_alg(tfm)->decompress_init(tfm); +} + +static inline int crypto_decompress_update(struct crypto_pcomp *tfm, + struct comp_request *req) +{ + return crypto_pcomp_alg(tfm)->decompress_update(tfm, req); +} + +static inline int crypto_decompress_final(struct crypto_pcomp *tfm, + struct comp_request *req) +{ + return crypto_pcomp_alg(tfm)->decompress_final(tfm, req); +} + +#endif /* _CRYPTO_COMPRESS_H */ diff --git a/include/crypto/internal/compress.h b/include/crypto/internal/compress.h new file mode 100644 index 000000000000..178a888d1d93 --- /dev/null +++ b/include/crypto/internal/compress.h @@ -0,0 +1,28 @@ +/* + * Compress: Compression algorithms under the cryptographic API. + * + * Copyright 2008 Sony Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * If not, see . + */ + +#ifndef _CRYPTO_INTERNAL_COMPRESS_H +#define _CRYPTO_INTERNAL_COMPRESS_H + +#include + +extern int crypto_register_pcomp(struct pcomp_alg *alg); +extern int crypto_unregister_pcomp(struct pcomp_alg *alg); + +#endif /* _CRYPTO_INTERNAL_COMPRESS_H */ diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 29729b834380..ec29fa268b94 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -40,6 +40,7 @@ #define CRYPTO_ALG_TYPE_SHASH 0x00000009 #define CRYPTO_ALG_TYPE_AHASH 0x0000000a #define CRYPTO_ALG_TYPE_RNG 0x0000000c +#define CRYPTO_ALG_TYPE_PCOMPRESS 0x0000000f #define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e #define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000c -- cgit v1.2.3-71-gd317 From 0c5c2d3089068d4aa378f7a40d2b5ad9d4f52ce8 Mon Sep 17 00:00:00 2001 From: Eric Biederman Date: Wed, 4 Mar 2009 00:03:08 -0800 Subject: neigh: Allow for user space users of the neighbour table Currently it is possible to do just about everything with the arp table from user space except treat an entry like you are using it. To that end implement and a flag NTF_USE that when set in a netwlink update request treats the neighbour table entry like the kernel does on the output path. This allows user space applications to share the kernel's arp cache. Signed-off-by: Eric Biederman Signed-off-by: David S. Miller --- include/linux/neighbour.h | 1 + net/core/neighbour.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/neighbour.h b/include/linux/neighbour.h index 8730d5dae1bc..12c9de138451 100644 --- a/include/linux/neighbour.h +++ b/include/linux/neighbour.h @@ -31,6 +31,7 @@ enum * Neighbor Cache Entry Flags */ +#define NTF_USE 0x01 #define NTF_PROXY 0x08 /* == ATF_PUBL */ #define NTF_ROUTER 0x80 diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 417b6d739fb7..a1cbce7fdae5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1654,7 +1654,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); neigh_release(neigh); goto out_dev_put; } -- cgit v1.2.3-71-gd317 From 9d40bbda599def1e1d155d7f7dca14fe8744bd2b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 4 Mar 2009 23:46:25 -0800 Subject: vlan: Fix vlan-in-vlan crashes. As analyzed by Patrick McHardy, vlan needs to reset it's netdev_ops pointer in it's ->init() function but this leaves the compat method pointers stale. Add a netdev_resync_ops() and call it from the vlan code. Any other driver which changes ->netdev_ops after register_netdevice() will need to call this new function after doing so too. With help from Patrick McHardy. Tested-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + net/8021q/vlan_dev.c | 3 ++- net/core/dev.c | 56 ++++++++++++++++++++++++++++------------------- 3 files changed, 37 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ec54785d34f9..659366734f3f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1079,6 +1079,7 @@ extern void synchronize_net(void); extern int register_netdevice_notifier(struct notifier_block *nb); extern int unregister_netdevice_notifier(struct notifier_block *nb); extern int init_dummy_netdev(struct net_device *dev); +extern void netdev_resync_ops(struct net_device *dev); extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev); extern struct net_device *dev_get_by_index(struct net *net, int ifindex); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4a19acd3a32b..1b34135cf990 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -553,7 +553,7 @@ static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa) int err = 0; if (netif_device_present(real_dev) && ops->ndo_neigh_setup) - err = ops->ndo_neigh_setup(dev, pa); + err = ops->ndo_neigh_setup(real_dev, pa); return err; } @@ -639,6 +639,7 @@ static int vlan_dev_init(struct net_device *dev) dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; dev->netdev_ops = &vlan_netdev_ops; } + netdev_resync_ops(dev); if (is_vlan_dev(real_dev)) subclass = 1; diff --git a/net/core/dev.c b/net/core/dev.c index 2dd484ed3dbb..f1129706ce7b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4282,6 +4282,39 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); +/* Some devices need to (re-)set their netdev_ops inside + * ->init() or similar. If that happens, we have to setup + * the compat pointers again. + */ +void netdev_resync_ops(struct net_device *dev) +{ +#ifdef CONFIG_COMPAT_NET_DEV_OPS + const struct net_device_ops *ops = dev->netdev_ops; + + dev->init = ops->ndo_init; + dev->uninit = ops->ndo_uninit; + dev->open = ops->ndo_open; + dev->change_rx_flags = ops->ndo_change_rx_flags; + dev->set_rx_mode = ops->ndo_set_rx_mode; + dev->set_multicast_list = ops->ndo_set_multicast_list; + dev->set_mac_address = ops->ndo_set_mac_address; + dev->validate_addr = ops->ndo_validate_addr; + dev->do_ioctl = ops->ndo_do_ioctl; + dev->set_config = ops->ndo_set_config; + dev->change_mtu = ops->ndo_change_mtu; + dev->neigh_setup = ops->ndo_neigh_setup; + dev->tx_timeout = ops->ndo_tx_timeout; + dev->get_stats = ops->ndo_get_stats; + dev->vlan_rx_register = ops->ndo_vlan_rx_register; + dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; +#ifdef CONFIG_NET_POLL_CONTROLLER + dev->poll_controller = ops->ndo_poll_controller; +#endif +#endif +} +EXPORT_SYMBOL(netdev_resync_ops); + /** * register_netdevice - register a network device * @dev: device to register @@ -4326,28 +4359,7 @@ int register_netdevice(struct net_device *dev) * This is temporary until all network devices are converted. 
*/ if (dev->netdev_ops) { - const struct net_device_ops *ops = dev->netdev_ops; - - dev->init = ops->ndo_init; - dev->uninit = ops->ndo_uninit; - dev->open = ops->ndo_open; - dev->change_rx_flags = ops->ndo_change_rx_flags; - dev->set_rx_mode = ops->ndo_set_rx_mode; - dev->set_multicast_list = ops->ndo_set_multicast_list; - dev->set_mac_address = ops->ndo_set_mac_address; - dev->validate_addr = ops->ndo_validate_addr; - dev->do_ioctl = ops->ndo_do_ioctl; - dev->set_config = ops->ndo_set_config; - dev->change_mtu = ops->ndo_change_mtu; - dev->neigh_setup = ops->ndo_neigh_setup; - dev->tx_timeout = ops->ndo_tx_timeout; - dev->get_stats = ops->ndo_get_stats; - dev->vlan_rx_register = ops->ndo_vlan_rx_register; - dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; - dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; -#ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = ops->ndo_poll_controller; -#endif + netdev_resync_ops(dev); } else { char drivername[64]; pr_info("%s (%s): not using net_device_ops yet\n", -- cgit v1.2.3-71-gd317 From 5e1607a00bd082972629d3d68c95c8bcf902b55a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 10:24:48 +0100 Subject: tracing: rename ftrace_printk() => trace_printk() Impact: cleanup Use a more generic name - this also allows the prototype to move to kernel.h and be generally available to kernel developers who want to do some quick tracing. Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 6 +++--- include/linux/ftrace.h | 18 +++++++++--------- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace.h | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 2041ee951c1a..22614bef6359 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -1466,11 +1466,11 @@ want, depending on your needs. You can put some comments on specific functions by using -ftrace_printk() For example, if you want to put a comment inside +trace_printk() For example, if you want to put a comment inside the __might_sleep() function, you just have to include - and call ftrace_printk() inside __might_sleep() + and call trace_printk() inside __might_sleep() -ftrace_printk("I'm a comment!\n") +trace_printk("I'm a comment!\n") will produce: diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..fbb9c364e166 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -329,11 +329,11 @@ extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); /** - * ftrace_printk - printf formatting in the ftrace buffer + * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing * - * Note: __ftrace_printk is an internal function for ftrace_printk and - * the @ip is passed in via the ftrace_printk macro. + * Note: __trace_printk is an internal function for trace_printk and + * the @ip is passed in via the trace_printk macro. * * This function allows a kernel developer to debug fast path sections * that printk is not appropriate for. By scattering in various @@ -341,14 +341,14 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * where problems are occurring. * * This is intended as a debugging tool for the developer only. - * Please refrain from leaving ftrace_printks scattered around in + * Please refrain from leaving trace_printks scattered around in * your code. */ -# define ftrace_printk(fmt...) 
__ftrace_printk(_THIS_IP_, fmt) +# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) extern int -__ftrace_printk(unsigned long ip, const char *fmt, ...) +__trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); @@ -356,13 +356,13 @@ extern void ftrace_dump(void); static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int -ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } static inline void ftrace_off_permanent(void) { } static inline int -ftrace_printk(const char *fmt, ...) +trace_printk(const char *fmt, ...) { return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1ef43999d9e..c0e9c1263393 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -48,7 +48,7 @@ unsigned long __read_mostly tracing_thresh; * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent - * insertions into the ring-buffer such as ftrace_printk could occurred + * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. */ static bool __read_mostly tracing_selftest_running; @@ -291,7 +291,7 @@ static const char *trace_options[] = { "block", "stacktrace", "sched-tree", - "ftrace_printk", + "trace_printk", "ftrace_preempt", "branch", "annotate", @@ -3768,7 +3768,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -int __ftrace_printk(unsigned long ip, const char *fmt, ...) +int __trace_printk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -3781,7 +3781,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(__ftrace_printk); +EXPORT_SYMBOL_GPL(__trace_printk); int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 12cd119cca32..8beff03fda68 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -115,7 +115,7 @@ struct userstack_entry { }; /* - * ftrace_printk entry: + * trace_printk entry: */ struct print_entry { struct trace_entry ent; -- cgit v1.2.3-71-gd317 From 526211bc58c4b3265352801c5a7f469af5c34711 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 10:28:45 +0100 Subject: tracing: move utility functions from ftrace.h to kernel.h Make common utility functions such as trace_printk() and tracing_start()/tracing_stop() generally available to kernel code. 
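A minimal usage sketch of what this enables (the helper below is hypothetical; only linux/kernel.h is needed, no ftrace internals):

#include <linux/kernel.h>

/* hypothetical fast-path helper instrumented for quick debugging */
static void handle_item(void *item, int len)
{
	trace_printk("item=%p len=%d\n", item, len);
	/* ... actual work ... */
}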
Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 58 ++------------------------------------------------ include/linux/kernel.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fbb9c364e166..5b64303ec9f2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -318,62 +318,6 @@ static inline void __ftrace_enabled_restore(int enabled) # define trace_preempt_off(a0, a1) do { } while (0) #endif -#ifdef CONFIG_TRACING -extern int ftrace_dump_on_oops; - -extern void tracing_start(void); -extern void tracing_stop(void); -extern void ftrace_off_permanent(void); - -extern void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); - -/** - * trace_printk - printf formatting in the ftrace buffer - * @fmt: the printf format for printing - * - * Note: __trace_printk is an internal function for trace_printk and - * the @ip is passed in via the trace_printk macro. - * - * This function allows a kernel developer to debug fast path sections - * that printk is not appropriate for. By scattering in various - * printk like tracing in the code, a developer can quickly see - * where problems are occurring. - * - * This is intended as a debugging tool for the developer only. - * Please refrain from leaving trace_printks scattered around in - * your code. - */ -# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) -extern int -__trace_printk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) -extern int -__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); -extern void ftrace_dump(void); -#else -static inline void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } -static inline int -trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); - -static inline void tracing_start(void) { } -static inline void tracing_stop(void) { } -static inline void ftrace_off_permanent(void) { } -static inline int -trace_printk(const char *fmt, ...) -{ - return 0; -} -static inline int -ftrace_vprintk(const char *fmt, va_list ap) -{ - return 0; -} -static inline void ftrace_dump(void) { } -#endif - #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); extern void ftrace_init_module(struct module *mod, @@ -542,6 +486,8 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) return tsk->trace & TSK_TRACE_FL_GRAPH; } +extern int ftrace_dump_on_oops; + #endif /* CONFIG_TRACING */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7fa371898e3e..08bf5da86676 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -367,6 +367,64 @@ static inline char *pack_hex_byte(char *buf, u8 byte) ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) #endif +/* + * General tracing related utility functions - trace_printk(), + * tracing_start()/tracing_stop: + */ +#ifdef CONFIG_TRACING +extern void tracing_start(void); +extern void tracing_stop(void); +extern void ftrace_off_permanent(void); + +extern void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); + +/** + * trace_printk - printf formatting in the ftrace buffer + * @fmt: the printf format for printing + * + * Note: __trace_printk is an internal function for trace_printk and + * the @ip is passed in via the trace_printk macro. 
+ * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving trace_printks scattered around in + * your code. + */ +# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) +extern int +__trace_printk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) +extern int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); +extern void ftrace_dump(void); +#else +static inline void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } +static inline int +trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); + +static inline void tracing_start(void) { } +static inline void tracing_stop(void) { } +static inline void ftrace_off_permanent(void) { } +static inline int +trace_printk(const char *fmt, ...) +{ + return 0; +} +static inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} +static inline void ftrace_dump(void) { } +#endif + /* * Display an IP address in readable format. */ -- cgit v1.2.3-71-gd317 From 0012693ad4f636c720fed3802027f9427962f540 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Mar 2009 01:49:22 +0100 Subject: tracing/function-graph-tracer: use the more lightweight local clock Impact: decrease hangs risks with the graph tracer on slow systems Since the function graph tracer can spend too much time on timer interrupts, it's better now to use the more lightweight local clock. Anyway, the function graph traces are more reliable on a per cpu trace. 
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <49af243d.06e9300a.53ad.ffff840c@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 +- include/linux/ftrace.h | 13 +++++++------ kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3925ec0184b1..a85da1764b1c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -436,7 +436,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = cpu_clock(raw_smp_processor_id()); + calltime = trace_clock_local(); if (ftrace_push_return_trace(old, calltime, self_addr, &trace.depth) == -EBUSY) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..6ea62acbe4b6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1,15 +1,16 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H -#include -#include -#include -#include -#include -#include +#include #include +#include #include +#include +#include #include +#include +#include +#include #include diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c009553a8e81..e527f2f66c73 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -112,7 +112,7 @@ unsigned long ftrace_return_to_handler(void) unsigned long ret; ftrace_pop_return_trace(&trace, &ret); - trace.rettime = cpu_clock(raw_smp_processor_id()); + trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); if (unlikely(!ret)) { -- cgit v1.2.3-71-gd317 From e7d3ef13d52a126438f687a1a32da65ff926ed57 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Wed, 4 Mar 2009 11:59:46 -0800 Subject: libata: change drive ready wait after hard reset to 5s This fixes problems during resume with drives that take longer than 1s to be ready. The ATA-6 spec appears to allow 5 seconds for a drive to be ready. On one affected system, this patch changes "PM: resume devices took..." message from 17 seconds to 4 seconds, and gets rid of a lot of ugly timeout/error messages. Without this patch, the libata code moves on after 1s, tries to send a soft reset (which the drive doesn't see because it isn't ready) which also times out, then an IDENTIFY command is sent to the drive which times out, and finally the error handler will try to send another hard reset which will finally get things working. Signed-off-by: Stuart Hayes Signed-off-by: Andrew Morton Signed-off-by: Jeff Garzik --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 5d87bc09a1f5..dd818c7decd7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -275,7 +275,7 @@ enum { * advised to wait only for the following duration before * doing SRST. */ - ATA_TMOUT_PMP_SRST_WAIT = 1000, + ATA_TMOUT_PMP_SRST_WAIT = 5000, /* ATA bus states */ BUS_UNKNOWN = 0, -- cgit v1.2.3-71-gd317 From 5825627c9463581fd9e70f8285685889ae5bb9bb Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 27 Feb 2009 17:35:43 +0900 Subject: libata: fix dma_unmap_sg misuse libata passes the returned value of dma_map_sg() to dma_unmap_sg(),which is the misuse of dma_unmap_sg(). DMA-mapping.txt says: To unmap a scatterlist, just call: pci_unmap_sg(pdev, sglist, nents, direction); Again, make sure DMA activity has already finished. 
PLEASE NOTE: The 'nents' argument to the pci_unmap_sg call must be the _same_ one you passed into the pci_map_sg call, it should _NOT_ be the 'count' value _returned_ from the pci_map_sg call. Signed-off-by: FUJITA Tomonori Acked-by: Bartlomiej Zolnierkiewicz Acked-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 4 ++-- include/linux/libata.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 9fbf0595f3d4..5e324cea3019 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4612,7 +4612,7 @@ void ata_sg_clean(struct ata_queued_cmd *qc) VPRINTK("unmapping %u sg elements\n", qc->n_elem); if (qc->n_elem) - dma_unmap_sg(ap->dev, sg, qc->n_elem, dir); + dma_unmap_sg(ap->dev, sg, qc->orig_n_elem, dir); qc->flags &= ~ATA_QCFLAG_DMAMAP; qc->sg = NULL; @@ -4727,7 +4727,7 @@ static int ata_sg_setup(struct ata_queued_cmd *qc) return -1; DPRINTK("%d sg elements mapped\n", n_elem); - + qc->orig_n_elem = qc->n_elem; qc->n_elem = n_elem; qc->flags |= ATA_QCFLAG_DMAMAP; diff --git a/include/linux/libata.h b/include/linux/libata.h index dd818c7decd7..fbf064e13ad5 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -530,6 +530,7 @@ struct ata_queued_cmd { unsigned long flags; /* ATA_QCFLAG_xxx */ unsigned int tag; unsigned int n_elem; + unsigned int orig_n_elem; int dma_dir; -- cgit v1.2.3-71-gd317 From 84bda12af31f930e4200c5244aa111de2485d7b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 Mar 2009 18:53:26 +0900 Subject: libata: align ap->sector_buf ap->sector_buf is used as DMA target and should at least be aligned on cacheline. This caused problems on some embedded machines. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index fbf064e13ad5..dc18b87ed722 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -751,7 +751,8 @@ struct ata_port { acpi_handle acpi_handle; struct ata_acpi_gtm __acpi_init_gtm; /* use ata_acpi_init_gtm() */ #endif - u8 sector_buf[ATA_SECT_SIZE]; /* owned by EH */ + /* owned by EH */ + u8 sector_buf[ATA_SECT_SIZE] ____cacheline_aligned; }; /* The following initializer overrides a method to NULL whether one of -- cgit v1.2.3-71-gd317 From f2f45e5f3c921c73c913e9a9c00f21ec01c86b4d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Jan 2009 12:19:52 +0100 Subject: dma-debug: add header file and core data structures Impact: add groundwork for DMA-API debugging Signed-off-by: Joerg Roedel --- include/linux/dma-debug.h | 25 +++++++++++++++++++++++++ lib/Makefile | 2 ++ lib/dma-debug.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 include/linux/dma-debug.h create mode 100644 lib/dma-debug.c (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h new file mode 100644 index 000000000000..ce4ace707c7b --- /dev/null +++ b/include/linux/dma-debug.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2008 Advanced Micro Devices, Inc. + * + * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __DMA_DEBUG_H +#define __DMA_DEBUG_H + +struct device; + +#endif /* __DMA_DEBUG_H */ diff --git a/lib/Makefile b/lib/Makefile index 32b0e64ded27..50b48cf63e4a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -84,6 +84,8 @@ obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o +obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o + hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/dma-debug.c b/lib/dma-debug.c new file mode 100644 index 000000000000..31099712328c --- /dev/null +++ b/lib/dma-debug.c @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2008 Advanced Micro Devices, Inc. + * + * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +enum { + dma_debug_single, + dma_debug_page, + dma_debug_sg, + dma_debug_coherent, +}; + +struct dma_debug_entry { + struct list_head list; + struct device *dev; + int type; + phys_addr_t paddr; + u64 dev_addr; + u64 size; + int direction; + int sg_call_ents; + int sg_mapped_ents; +}; + -- cgit v1.2.3-71-gd317 From 849d7130001ab740a5a4778a561049841fdd77c9 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 5 Mar 2009 16:10:57 +0100 Subject: ide: allow to wrap interrupt handler Signed-off-by: Stanislaw Gruszka Cc: Andrew Victor [bart: minor checkpatch.pl / CodingStyle fixups] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 1 + drivers/ide/ide-probe.c | 7 ++++++- include/linux/ide.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9ff90cb1dbf1..a9a6c208288a 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1162,6 +1162,7 @@ out_early: return irq_ret; } +EXPORT_SYMBOL_GPL(ide_intr); /** * ide_do_drive_cmd - issue IDE special command diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index ce0818a993f6..ee8e3e7cad51 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -950,6 +950,7 @@ static int ide_port_setup_devices(ide_hwif_t *hwif) static int init_irq (ide_hwif_t *hwif) { struct ide_io_ports *io_ports = &hwif->io_ports; + irq_handler_t irq_handler; int sa = 0; mutex_lock(&ide_cfg_mtx); @@ -959,6 +960,10 @@ static int init_irq (ide_hwif_t *hwif) hwif->timer.function = &ide_timer_expiry; hwif->timer.data = (unsigned long)hwif; + irq_handler = hwif->host->irq_handler; + if (irq_handler == NULL) + irq_handler = 
ide_intr; + #if defined(__mc68000__) sa = IRQF_SHARED; #endif /* __mc68000__ */ @@ -969,7 +974,7 @@ static int init_irq (ide_hwif_t *hwif) if (io_ports->ctl_addr) hwif->tp_ops->set_irq(hwif, 1); - if (request_irq(hwif->irq, &ide_intr, sa, hwif->name, hwif)) + if (request_irq(hwif->irq, irq_handler, sa, hwif->name, hwif)) goto out_up; if (!hwif->rqsize) { diff --git a/include/linux/ide.h b/include/linux/ide.h index fe235b65207e..e0cedfe9fad4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -866,6 +866,7 @@ struct ide_host { unsigned int n_ports; struct device *dev[2]; unsigned int (*init_chipset)(struct pci_dev *); + irq_handler_t irq_handler; unsigned long host_flags; void *host_priv; ide_hwif_t *cur_port; /* for hosts requiring serialization */ -- cgit v1.2.3-71-gd317 From ebcad5aaea26da3cb2ca90b7f31a67a027eb60db Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 5 Mar 2009 16:10:59 +0100 Subject: remove stale comment from HDIO_GET_IDENTITY returns 256 words currently. Noticed by Norman Diamond. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/hdreg.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index c37e9241fae7..ed21bd3dbd25 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -511,7 +511,6 @@ struct hd_driveid { unsigned short words69_70[2]; /* reserved words 69-70 * future command overlap and queuing */ - /* HDIO_GET_IDENTITY currently returns only words 0 through 70 */ unsigned short words71_74[4]; /* reserved words 71-74 * for IDENTIFY PACKET DEVICE command */ -- cgit v1.2.3-71-gd317 From 2002c258faaa8f89543df284fdbaa9e4b171547f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Mar 2009 10:35:56 -0500 Subject: tracing: add tracing_on/tracing_off to kernel.h Impact: cleanup The functions tracing_start/tracing_stop have been moved to kernel.h. These are not the functions a developer most likely wants to use when they want to insert a place to stop tracing and restart it from user space. tracing_start/tracing_stop was created to work with things like suspend to ram, where even calling smp_processor_id() can crash the system. The tracing_start/tracing_stop was used to stop the tracer from doing anything. These are still light weight functions, but add a bit more overhead to be able to stop the tracers. They also have no interface back to userland. That is, if the kernel calls tracing_stop, userland can not start tracing. What a developer most likely wants to use is tracing_on/tracing_off. These are very light weight functions (simply sets or clears a bit). These functions just stop recording into the ring buffer. The tracers don't even know that this happens except that they would receive NULL from the ring_buffer_lock_reserve function. Also, there's a way for the user land to enable or disable this bit. In debugfs/tracing/tracing_on, a user may echo "0" (same as tracing_off()) or echo "1" (same as tracing_on()) into this file. This becomes handy when a kernel developer is debugging and wants tracing to turn off when it hits an anomaly. Then the developer can examine the trace, and restart tracing if they want to try again (echo 1 > tracing_on). This patch moves the prototypes for tracing_on/tracing_off to kernel.h and comments their use, so that a kernel developer will know how to use them. 
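A short sketch of the workflow described above (the check and its names are illustrative, not taken from the patch):

#include <linux/kernel.h>

/* hypothetical consistency check placed where an anomaly is suspected */
static void validate_state(int seen, int expected)
{
	if (seen != expected) {
		trace_printk("anomaly: seen=%d expected=%d\n", seen, expected);
		tracing_off();	/* stop recording so the trace can be examined */
	}
}

After inspecting the trace, recording can be resumed from user space with echo 1 > tracing_on (in debugfs/tracing), or from the kernel with tracing_on().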
Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 29 ++++++++++++++++++++++++++++- include/linux/ring_buffer.h | 15 --------------- 2 files changed, 28 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 08bf5da86676..d4614a8a034b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -369,8 +369,35 @@ static inline char *pack_hex_byte(char *buf, u8 byte) /* * General tracing related utility functions - trace_printk(), - * tracing_start()/tracing_stop: + * tracing_on/tracing_off and tracing_start()/tracing_stop + * + * Use tracing_on/tracing_off when you want to quickly turn on or off + * tracing. It simply enables or disables the recording of the trace events. + * This also corresponds to the user space debugfs/tracing/tracing_on + * file, which gives a means for the kernel and userspace to interact. + * Place a tracing_off() in the kernel where you want tracing to end. + * From user space, examine the trace, and then echo 1 > tracing_on + * to continue tracing. + * + * tracing_stop/tracing_start has slightly more overhead. It is used + * by things like suspend to ram where disabling the recording of the + * trace is not enough, but tracing must actually stop because things + * like calling smp_processor_id() may crash the system. + * + * Most likely, you want to use tracing_on/tracing_off. */ +#ifdef CONFIG_RING_BUFFER +void tracing_on(void); +void tracing_off(void); +/* trace_off_permanent stops recording with no way to bring it back */ +void tracing_off_permanent(void); +int tracing_is_on(void); +#else +static inline void tracing_on(void) { } +static inline void tracing_off(void) { } +static inline void tracing_off_permanent(void) { } +static inline int tracing_is_on(void) { return 0; } +#endif #ifdef CONFIG_TRACING extern void tracing_start(void); extern void tracing_stop(void); diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 79fcbc4b09d6..b1a0068a5557 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -124,21 +124,6 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); size_t ring_buffer_page_len(void *page); -/* - * The below functions are fine to use outside the tracing facility. - */ -#ifdef CONFIG_RING_BUFFER -void tracing_on(void); -void tracing_off(void); -void tracing_off_permanent(void); -int tracing_is_on(void); -#else -static inline void tracing_on(void) { } -static inline void tracing_off(void) { } -static inline void tracing_off_permanent(void) { } -static inline int tracing_is_on(void) { return 0; } -#endif - void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, -- cgit v1.2.3-71-gd317 From d42ad15b759d05a87f22b484af63987eff38ea88 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 5 Mar 2009 17:20:55 +0100 Subject: ata: add CFA specific identify data words Declare CFA specific identify data words 162 and 163 for future use. 
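Purely as an illustration of how these indices would be consumed (the bit-level meaning of words 162 and 163 is not defined by this patch, and the helper below is hypothetical):

#include <linux/ata.h>
#include <linux/kernel.h>

/* id points at the 256-word (ATA_ID_WORDS) IDENTIFY DEVICE data */
static void show_cfa_words(const u16 *id)
{
	pr_debug("CFA key mgmt word (162): %#x, CFA modes word (163): %#x\n",
		 id[ATA_ID_CFA_KEY_MGMT], id[ATA_ID_CFA_MODES]);
}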
Signed-off-by: Sergei Shtylyov Signed-off-by: Jeff Garzik [bart: update patch summary/description] Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ata.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 08a86d5cdf1b..9a061accd8b8 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -89,6 +89,8 @@ enum { ATA_ID_DLF = 128, ATA_ID_CSFO = 129, ATA_ID_CFA_POWER = 160, + ATA_ID_CFA_KEY_MGMT = 162, + ATA_ID_CFA_MODES = 163, ATA_ID_ROT_SPEED = 217, ATA_ID_PIO4 = (1 << 1), -- cgit v1.2.3-71-gd317 From 6bf078715c1998d4d10716251cc10ce45908594c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Jan 2009 12:54:42 +0100 Subject: dma-debug: add initialization code Impact: add code to initialize dma-debug core data structures Signed-off-by: Joerg Roedel --- include/linux/dma-debug.h | 14 ++++++++++ lib/dma-debug.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index ce4ace707c7b..345d5387a30d 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -20,6 +20,20 @@ #ifndef __DMA_DEBUG_H #define __DMA_DEBUG_H +#include + struct device; +#ifdef CONFIG_DMA_API_DEBUG + +extern void dma_debug_init(u32 num_entries); + +#else /* CONFIG_DMA_API_DEBUG */ + +static inline void dma_debug_init(u32 num_entries) +{ +} + +#endif /* CONFIG_DMA_API_DEBUG */ + #endif /* __DMA_DEBUG_H */ diff --git a/lib/dma-debug.c b/lib/dma-debug.c index b60914669656..5b50bb31f7c6 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -21,6 +21,7 @@ #include #include #include +#include #define HASH_SIZE 1024ULL #define HASH_FN_SHIFT 13 @@ -198,3 +199,68 @@ static void dma_entry_free(struct dma_debug_entry *entry) spin_unlock_irqrestore(&free_entries_lock, flags); } +/* + * DMA-API debugging init code + * + * The init code does two things: + * 1. Initialize core data structures + * 2. Preallocate a given number of dma_debug_entry structs + */ + +static int prealloc_memory(u32 num_entries) +{ + struct dma_debug_entry *entry, *next_entry; + int i; + + for (i = 0; i < num_entries; ++i) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_err; + + list_add_tail(&entry->list, &free_entries); + } + + num_free_entries = num_entries; + min_free_entries = num_entries; + + printk(KERN_INFO "DMA-API: preallocated %d debug entries\n", + num_entries); + + return 0; + +out_err: + + list_for_each_entry_safe(entry, next_entry, &free_entries, list) { + list_del(&entry->list); + kfree(entry); + } + + return -ENOMEM; +} + +/* + * Let the architectures decide how many entries should be preallocated. 
+ */ +void dma_debug_init(u32 num_entries) +{ + int i; + + if (global_disable) + return; + + for (i = 0; i < HASH_SIZE; ++i) { + INIT_LIST_HEAD(&dma_entry_hash[i].list); + dma_entry_hash[i].lock = SPIN_LOCK_UNLOCKED; + } + + if (prealloc_memory(num_entries) != 0) { + printk(KERN_ERR "DMA-API: debugging out of memory error " + "- disabled\n"); + global_disable = true; + + return; + } + + printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n"); +} + -- cgit v1.2.3-71-gd317 From f62bc980e6fd26434012c0d5676ecb17179d9ee4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Jan 2009 14:14:49 +0100 Subject: dma-debug: add checking for map/unmap_page/single Impact: add debug callbacks for dma_{un}map_[page|single] Signed-off-by: Joerg Roedel --- include/linux/dma-debug.h | 23 ++++++++++++++++++++ lib/dma-debug.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 345d5387a30d..65f73526ba2c 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -28,12 +28,35 @@ struct device; extern void dma_debug_init(u32 num_entries); +extern void debug_dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + int direction, dma_addr_t dma_addr, + bool map_single); + +extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction, bool map_single); + + #else /* CONFIG_DMA_API_DEBUG */ static inline void dma_debug_init(u32 num_entries) { } +static inline void debug_dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + int direction, dma_addr_t dma_addr, + bool map_single) +{ +} + +static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction, + bool map_single) +{ +} + + #endif /* CONFIG_DMA_API_DEBUG */ #endif /* __DMA_DEBUG_H */ diff --git a/lib/dma-debug.c b/lib/dma-debug.c index d0cb47a4211e..a2ed2b769685 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -566,3 +566,56 @@ out: } +void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, + size_t size, int direction, dma_addr_t dma_addr, + bool map_single) +{ + struct dma_debug_entry *entry; + + if (unlikely(global_disable)) + return; + + if (unlikely(dma_mapping_error(dev, dma_addr))) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->dev = dev; + entry->type = dma_debug_page; + entry->paddr = page_to_phys(page) + offset; + entry->dev_addr = dma_addr; + entry->size = size; + entry->direction = direction; + + if (map_single) { + entry->type = dma_debug_single; + check_for_stack(dev, page_address(page) + offset); + } + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_page); + +void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction, bool map_single) +{ + struct dma_debug_entry ref = { + .type = dma_debug_page, + .dev = dev, + .dev_addr = addr, + .size = size, + .direction = direction, + }; + + if (unlikely(global_disable)) + return; + + if (map_single) + ref.type = dma_debug_single; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_page); + -- cgit v1.2.3-71-gd317 From 972aa45ceaf65376f33aa75958fcaefc9e752fa4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Jan 2009 14:19:54 +0100 Subject: dma-debug: add add checking for map/unmap_sg Impact: add debug callbacks for dma_{un}map_sg Signed-off-by: Joerg Roedel --- include/linux/dma-debug.h | 16 +++++++++++ 
lib/dma-debug.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 65f73526ba2c..ee9fdb328549 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -23,6 +23,7 @@ #include struct device; +struct scatterlist; #ifdef CONFIG_DMA_API_DEBUG @@ -36,6 +37,11 @@ extern void debug_dma_map_page(struct device *dev, struct page *page, extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, size_t size, int direction, bool map_single); +extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction); + +extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir); #else /* CONFIG_DMA_API_DEBUG */ @@ -56,6 +62,16 @@ static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, { } +static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction) +{ +} + +static inline void debug_dma_unmap_sg(struct device *dev, + struct scatterlist *sglist, + int nelems, int dir) +{ +} #endif /* CONFIG_DMA_API_DEBUG */ diff --git a/lib/dma-debug.c b/lib/dma-debug.c index a2ed2b769685..26e40e93e0f2 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -17,6 +17,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -619,3 +620,75 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, } EXPORT_SYMBOL(debug_dma_unmap_page); +void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction) +{ + struct dma_debug_entry *entry; + struct scatterlist *s; + int i; + + if (unlikely(global_disable)) + return; + + for_each_sg(sg, s, mapped_ents, i) { + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_sg; + entry->dev = dev; + entry->paddr = sg_phys(s); + entry->size = s->length; + entry->dev_addr = s->dma_address; + entry->direction = direction; + entry->sg_call_ents = nents; + entry->sg_mapped_ents = mapped_ents; + + check_for_stack(dev, sg_virt(s)); + + add_dma_entry(entry); + } +} +EXPORT_SYMBOL(debug_dma_map_sg); + +void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) +{ + struct dma_debug_entry *entry; + struct scatterlist *s; + int mapped_ents = 0, i; + unsigned long flags; + + if (unlikely(global_disable)) + return; + + for_each_sg(sglist, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .paddr = sg_phys(s), + .dev_addr = s->dma_address, + .size = s->length, + .direction = dir, + .sg_call_ents = 0, + }; + + if (mapped_ents && i >= mapped_ents) + break; + + if (mapped_ents == 0) { + struct hash_bucket *bucket; + ref.sg_call_ents = nelems; + bucket = get_hash_bucket(&ref, &flags); + entry = hash_bucket_find(bucket, &ref); + if (entry) + mapped_ents = entry->sg_mapped_ents; + put_hash_bucket(bucket, &flags); + } + + check_unmap(&ref); + } +} +EXPORT_SYMBOL(debug_dma_unmap_sg); + -- cgit v1.2.3-71-gd317 From 6bfd4498764d6201399849d2e80fda95db7742c0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Jan 2009 14:38:50 +0100 Subject: dma-debug: add checking for [alloc|free]_coherent Impact: add debug callbacks for dma_[alloc|free]_coherent Signed-off-by: Joerg Roedel --- include/linux/dma-debug.h | 16 ++++++++++++++++ lib/dma-debug.c | 45 
+++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index ee9fdb328549..cb72dfd87326 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -43,6 +43,12 @@ extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir); +extern void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt); + +extern void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr); + #else /* CONFIG_DMA_API_DEBUG */ static inline void dma_debug_init(u32 num_entries) @@ -73,6 +79,16 @@ static inline void debug_dma_unmap_sg(struct device *dev, { } +static inline void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt) +{ +} + +static inline void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr) +{ +} + #endif /* CONFIG_DMA_API_DEBUG */ #endif /* __DMA_DEBUG_H */ diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 26e40e93e0f2..44af837f68ef 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -692,3 +692,48 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, } EXPORT_SYMBOL(debug_dma_unmap_sg); +void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt) +{ + struct dma_debug_entry *entry; + + if (unlikely(global_disable)) + return; + + if (unlikely(virt == NULL)) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_coherent; + entry->dev = dev; + entry->paddr = virt_to_phys(virt); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = DMA_BIDIRECTIONAL; + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_alloc_coherent); + +void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_coherent, + .dev = dev, + .paddr = virt_to_phys(virt), + .dev_addr = addr, + .size = size, + .direction = DMA_BIDIRECTIONAL, + }; + + if (unlikely(global_disable)) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_free_coherent); + -- cgit v1.2.3-71-gd317 From b9d2317e0c4aed02afd20022083b2a485289605d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Jan 2009 14:43:04 +0100 Subject: dma-debug: add checks for sync_single_* Impact: add debug callbacks for dma_sync_single_for_* functions Signed-off-by: Joerg Roedel --- include/linux/dma-debug.h | 20 ++++++++++++++++++++ lib/dma-debug.c | 21 +++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index cb72dfd87326..0eee7af5143f 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -49,6 +49,14 @@ extern void debug_dma_alloc_coherent(struct device *dev, size_t size, extern void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr); +extern void debug_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, size_t size, + int direction); + +extern void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction); + #else /* CONFIG_DMA_API_DEBUG */ static inline void dma_debug_init(u32 num_entries) @@ -89,6 +97,18 @@ static inline void debug_dma_free_coherent(struct 
device *dev, size_t size, { } +static inline void debug_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction) +{ +} + +static inline void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction) +{ +} + #endif /* CONFIG_DMA_API_DEBUG */ #endif /* __DMA_DEBUG_H */ diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 44af837f68ef..714cfb6ca0e4 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -737,3 +737,24 @@ void debug_dma_free_coherent(struct device *dev, size_t size, } EXPORT_SYMBOL(debug_dma_free_coherent); +void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, + size_t size, int direction) +{ + if (unlikely(global_disable)) + return; + + check_sync(dev, dma_handle, size, 0, direction, true); +} +EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); + +void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + int direction) +{ + if (unlikely(global_disable)) + return; + + check_sync(dev, dma_handle, size, 0, direction, false); +} +EXPORT_SYMBOL(debug_dma_sync_single_for_device); + -- cgit v1.2.3-71-gd317 From 948408ba3e2a67ed0f95e18ed5be1c622c2c5fc3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Jan 2009 14:55:38 +0100 Subject: dma-debug: add checks for sync_single_range_* Impact: add debug callbacks for dma_sync_single_range_for_* functions Signed-off-by: Joerg Roedel --- include/linux/dma-debug.h | 27 +++++++++++++++++++++++++++ lib/dma-debug.c | 24 ++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 0eee7af5143f..e9b903503adb 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -57,6 +57,17 @@ extern void debug_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, int direction); +extern void debug_dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, + int direction); + +extern void debug_dma_sync_single_range_for_device(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, int direction); + #else /* CONFIG_DMA_API_DEBUG */ static inline void dma_debug_init(u32 num_entries) @@ -109,6 +120,22 @@ static inline void debug_dma_sync_single_for_device(struct device *dev, { } +static inline void debug_dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, + int direction) +{ +} + +static inline void debug_dma_sync_single_range_for_device(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, + int direction) +{ +} + #endif /* CONFIG_DMA_API_DEBUG */ #endif /* __DMA_DEBUG_H */ diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 714cfb6ca0e4..d1c0ac1831b7 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -758,3 +758,27 @@ void debug_dma_sync_single_for_device(struct device *dev, } EXPORT_SYMBOL(debug_dma_sync_single_for_device); +void debug_dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, size_t size, + int direction) +{ + if (unlikely(global_disable)) + return; + + check_sync(dev, dma_handle, size, offset, direction, true); +} +EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu); + +void debug_dma_sync_single_range_for_device(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, int direction) +{ + if 
(unlikely(global_disable)) + return; + + check_sync(dev, dma_handle, size, offset, direction, false); +} +EXPORT_SYMBOL(debug_dma_sync_single_range_for_device); + -- cgit v1.2.3-71-gd317 From a31fba5d68cebf8f5fefd03e079dab94875e25f5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Jan 2009 15:01:12 +0100 Subject: dma-debug: add checks for sync_single_sg_* Impact: add debug callbacks for dma_sync_sg_* functions Signed-off-by: Joerg Roedel --- include/linux/dma-debug.h | 20 ++++++++++++++++++++ lib/dma-debug.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index e9b903503adb..4985c6c5237e 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -68,6 +68,14 @@ extern void debug_dma_sync_single_range_for_device(struct device *dev, unsigned long offset, size_t size, int direction); +extern void debug_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, + int nelems, int direction); + +extern void debug_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, + int nelems, int direction); + #else /* CONFIG_DMA_API_DEBUG */ static inline void dma_debug_init(u32 num_entries) @@ -136,6 +144,18 @@ static inline void debug_dma_sync_single_range_for_device(struct device *dev, { } +static inline void debug_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, + int nelems, int direction) +{ +} + +static inline void debug_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, + int nelems, int direction) +{ +} + #endif /* CONFIG_DMA_API_DEBUG */ #endif /* __DMA_DEBUG_H */ diff --git a/lib/dma-debug.c b/lib/dma-debug.c index d1c0ac1831b7..9d11e89c2ee2 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -782,3 +782,35 @@ void debug_dma_sync_single_range_for_device(struct device *dev, } EXPORT_SYMBOL(debug_dma_sync_single_range_for_device); +void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct scatterlist *s; + int i; + + if (unlikely(global_disable)) + return; + + for_each_sg(sg, s, nelems, i) { + check_sync(dev, s->dma_address, s->dma_length, 0, + direction, true); + } +} +EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); + +void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct scatterlist *s; + int i; + + if (unlikely(global_disable)) + return; + + for_each_sg(sg, s, nelems, i) { + check_sync(dev, s->dma_address, s->dma_length, 0, + direction, false); + } +} +EXPORT_SYMBOL(debug_dma_sync_sg_for_device); + -- cgit v1.2.3-71-gd317 From e79c1ba84c68de9161d541bd2bcc8ea65c89955c Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Fri, 27 Feb 2009 16:59:05 +0100 Subject: ssb: Add SPROM fallback support This adds SSB functionality to register a fallback SPROM image from the architecture setup code. Weird architectures exist that have half-assed SSB devices without SPROM attached to their PCI busses. The architecture can register a fallback SPROM image that is used if no SPROM is found on the SSB device. Signed-off-by: Michael Buesch Cc: Florian Fainelli Signed-off-by: John W. 
Linville --- drivers/ssb/pci.c | 14 +++++++++++++- drivers/ssb/sprom.c | 36 ++++++++++++++++++++++++++++++++++++ drivers/ssb/ssb_private.h | 1 + include/linux/ssb/ssb.h | 4 ++++ 4 files changed, 54 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ssb/pci.c b/drivers/ssb/pci.c index c958ac16423c..40ea41762247 100644 --- a/drivers/ssb/pci.c +++ b/drivers/ssb/pci.c @@ -564,6 +564,7 @@ static int sprom_extract(struct ssb_bus *bus, struct ssb_sprom *out, static int ssb_pci_sprom_get(struct ssb_bus *bus, struct ssb_sprom *sprom) { + const struct ssb_sprom *fallback; int err = -ENOMEM; u16 *buf; @@ -583,12 +584,23 @@ static int ssb_pci_sprom_get(struct ssb_bus *bus, bus->sprom_size = SSB_SPROMSIZE_WORDS_R4; sprom_do_read(bus, buf); err = sprom_check_crc(buf, bus->sprom_size); - if (err) + if (err) { + /* All CRC attempts failed. + * Maybe there is no SPROM on the device? + * If we have a fallback, use that. */ + fallback = ssb_get_fallback_sprom(); + if (fallback) { + memcpy(sprom, fallback, sizeof(*sprom)); + err = 0; + goto out_free; + } ssb_printk(KERN_WARNING PFX "WARNING: Invalid" " SPROM CRC (corrupt SPROM)\n"); + } } err = sprom_extract(bus, sprom, buf, bus->sprom_size); +out_free: kfree(buf); out: return err; diff --git a/drivers/ssb/sprom.c b/drivers/ssb/sprom.c index 3668edb39315..8943015a3eef 100644 --- a/drivers/ssb/sprom.c +++ b/drivers/ssb/sprom.c @@ -14,6 +14,9 @@ #include "ssb_private.h" +static const struct ssb_sprom *fallback_sprom; + + static int sprom2hex(const u16 *sprom, char *buf, size_t buf_len, size_t sprom_size_words) { @@ -131,3 +134,36 @@ out: return res; return err ? err : count; } + +/** + * ssb_arch_set_fallback_sprom - Set a fallback SPROM for use if no SPROM is found. + * + * @sprom: The SPROM data structure to register. + * + * With this function the architecture implementation may register a fallback + * SPROM data structure. The fallback is only used for PCI based SSB devices, + * where no valid SPROM can be found in the shadow registers. + * + * This function is useful for weird architectures that have a half-assed SSB device + * hardwired to their PCI bus. + * + * Note that it does only work with PCI attached SSB devices. PCMCIA devices currently + * don't use this fallback. + * Architectures must provide the SPROM for native SSB devices anyway, + * so the fallback also isn't used for native devices. + * + * This function is available for architecture code, only. So it is not exported. + */ +int ssb_arch_set_fallback_sprom(const struct ssb_sprom *sprom) +{ + if (fallback_sprom) + return -EEXIST; + fallback_sprom = sprom; + + return 0; +} + +const struct ssb_sprom *ssb_get_fallback_sprom(void) +{ + return fallback_sprom; +} diff --git a/drivers/ssb/ssb_private.h b/drivers/ssb/ssb_private.h index ebc32d8fe15f..57fa482abb94 100644 --- a/drivers/ssb/ssb_private.h +++ b/drivers/ssb/ssb_private.h @@ -131,6 +131,7 @@ ssize_t ssb_attr_sprom_store(struct ssb_bus *bus, const char *buf, size_t count, int (*sprom_check_crc)(const u16 *sprom, size_t size), int (*sprom_write)(struct ssb_bus *bus, const u16 *sprom)); +extern const struct ssb_sprom *ssb_get_fallback_sprom(void); /* core.c */ diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 17d9b58f6379..5ae8fa22d331 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -339,6 +339,10 @@ extern int ssb_bus_pcmciabus_register(struct ssb_bus *bus, extern void ssb_bus_unregister(struct ssb_bus *bus); +/* Set a fallback SPROM. 
+ * See kdoc at the function definition for complete documentation. */ +extern int ssb_arch_set_fallback_sprom(const struct ssb_sprom *sprom); + /* Suspend a SSB bus. * Call this from the parent bus suspend routine. */ extern int ssb_bus_suspend(struct ssb_bus *bus); -- cgit v1.2.3-71-gd317 From 6a242909b01120f6f3d571c0b75e20ec61f0d8d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:58 +0900 Subject: percpu: clean up percpu constants Impact: cleaup Make the following cleanups. * There isn't much arch-specific about PERCPU_MODULE_RESERVE. Always define it whether arch overrides PERCPU_ENOUGH_ROOM or not. * blackfin overrides PERCPU_ENOUGH_ROOM to align static area size. Do it by default. * percpu allocation sizes doesn't have much to do with the page size. Don't use PAGE_SHIFT in their definition. Signed-off-by: Tejun Heo Cc: Bryan Wu --- arch/blackfin/include/asm/percpu.h | 10 ---------- include/linux/percpu.h | 24 +++++++++++++----------- 2 files changed, 13 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/blackfin/include/asm/percpu.h b/arch/blackfin/include/asm/percpu.h index 797c0c165069..c94c7bc88c71 100644 --- a/arch/blackfin/include/asm/percpu.h +++ b/arch/blackfin/include/asm/percpu.h @@ -3,14 +3,4 @@ #include -#ifdef CONFIG_MODULES -#define PERCPU_MODULE_RESERVE 8192 -#else -#define PERCPU_MODULE_RESERVE 0 -#endif - -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) - #endif /* __ARCH_BLACKFIN_PERCPU__ */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 545b068bcb70..2d34b038fe70 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -5,6 +5,7 @@ #include /* For kmalloc() */ #include #include +#include #include @@ -52,17 +53,18 @@ #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) -/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */ -#ifndef PERCPU_ENOUGH_ROOM +/* enough to cover all DEFINE_PER_CPUs in modules */ #ifdef CONFIG_MODULES -#define PERCPU_MODULE_RESERVE 8192 +#define PERCPU_MODULE_RESERVE (8 << 10) #else -#define PERCPU_MODULE_RESERVE 0 +#define PERCPU_MODULE_RESERVE 0 #endif +#ifndef PERCPU_ENOUGH_ROOM #define PERCPU_ENOUGH_ROOM \ - (__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE) -#endif /* PERCPU_ENOUGH_ROOM */ + (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ + PERCPU_MODULE_RESERVE) +#endif /* * Must be an lvalue. 
Since @var must be a simple identifier, @@ -79,7 +81,7 @@ #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ -#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT) +#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) /* * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy @@ -96,15 +98,15 @@ #ifndef PERCPU_DYNAMIC_RESERVE # if BITS_PER_LONG > 32 # ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT) +# define PERCPU_DYNAMIC_RESERVE (24 << 10) # else -# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# define PERCPU_DYNAMIC_RESERVE (16 << 10) # endif # else # ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# define PERCPU_DYNAMIC_RESERVE (16 << 10) # else -# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT) +# define PERCPU_DYNAMIC_RESERVE (8 << 10) # endif # endif #endif /* PERCPU_DYNAMIC_RESERVE */ -- cgit v1.2.3-71-gd317 From 2441d15c97d498b18f03ae9fba262ffeae42a08b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: cosmetic renames in pcpu_setup_first_chunk() Impact: cosmetic, preparation for future changes Make the following renames in pcpur_setup_first_chunk() in preparation for future changes. * s/free_size/dyn_size/ * s/static_vm/first_vm/ * s/static_chunk/schunk/ Signed-off-by: Tejun Heo --- include/linux/percpu.h | 2 +- mm/percpu.c | 58 +++++++++++++++++++++++++------------------------- 2 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 2d34b038fe70..a0b4ea2a3354 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -118,7 +118,7 @@ typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, + size_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); /* diff --git a/mm/percpu.c b/mm/percpu.c index 3d0f5456827c..9531590e6b69 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -831,7 +831,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @free_size: free size in bytes, 0 for auto + * @dyn_size: free size for dynamic allocation in bytes, 0 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -849,12 +849,12 @@ EXPORT_SYMBOL_GPL(free_percpu); * return the same number of pages for all cpus. * * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @free_size. + * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size. * - * @free_size determines the number of free bytes after the static + * @dyn_size determines the number of free bytes after the static * area in the first chunk. If zero, whatever left is available. * Specifying non-zero value make percpu leave the area after - * @static_size + @free_size alone. + * @static_size + @dyn_size alone. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. 
percpu must not mess @@ -870,19 +870,19 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, + size_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { - static struct vm_struct static_vm; - struct pcpu_chunk *static_chunk; + static struct vm_struct first_vm; + struct pcpu_chunk *schunk; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ BUG_ON(!static_size); - BUG_ON(!unit_size && free_size); - BUG_ON(unit_size && unit_size < static_size + free_size); + BUG_ON(!unit_size && dyn_size); + BUG_ON(unit_size && unit_size < static_size + dyn_size); BUG_ON(unit_size & ~PAGE_MASK); BUG_ON(base_addr && !unit_size); BUG_ON(base_addr && populate_pte_fn); @@ -908,24 +908,24 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init static_chunk */ - static_chunk = alloc_bootmem(pcpu_chunk_struct_size); - INIT_LIST_HEAD(&static_chunk->list); - static_chunk->vm = &static_vm; + /* init static chunk */ + schunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&schunk->list); + schunk->vm = &first_vm; - if (free_size) - static_chunk->free_size = free_size; + if (dyn_size) + schunk->free_size = dyn_size; else - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + schunk->free_size = pcpu_unit_size - pcpu_static_size; - static_chunk->contig_hint = static_chunk->free_size; + schunk->contig_hint = schunk->free_size; /* allocate vm address */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; + first_vm.flags = VM_ALLOC; + first_vm.size = pcpu_chunk_size; if (!base_addr) - vm_area_register_early(&static_vm, PAGE_SIZE); + vm_area_register_early(&first_vm, PAGE_SIZE); else { /* * Pages already mapped. No need to remap into @@ -933,8 +933,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, * be mapped or unmapped by percpu and is marked * immutable. 
*/ - static_vm.addr = base_addr; - static_chunk->immutable = true; + first_vm.addr = base_addr; + schunk->immutable = true; } /* assign pages */ @@ -945,7 +945,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (!page) break; - *pcpu_chunk_pagep(static_chunk, cpu, i) = page; + *pcpu_chunk_pagep(schunk, cpu, i) = page; } BUG_ON(i < PFN_UP(pcpu_static_size)); @@ -960,20 +960,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (populate_pte_fn) { for_each_possible_cpu(cpu) for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(static_chunk, + populate_pte_fn(pcpu_chunk_addr(schunk, cpu, i)); - err = pcpu_map(static_chunk, 0, nr_pages); + err = pcpu_map(schunk, 0, nr_pages); if (err) panic("failed to setup static percpu area, err=%d\n", err); } - /* link static_chunk in */ - pcpu_chunk_relocate(static_chunk, -1); - pcpu_chunk_addr_insert(static_chunk); + /* link the first chunk in */ + pcpu_chunk_relocate(schunk, -1); + pcpu_chunk_addr_insert(schunk); /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); + pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); return pcpu_unit_size; } -- cgit v1.2.3-71-gd317 From cafe8816b217b98dc3f268d3b77445da498beb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: use negative for auto for pcpu_setup_first_chunk() arguments Impact: argument semantic cleanup In pcpu_setup_first_chunk(), zero @unit_size and @dyn_size meant auto-sizing. It's okay for @unit_size as 0 doesn't make sense but 0 dynamic reserve size is valid. Alos, if arch @dyn_size is calculated from other parameters, it might end up passing in 0 @dyn_size and malfunction when the size is automatically adjusted. This patch makes both @unit_size and @dyn_size ssize_t and use -1 for auto sizing. 
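An illustrative caller under the new convention (the wrapper and callback names are placeholders, not taken from any architecture; the x86 hunks below show the real conversions):

#include <linux/init.h>
#include <linux/percpu.h>

/* placeholder callbacks standing in for an arch's real ones */
static struct page *my_get_page(unsigned int cpu, int pageno)
{
	return NULL;	/* stub: a real arch returns the page backing (cpu, pageno) */
}

static void my_populate_pte(unsigned long addr)
{
	/* stub: a real arch populates the page table for addr here */
}

static ssize_t __init my_setup_percpu(size_t static_size)
{
	/* -1/-1: let percpu choose both the unit size and the dynamic size */
	return pcpu_setup_first_chunk(my_get_page, static_size, -1, -1,
				      NULL, my_populate_pte);
}

A caller that pre-mapped the first chunk instead passes an explicit non-negative unit_size and dyn_size together with base_addr and a NULL populate_pte_fn, as the remap path does.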
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 2 +- include/linux/percpu.h | 5 +++-- mm/percpu.c | 46 +++++++++++++++++++++++------------------- 3 files changed, 29 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index c29f301d3885..ef3a2cd3fe64 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -344,7 +344,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, pcpu4k_populate_pte); goto out_free_ar; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a0b4ea2a3354..a96fc53bbd62 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -117,8 +117,9 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t dyn_size, void *base_addr, + size_t static_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); /* diff --git a/mm/percpu.c b/mm/percpu.c index 503ccad091af..a84cf9977faf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -824,8 +824,8 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @dyn_size: free size for dynamic allocation in bytes, 0 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -842,13 +842,14 @@ EXPORT_SYMBOL_GPL(free_percpu); * indicates end of pages for the cpu. Note that @get_page_fn() must * return the same number of pages for all cpus. * - * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size. + * @unit_size, if non-negative, specifies unit size and must be + * aligned to PAGE_SIZE and equal to or larger than @static_size + + * @dyn_size. * - * @dyn_size determines the number of free bytes after the static - * area in the first chunk. If zero, whatever left is available. - * Specifying non-zero value make percpu leave the area after - * @static_size + @dyn_size alone. + * @dyn_size, if non-negative, limits the number of bytes available + * for dynamic allocation in the first chunk. Specifying non-negative + * value make percpu leave alone the area beyond @static_size + + * @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -863,8 +864,9 @@ EXPORT_SYMBOL_GPL(free_percpu); * percpu access. 
*/ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t dyn_size, void *base_addr, + size_t static_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; @@ -877,13 +879,17 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); - BUG_ON(!unit_size && dyn_size); - BUG_ON(unit_size && unit_size < static_size + dyn_size); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(base_addr && !unit_size); + if (unit_size >= 0) { + BUG_ON(unit_size < static_size + + (dyn_size >= 0 ? dyn_size : 0)); + BUG_ON(unit_size & ~PAGE_MASK); + } else { + BUG_ON(dyn_size >= 0); + BUG_ON(base_addr); + } BUG_ON(base_addr && populate_pte_fn); - if (unit_size) + if (unit_size >= 0) pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, @@ -894,6 +900,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + if (dyn_size < 0) + dyn_size = pcpu_unit_size - static_size; + /* * Allocate chunk slots. The additional last slot is for * empty chunks. @@ -909,12 +918,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - - if (dyn_size) - schunk->free_size = dyn_size; - else - schunk->free_size = pcpu_unit_size - static_size; - + schunk->free_size = dyn_size; schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -static_size; -- cgit v1.2.3-71-gd317 From edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu, module: implement reserved allocation and use it for module percpu variables Impact: add reserved allocation functionality and use it for module percpu variables This patch implements reserved allocation from the first chunk. When setting up the first chunk, arch can ask to set aside certain number of bytes right after the core static area which is available only through a separate reserved allocator. This will be used primarily for module static percpu variables on architectures with limited relocation range to ensure that the module perpcu symbols are inside the relocatable range. If reserved area is requested, the first chunk becomes reserved and isn't available for regular allocation. If the first chunk also includes piggy-back dynamic allocation area, a separate chunk mapping the same region is created to serve dynamic allocation. The first one is called static first chunk and the second dynamic first chunk. Although they share the page map, their different area map initializations guarantee they serve disjoint areas according to their purposes. If arch doesn't setup reserved area, reserved allocation is handled like any other allocation. 
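Condensed from the kernel/module.c hunk below, the usage boils down to the following sketch (error handling trimmed):

#include <linux/kernel.h>
#include <linux/percpu.h>

/*
 * Reserved allocations always come from the first chunk, which sits at
 * the percpu segment base, so module static percpu symbols stay within
 * relocation range.  Ordinary callers keep using __alloc_percpu().
 */
static void *percpu_modalloc_sketch(unsigned long size, unsigned long align)
{
	void *ptr = __alloc_reserved_percpu(size, align);

	if (!ptr)
		printk(KERN_WARNING
		       "Could not allocate %lu bytes percpu data\n", size);
	return ptr;
}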
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 8 +-- include/linux/percpu.h | 10 +-- kernel/module.c | 2 +- mm/percpu.c | 153 +++++++++++++++++++++++++++++++++++------ 4 files changed, 144 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 38e2b2a470a5..dd4eabc747c8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -217,7 +217,7 @@ proceed: pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, pcpur_size - static_size, vm.addr, NULL); goto out_free_ar; @@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, - pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, + NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a96fc53bbd62..8ff15153ae20 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -117,10 +117,10 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, - ssize_t unit_size, ssize_t dyn_size, - void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + size_t static_size, size_t reserved_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn); /* * Use this to get to a cpu's version of the per-cpu object @@ -129,6 +129,8 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, */ #define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) +extern void *__alloc_reserved_percpu(size_t size, size_t align); + #else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ struct percpu_data { diff --git a/kernel/module.c b/kernel/module.c index 1f0657ae555b..f0e04d6b67d8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -381,7 +381,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, align = PAGE_SIZE; } - ptr = __alloc_percpu(size, align); + ptr = __alloc_reserved_percpu(size, align); if (!ptr) printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", size); diff --git a/mm/percpu.c b/mm/percpu.c index 5b47d9fe65f5..ef8e169b7731 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,6 +94,11 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +/* optional reserved chunk, only accessible for reserved allocations */ +static struct pcpu_chunk *pcpu_reserved_chunk; +/* offset limit of the reserved chunk */ +static int pcpu_reserved_chunk_limit; + /* * One mutex to rule them all. 
* @@ -201,13 +206,14 @@ static void *pcpu_realloc(void *p, size_t size, size_t new_size) * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is - * moved to the slot. + * moved to the slot. Note that the reserved chunk is never put on + * chunk slots. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); - if (oslot != nslot) { + if (chunk != pcpu_reserved_chunk && oslot != nslot) { if (oslot < nslot) list_move(&chunk->list, &pcpu_slot[nslot]); else @@ -255,6 +261,15 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) struct rb_node *n, *parent; struct pcpu_chunk *chunk; + /* is it in the reserved chunk? */ + if (pcpu_reserved_chunk) { + void *start = pcpu_reserved_chunk->vm->addr; + + if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + return pcpu_reserved_chunk; + } + + /* nah... search the regular ones */ n = *pcpu_chunk_rb_search(addr, &parent); if (!n) { /* no exactly matching chunk, the parent is the closest */ @@ -713,9 +728,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) } /** - * __alloc_percpu - allocate percpu area + * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @reserved: allocate from the reserved chunk if available * * Allocate percpu area of @size bytes aligned at @align. Might * sleep. Might trigger writeouts. @@ -723,7 +739,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_percpu(size_t size, size_t align) +static void *pcpu_alloc(size_t size, size_t align, bool reserved) { void *ptr = NULL; struct pcpu_chunk *chunk; @@ -737,7 +753,18 @@ void *__alloc_percpu(size_t size, size_t align) mutex_lock(&pcpu_mutex); - /* allocate area */ + /* serve reserved allocations from the reserved chunk if available */ + if (reserved && pcpu_reserved_chunk) { + chunk = pcpu_reserved_chunk; + if (size > chunk->contig_hint) + goto out_unlock; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + goto out_unlock; + } + + /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) @@ -773,8 +800,41 @@ out_unlock: mutex_unlock(&pcpu_mutex); return ptr; } + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, false); +} EXPORT_SYMBOL_GPL(__alloc_percpu); +/** + * __alloc_reserved_percpu - allocate reserved percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align from reserved + * percpu area if arch has set it up; otherwise, allocation is served + * from the same dynamic area. Might sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. 
+ */ +void *__alloc_reserved_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, true); +} + static void pcpu_kill_chunk(struct pcpu_chunk *chunk) { WARN_ON(chunk->immutable); @@ -826,6 +886,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @base_addr: mapped address, NULL for auto @@ -844,14 +905,22 @@ EXPORT_SYMBOL_GPL(free_percpu); * indicates end of pages for the cpu. Note that @get_page_fn() must * return the same number of pages for all cpus. * + * @reserved_size, if non-zero, specifies the amount of bytes to + * reserve after the static area in the first chunk. This reserves + * the first chunk such that it's available only through reserved + * percpu allocation. This is primarily used to serve module percpu + * static areas on architectures where the addressing model has + * limited offset range for symbol relocations to guarantee module + * percpu symbols fall inside the relocatable range. + * * @unit_size, if non-negative, specifies unit size and must be * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * @dyn_size, if non-negative, limits the number of bytes available * for dynamic allocation in the first chunk. Specifying non-negative * value make percpu leave alone the area beyond @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -861,28 +930,36 @@ EXPORT_SYMBOL_GPL(free_percpu); * @populate_pte_fn is used to populate the pagetable. NULL means the * caller already populated the pagetable. * + * If the first chunk ends up with both reserved and dynamic areas, it + * is served by two chunks - one to serve the core static and reserved + * areas and the other for the dynamic area. They share the same vm + * and page map but uses different area allocation map to stay away + * from each other. The latter chunk is circulated in the chunk slots + * and available for dynamic allocation like any other chunks. + * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access. */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, + size_t static_size, size_t reserved_size, ssize_t unit_size, ssize_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; - static int smap[2]; - struct pcpu_chunk *schunk; + static int smap[2], dmap[2]; + struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ - BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); + BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || + ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); if (unit_size >= 0) { - BUG_ON(unit_size < static_size + + BUG_ON(unit_size < static_size + reserved_size + (dyn_size >= 0 ? 
dyn_size : 0)); BUG_ON(unit_size & ~PAGE_MASK); } else { @@ -895,7 +972,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size)); + PFN_UP(static_size + reserved_size)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; @@ -903,7 +980,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size; + dyn_size = pcpu_unit_size - static_size - reserved_size; /* * Allocate chunk slots. The additional last slot is for @@ -914,20 +991,49 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init static chunk */ + /* + * Initialize static chunk. If reserved_size is zero, the + * static chunk covers static area + dynamic allocation area + * in the first chunk. If reserved_size is not zero, it + * covers static area + reserved area (mostly used for module + * static percpu allocation). + */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->page = schunk->page_ar; - schunk->free_size = dyn_size; + + if (reserved_size) { + schunk->free_size = reserved_size; + pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + } else { + schunk->free_size = dyn_size; + dyn_size = 0; /* dynamic area covered */ + } schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; + pcpu_reserved_chunk_limit = static_size + schunk->free_size; + + /* init dynamic chunk if necessary */ + if (dyn_size) { + dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + INIT_LIST_HEAD(&dchunk->list); + dchunk->vm = &first_vm; + dchunk->map = dmap; + dchunk->map_alloc = ARRAY_SIZE(dmap); + dchunk->page = schunk->page_ar; /* share page map with schunk */ + + dchunk->contig_hint = dchunk->free_size = dyn_size; + dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; + dchunk->map[dchunk->map_used++] = dchunk->free_size; + } + /* allocate vm address */ first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; @@ -937,12 +1043,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, else { /* * Pages already mapped. No need to remap into - * vmalloc area. In this case the static chunk can't - * be mapped or unmapped by percpu and is marked + * vmalloc area. In this case the first chunks can't + * be mapped or unmapped by percpu and are marked * immutable. 
*/ first_vm.addr = base_addr; schunk->immutable = true; + if (dchunk) + dchunk->immutable = true; } /* assign pages */ @@ -978,8 +1086,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); + if (!dchunk) { + pcpu_chunk_relocate(schunk, -1); + pcpu_chunk_addr_insert(schunk); + } else { + pcpu_chunk_relocate(dchunk, -1); + pcpu_chunk_addr_insert(dchunk); + } /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); -- cgit v1.2.3-71-gd317 From 6b19b0c2400437a3c10059ede0e59b517092e1bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: x86, percpu: setup reserved percpu area for x86_64 Impact: fix relocation overflow during module load x86_64 uses 32bit relocations for symbol access and static percpu symbols whether in core or modules must be inside 2GB of the percpu segement base which the dynamic percpu allocator doesn't guarantee. This patch makes x86_64 reserve PERCPU_MODULE_RESERVE bytes in the first chunk so that module percpu areas are always allocated from the first chunk which is always inside the relocatable range. This problem exists for any percpu allocator but is easily triggered when using the embedding allocator because the second chunk is located beyond 2GB on it. This patch also changes the meaning of PERCPU_DYNAMIC_RESERVE such that it only indicates the size of the area to reserve for dynamic allocation as static and dynamic areas can be separate. New PERCPU_DYNAMIC_RESERVED is increased by 4k for both 32 and 64bits as the reserved area separation eats away some allocatable space and having slightly more headroom (currently between 4 and 8k after minimal boot sans module area) makes sense for common case performance. x86_32 can address anywhere from anywhere and doesn't need reserving. Mike Galbraith first reported the problem first and bisected it to the embedding percpu allocator commit. Signed-off-by: Tejun Heo Reported-by: Mike Galbraith Reported-by: Jaswinder Singh Rajput --- arch/x86/kernel/setup_percpu.c | 37 ++++++++++++++++++++++++++++--------- include/linux/percpu.h | 35 ++++++++++++----------------------- 2 files changed, 40 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index dd4eabc747c8..efa615f2bf43 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -42,6 +42,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/* + * On x86_64 symbols referenced from code should be reachable using + * 32bit relocations. Reserve space for static percpu variables in + * modules so that they are always served from the first chunk which + * is located at the percpu segment base. On x86_32, anything can + * address anywhere. No need to reserve space in the first chunk. + */ +#ifdef CONFIG_X86_64 +#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE +#else +#define PERCPU_FIRST_CHUNK_RESERVE 0 +#endif + /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -141,7 +154,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) { static struct vm_struct vm; pg_data_t *last; - size_t ptrs_size; + size_t ptrs_size, dyn_size; unsigned int cpu; ssize_t ret; @@ -169,12 +182,14 @@ proceed: * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. 
*/ - pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); if (pcpur_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } + dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); @@ -217,8 +232,9 @@ proceed: pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, - pcpur_size - static_size, vm.addr, NULL); + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, dyn_size, vm.addr, NULL); goto out_free_ar; enomem: @@ -276,9 +292,10 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) return -EINVAL; /* allocate and copy */ - pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - dyn_size = pcpue_size - static_size; + dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); @@ -297,7 +314,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +374,9 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, - NULL, pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, + pcpu4k_populate_pte); goto out_free_ar; enomem: diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 8ff15153ae20..54a968b4b924 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -85,31 +85,20 @@ /* * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy - * back on the first chunk if arch is manually allocating and mapping - * it for faster access (as a part of large page mapping for example). - * Note that dynamic percpu allocator covers both static and dynamic - * areas, so these values are bigger than PERCPU_MODULE_RESERVE. + * back on the first chunk for dynamic percpu allocation if arch is + * manually allocating and mapping it for faster access (as a part of + * large page mapping for example). * - * On typical configuration with modules, the following values leave - * about 8k of free space on the first chunk after boot on both x86_32 - * and 64 when module support is enabled. When module support is - * disabled, it's much tighter. + * The following values give between one and two pages of free space + * after typical minimal boot (2-way SMP, single disk and NIC) with + * both defconfig and a distro config on x86_64 and 32. More + * intelligent way to determine this would be nice. 
*/ -#ifndef PERCPU_DYNAMIC_RESERVE -# if BITS_PER_LONG > 32 -# ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (24 << 10) -# else -# define PERCPU_DYNAMIC_RESERVE (16 << 10) -# endif -# else -# ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (16 << 10) -# else -# define PERCPU_DYNAMIC_RESERVE (8 << 10) -# endif -# endif -#endif /* PERCPU_DYNAMIC_RESERVE */ +#if BITS_PER_LONG > 32 +#define PERCPU_DYNAMIC_RESERVE (20 << 10) +#else +#define PERCPU_DYNAMIC_RESERVE (12 << 10) +#endif extern void *pcpu_base_addr; -- cgit v1.2.3-71-gd317 From 0e39ac444636ff5be39b26f1cb56d79594654dda Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 6 Mar 2009 10:35:52 -0500 Subject: tracing, Text Edit Lock - Architecture Independent Code This is an architecture-independent synchronization around kernel text modifications, through the use of a global mutex. A mutex has been chosen so that kprobes, the main user of this, can sleep during memory allocation between the memory read of the instructions it must replace and the memory write of the breakpoint. Another user of this interface: immediate values. Paravirt and alternatives are always done when SMP is inactive, so there is no need to use locks. Signed-off-by: Mathieu Desnoyers LKML-Reference: <49B142D8.7020601@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/memory.h | 6 ++++++ mm/memory.c | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 3fdc10806d31..86a6c0f0518d 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -99,4 +99,10 @@ enum mem_add_context { BOOT, HOTPLUG }; #define hotplug_memory_notifier(fn, pri) do { } while (0) #endif +/* + * Kernel text modification mutex, used for code patching. Users of this lock + * can sleep. + */ +extern struct mutex text_mutex; + #endif /* _LINUX_MEMORY_H_ */ diff --git a/mm/memory.c b/mm/memory.c index baa999e87cd2..05fab3bc5b4b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include #include #include @@ -99,6 +101,14 @@ int randomize_va_space __read_mostly = 2; #endif +/* + * mutex protecting text section modification (dynamic code patching). + * some users need to sleep (allocating memory...) while they hold this lock. + * + * NOT exported to modules - patching kernel text is a really delicate matter. + */ +DEFINE_MUTEX(text_mutex); + static int __init disable_randmaps(char *s) { randomize_va_space = 0; -- cgit v1.2.3-71-gd317 From 4370aa4aa75391a5e2e06bccb0919109f725ed8e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:46 +0100 Subject: vsprintf: add binary printf Impact: add new APIs for binary trace printk infrastructure vbin_printf(): write args to a binary buffer; strings are copied when "%s" is encountered.
bstr_printf(): read from binary buffer for args and format a string [fweisbec@gmail.com: rebase] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds LKML-Reference: <1236356510-8381-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/string.h | 7 + lib/Kconfig | 3 + lib/vsprintf.c | 442 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 452 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index d18fc198aa2f..27ac31784ad2 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -10,6 +10,7 @@ #include /* for inline */ #include /* for size_t */ #include /* for NULL */ +#include extern char *strndup_user(const char __user *, long); @@ -111,6 +112,12 @@ extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); +#ifdef CONFIG_BINARY_PRINTF +int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); +int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); +int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); +#endif + extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); diff --git a/lib/Kconfig b/lib/Kconfig index 03c2c24b9083..97d62cf091a7 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -2,6 +2,9 @@ # Library configuration # +config BINARY_PRINTF + def_bool n + menu "Library routines" config BITREVERSE diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0fbd0121d91d..3543bbe8b1bc 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1058,6 +1058,448 @@ int sprintf(char * buf, const char *fmt, ...) } EXPORT_SYMBOL(sprintf); +#ifdef CONFIG_BINARY_PRINTF +/* + * bprintf service: + * vbin_printf() - VA arguments to binary data + * bstr_printf() - Binary data to text string + */ + +/** + * vbin_printf - Parse a format string and place args' binary value in a buffer + * @bin_buf: The buffer to place args' binary value + * @size: The size of the buffer(by words(32bits), not characters) + * @fmt: The format string to use + * @args: Arguments for the format string + * + * The format follows C99 vsnprintf, except %n is ignored, and its argument + * is skiped. + * + * The return value is the number of words(32bits) which would be generated for + * the given input. + * + * NOTE: + * If the return value is greater than @size, the resulting bin_buf is NOT + * valid for bstr_printf(). 
+ */ +int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) +{ + char *str, *end; + int qualifier; + + str = (char *)bin_buf; + end = (char *)(bin_buf + size); + +#define save_arg(type) \ +do { \ + if (sizeof(type) == 8) { \ + unsigned long long value; \ + str = PTR_ALIGN(str, sizeof(u32)); \ + value = va_arg(args, unsigned long long); \ + if (str + sizeof(type) <= end) { \ + *(u32 *)str = *(u32 *)&value; \ + *(u32 *)(str + 4) = *((u32 *)&value + 1); \ + } \ + } else { \ + unsigned long value; \ + str = PTR_ALIGN(str, sizeof(type)); \ + value = va_arg(args, int); \ + if (str + sizeof(type) <= end) \ + *(typeof(type) *)str = (type)value; \ + } \ + str += sizeof(type); \ +} while (0) + + for (; *fmt ; ++fmt) { + if (*fmt != '%') + continue; + +repeat: + /* parse flags */ + ++fmt; /* this also skips first '%' */ + if (*fmt == '-' || *fmt == '+' || *fmt == ' ' + || *fmt == '#' || *fmt == '0') + goto repeat; + + /* parse field width */ + if (isdigit(*fmt)) + skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + save_arg(int); + } + + /* parse the precision */ + if (*fmt == '.') { + ++fmt; + if (isdigit(*fmt)) + skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + save_arg(int); + } + } + + /* parse the conversion qualifier */ + qualifier = -1; + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || + *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { + qualifier = *fmt; + ++fmt; + if (qualifier == 'l' && *fmt == 'l') { + qualifier = 'L'; + ++fmt; + } + } + + /* parse format type */ + switch (*fmt) { + case 'c': + save_arg(char); + continue; + case 's': { + /* save the string argument */ + const char *save_str = va_arg(args, char *); + size_t len; + if ((unsigned long)save_str > (unsigned long)-PAGE_SIZE + || (unsigned long)save_str < PAGE_SIZE) + save_str = ""; + len = strlen(save_str); + if (str + len + 1 < end) + memcpy(str, save_str, len + 1); + str += len + 1; + continue; + } + case 'p': + save_arg(void *); + /* skip all alphanumeric pointer suffixes */ + while (isalnum(fmt[1])) + fmt++; + continue; + case 'n': { + /* skip %n 's argument */ + void *skip_arg; + if (qualifier == 'l') + skip_arg = va_arg(args, long *); + else if (qualifier == 'Z' || qualifier == 'z') + skip_arg = va_arg(args, size_t *); + else + skip_arg = va_arg(args, int *); + continue; + } + case 'o': + case 'x': + case 'X': + case 'd': + case 'i': + case 'u': + /* save arg for case: 'o', 'x', 'X', 'd', 'i', 'u' */ + if (qualifier == 'L') + save_arg(long long); + else if (qualifier == 'l') + save_arg(unsigned long); + else if (qualifier == 'Z' || qualifier == 'z') + save_arg(size_t); + else if (qualifier == 't') + save_arg(ptrdiff_t); + else if (qualifier == 'h') + save_arg(short); + else + save_arg(int); + continue; + default: + if (!*fmt) + --fmt; + continue; + } + } +#undef save_arg + + return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; +} +EXPORT_SYMBOL_GPL(vbin_printf); + +/** + * bstr_printf - Format a string from binary arguments and place it in a buffer + * @buf: The buffer to place the result into + * @size: The size of the buffer, including the trailing null space + * @fmt: The format string to use + * @bin_buf: Binary arguments for the format string + * + * This function like C99 vsnprintf, but the difference is that vsnprintf gets + * arguments from stack, and bstr_printf gets arguments from @bin_buf which is + * a binary buffer that generated by vbin_printf. 
+ * + * The format follows C99 vsnprintf, but has some extensions: + * %pS output the name of a text symbol + * %pF output the name of a function pointer + * %pR output the address range in a struct resource + * %n is ignored + * + * The return value is the number of characters which would + * be generated for the given input, excluding the trailing + * '\0', as per ISO C99. If you want to have the exact + * number of characters written into @buf as return value + * (not including the trailing '\0'), use vscnprintf(). If the + * return is greater than or equal to @size, the resulting + * string is truncated. + */ +int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +{ + unsigned long long num; + int base; + char *str, *end, c; + const char *args = (const char *)bin_buf; + + int flags; + int field_width; + int precision; + int qualifier; + + if (unlikely((int) size < 0)) { + /* There can be only one.. */ + static char warn = 1; + WARN_ON(warn); + warn = 0; + return 0; + } + + str = buf; + end = buf + size; + +#define get_arg(type) \ +({ \ + typeof(type) value; \ + if (sizeof(type) == 8) { \ + args = PTR_ALIGN(args, sizeof(u32)); \ + *(u32 *)&value = *(u32 *)args; \ + *((u32 *)&value + 1) = *(u32 *)(args + 4); \ + } else { \ + args = PTR_ALIGN(args, sizeof(type)); \ + value = *(typeof(type) *)args; \ + } \ + args += sizeof(type); \ + value; \ +}) + + /* Make sure end is always >= buf */ + if (end < buf) { + end = ((void *)-1); + size = end - buf; + } + + for (; *fmt ; ++fmt) { + if (*fmt != '%') { + if (str < end) + *str = *fmt; + ++str; + continue; + } + + /* process flags */ + flags = 0; +repeat: + ++fmt; /* this also skips first '%' */ + switch (*fmt) { + case '-': + flags |= LEFT; + goto repeat; + case '+': + flags |= PLUS; + goto repeat; + case ' ': + flags |= SPACE; + goto repeat; + case '#': + flags |= SPECIAL; + goto repeat; + case '0': + flags |= ZEROPAD; + goto repeat; + } + + /* get field width */ + field_width = -1; + if (isdigit(*fmt)) + field_width = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + field_width = get_arg(int); + if (field_width < 0) { + field_width = -field_width; + flags |= LEFT; + } + } + + /* get the precision */ + precision = -1; + if (*fmt == '.') { + ++fmt; + if (isdigit(*fmt)) + precision = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + precision = get_arg(int); + } + if (precision < 0) + precision = 0; + } + + /* get the conversion qualifier */ + qualifier = -1; + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || + *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { + qualifier = *fmt; + ++fmt; + if (qualifier == 'l' && *fmt == 'l') { + qualifier = 'L'; + ++fmt; + } + } + + /* default base */ + base = 10; + + switch (*fmt) { + case 'c': + if (!(flags & LEFT)) { + while (--field_width > 0) { + if (str < end) + *str = ' '; + ++str; + } + } + c = (unsigned char) get_arg(char); + if (str < end) + *str = c; + ++str; + while (--field_width > 0) { + if (str < end) + *str = ' '; + ++str; + } + continue; + + case 's':{ + const char *str_arg = args; + size_t len = strlen(str_arg); + args += len + 1; + str = string(str, end, (char *)str_arg, field_width, + precision, flags); + continue; + } + + case 'p': + str = pointer(fmt+1, str, end, get_arg(void *), + field_width, precision, flags); + /* Skip all alphanumeric pointer suffixes */ + while (isalnum(fmt[1])) + fmt++; + continue; + + case 'n': + /* skip %n */ + continue; + + case '%': + if (str < end) + *str = '%'; + ++str; + 
continue; + + /* integer number formats - set up the flags and "break" */ + case 'o': + base = 8; + break; + + case 'x': + flags |= SMALL; + case 'X': + base = 16; + break; + + case 'd': + case 'i': + flags |= SIGN; + case 'u': + break; + + default: + if (str < end) + *str = '%'; + ++str; + if (*fmt) { + if (str < end) + *str = *fmt; + ++str; + } else { + --fmt; + } + continue; + } + if (qualifier == 'L') + num = get_arg(long long); + else if (qualifier == 'l') { + num = get_arg(unsigned long); + if (flags & SIGN) + num = (signed long) num; + } else if (qualifier == 'Z' || qualifier == 'z') { + num = get_arg(size_t); + } else if (qualifier == 't') { + num = get_arg(ptrdiff_t); + } else if (qualifier == 'h') { + num = (unsigned short) get_arg(short); + if (flags & SIGN) + num = (signed short) num; + } else { + num = get_arg(unsigned int); + if (flags & SIGN) + num = (signed int) num; + } + str = number(str, end, num, base, + field_width, precision, flags); + } + if (size > 0) { + if (str < end) + *str = '\0'; + else + end[-1] = '\0'; + } +#undef get_arg + + /* the trailing null byte doesn't count towards the total */ + return str - buf; +} +EXPORT_SYMBOL_GPL(bstr_printf); + +/** + * bprintf - Parse a format string and place args' binary value in a buffer + * @bin_buf: The buffer to place args' binary value + * @size: The size of the buffer(by words(32bits), not characters) + * @fmt: The format string to use + * @...: Arguments for the format string + * + * The function returns the number of words(u32) written + * into @bin_buf. + */ +int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = vbin_printf(bin_buf, size, fmt, args); + va_end(args); + return ret; +} +EXPORT_SYMBOL_GPL(bprintf); + +#endif /* CONFIG_BINARY_PRINTF */ + /** * vsscanf - Unformat a buffer into a list of arguments * @buf: input buffer -- cgit v1.2.3-71-gd317 From 1427cdf0592368bdec57276edaf714040ee8744f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:47 +0100 Subject: tracing: infrastructure for supporting binary record Impact: save on memory for tracing Current tracers typically use a struct (like struct ftrace_entry, struct ctx_switch_entry, struct special_entry, etc.) to record a binary event. These structs can only record their own kind of events. A new kind of tracer needs a new struct and a lot of code to handle it. So we need a generic binary record for events. This infrastructure is for that purpose.
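As a rough sketch of how the two halves of this binary printf API pair up (not part of any patch here; the capture()/render() helpers and buffer sizes are invented for illustration, only vbin_printf() and bstr_printf() come from the lib/vsprintf.c hunks above):

#include <linux/kernel.h>	/* ARRAY_SIZE() */
#include <linux/string.h>	/* vbin_printf(), bstr_printf() */
#include <stdarg.h>

static u32 bin_buf[64];		/* sized in 32-bit words, not bytes */
static char text[256];

/* Trace-time path: cheap, only the raw argument words are stored. */
static int capture(const char *fmt, ...)
{
	va_list args;
	int words;

	va_start(args, fmt);
	words = vbin_printf(bin_buf, ARRAY_SIZE(bin_buf), fmt, args);
	va_end(args);

	/* A result larger than the buffer means bin_buf is not usable. */
	if (words < 0 || words > ARRAY_SIZE(bin_buf))
		return -1;
	return words;
}

/* Read-out path: the expensive text formatting is deferred until here. */
static void render(const char *fmt)
{
	bstr_printf(text, sizeof(text), fmt, bin_buf);
}

The same format string has to be handed to both calls, which is why the tracing patches below keep the format pointer alive next to the saved argument words.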
[fweisbec@gmail.com: rebase against latest -tip, make it safe while sched tracing as reported by Steven Rostedt] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 3 ++ kernel/trace/Kconfig | 6 +++ kernel/trace/Makefile | 1 + kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++ kernel/trace/trace.h | 12 ++++++ kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.c | 75 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 240 insertions(+) create mode 100644 kernel/trace/trace_bprintk.c (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 498769425eb2..1c9cdca02580 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,6 +223,9 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +#endif /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 058d949a3214..ad8d3617d0a6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -97,6 +97,12 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. +config TRACE_BPRINTK + bool "Binary printk for tracing" + default y + depends on TRACING + select BINARY_PRINTF + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f44736c7574a..46557ef4c379 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o +obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6144acf2b75..ff53509e19f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3792,6 +3792,62 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +/** + * trace_vbprintk - write binary msg to tracing buffer + * + * Caller must insure @fmt are valid when msg is in tracing buffer. 
+ */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct bprintk_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (tracing_disabled || !trace_bprintk_enable) + return 0; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8beff03fda68..0f5077f8f957 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -124,6 +125,16 @@ struct print_entry { char buf[]; }; +struct bprintk_entry { + struct trace_entry ent; + unsigned long ip; + const char *fmt; + u32 buf[]; +}; +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_bprintk_enable; +#endif + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -285,6 +296,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c new file mode 100644 index 000000000000..1f8e532c3fb9 --- /dev/null +++ b/kernel/trace/trace_bprintk.c @@ -0,0 +1,87 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +/* binary printk basic */ +static DEFINE_MUTEX(btrace_mutex); +static int btrace_metadata_count; + +static inline void lock_btrace(void) +{ + mutex_lock(&btrace_mutex); +} + +static inline void unlock_btrace(void) +{ + mutex_unlock(&btrace_mutex); +} + +static void get_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count++; + unlock_btrace(); +} + +static void put_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count--; + unlock_btrace(); +} + +/* events tracer */ +int trace_bprintk_enable; + +static void start_bprintk_trace(struct trace_array *tr) +{ + get_btrace_metadata(); + tracing_reset_online_cpus(tr); + trace_bprintk_enable = 1; +} + +static void stop_bprintk_trace(struct trace_array *tr) +{ + trace_bprintk_enable = 0; + 
tracing_reset_online_cpus(tr); + put_btrace_metadata(); +} + +static int init_bprintk_trace(struct trace_array *tr) +{ + start_bprintk_trace(tr); + return 0; +} + +static struct tracer bprintk_trace __read_mostly = +{ + .name = "events", + .init = init_bprintk_trace, + .reset = stop_bprintk_trace, + .start = start_bprintk_trace, + .stop = stop_bprintk_trace, +}; + +static __init int init_bprintk(void) +{ + return register_tracer(&bprintk_trace); +} + +device_initcall(init_bprintk); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 306fef84c503..4ab71201862e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,6 +53,26 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +static int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + /** * trace_seq_puts - trace sequence printing of simple string * @s: trace sequence descriptor @@ -855,6 +875,60 @@ static struct trace_event trace_print_event = { .raw = trace_print_raw, }; +/* TRACE_BPRINTK */ +static enum print_line_t +trace_bprintk_print(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t +trace_bprintk_raw(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_bprintk_event = { + .type = TRACE_BPRINTK, + .trace = trace_bprintk_print, + .raw = trace_bprintk_raw, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -863,6 +937,7 @@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, + &trace_bprintk_event, NULL }; -- cgit v1.2.3-71-gd317 From 1ba28e02a18cbdbea123836f6c98efb09cbf59ec Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:48 +0100 Subject: tracing: add trace_bprintk() Impact: add a generic printk() for tracing, like trace_printk() trace_bprintk() uses the infrastructure to record events on ring_buffer. 
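A minimal usage sketch (not part of the patch; the function and values are invented): trace_bprintk() is called like printk(), but only a pointer to the format plus the raw binary argument values reach the ring buffer, so the format has to stay around (in practice a string literal), which is also why module format strings are copied by the notifier added below.

#include <linux/ftrace.h>	/* trace_bprintk() */

static void log_submit(int cpu, unsigned long nr_bytes)
{
	/* Records the format pointer and the two argument values only. */
	trace_bprintk("cpu %d: submitted %lu bytes\n", cpu, nr_bytes);
}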
[ fweisbec@gmail.com: ported to latest -tip, made it work if !CONFIG_MODULES, never free the format strings from modules because we can't keep track of them and conditionnaly create the ftrace format strings section (reported by Steven Rostedt) ] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 9 ++++ include/linux/ftrace.h | 21 ++++++++++ include/linux/module.h | 5 +++ kernel/module.c | 6 +++ kernel/trace/trace.c | 15 +++++++ kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++----- 6 files changed, 133 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0add6b28c366..48ade3168b13 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -69,6 +69,14 @@ #define FTRACE_EVENTS() #endif +#ifdef CONFIG_TRACING +#define TRACE_PRINTKS() VMLINUX_SYMBOL(__start___trace_bprintk_fmt) = .; \ + *(__trace_printk_fmt) /* Trace_printk fmt' pointer */ \ + VMLINUX_SYMBOL(__stop___trace_bprintk_fmt) = .; +#else +#define TRACE_PRINTKS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -100,6 +108,7 @@ *(__vermagic) /* Kernel version magic */ \ *(__markers_strings) /* Markers: strings */ \ *(__tracepoints_strings)/* Tracepoints: strings */ \ + TRACE_PRINTKS() \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1c9cdca02580..1cc8ca453a9b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -225,6 +225,27 @@ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); #ifdef CONFIG_TRACE_BPRINTK extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +extern int __trace_bprintk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + +static inline void ____trace_bprintk_check_format(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); +static inline void ____trace_bprintk_check_format(const char *fmt, ...) {} +#define __trace_bprintk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_bprintk_check_format(fmt, ##args); \ +} while (0) + +#define trace_bprintk(fmt, args...) \ +do { \ + static char *__attribute__((section("__trace_bprintk_fmt"))) \ + trace_bprintk_fmt = fmt; \ + __trace_bprintk_check_format(fmt, ##args); \ + __trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args); \ +} while (0) +#else +#define trace_bprintk trace_printk #endif /* May be defined in arch */ diff --git a/include/linux/module.h b/include/linux/module.h index 145a75528cc1..8cbec972d8e7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -329,6 +329,11 @@ struct module unsigned int num_tracepoints; #endif +#ifdef CONFIG_TRACE_BPRINTK + const char **trace_bprintk_fmt_start; + unsigned int num_trace_bprintk_fmt; +#endif + #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? 
*/ struct list_head modules_which_use_me; diff --git a/kernel/module.c b/kernel/module.c index 22d7379709da..2dece104f9a1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2158,6 +2158,12 @@ static noinline struct module *load_module(void __user *umod, &mod->num_tracepoints); #endif +#ifdef CONFIG_TRACE_BPRINTK + mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings, + "__trace_bprintk_fmt", sizeof(char *), + &mod->num_trace_bprintk_fmt); +#endif + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff53509e19f8..46b3cd7a5752 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3848,6 +3848,21 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); +int __trace_bprintk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!fmt) + return 0; + + va_start(ap, fmt); + ret = trace_vbprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_bprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c index 1f8e532c3fb9..f4c245a5cd33 100644 --- a/kernel/trace/trace_bprintk.c +++ b/kernel/trace/trace_bprintk.c @@ -19,9 +19,21 @@ #include "trace.h" +#ifdef CONFIG_MODULES + /* binary printk basic */ static DEFINE_MUTEX(btrace_mutex); -static int btrace_metadata_count; +/* + * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. + */ +static LIST_HEAD(trace_bprintk_fmt_list); + +struct trace_bprintk_fmt { + struct list_head list; + char fmt[0]; +}; + static inline void lock_btrace(void) { @@ -33,26 +45,75 @@ static inline void unlock_btrace(void) mutex_unlock(&btrace_mutex); } -static void get_btrace_metadata(void) + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { - lock_btrace(); - btrace_metadata_count++; - unlock_btrace(); + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; } -static void put_btrace_metadata(void) +static +void hold_module_trace_bprintk_format(const char **start, const char **end) { + const char **iter; lock_btrace(); - btrace_metadata_count--; + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) + + strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(tb_fmt->fmt, *iter); + *iter = tb_fmt->fmt; + } else + *iter = NULL; + } unlock_btrace(); } +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + /* 
events tracer */ int trace_bprintk_enable; static void start_bprintk_trace(struct trace_array *tr) { - get_btrace_metadata(); tracing_reset_online_cpus(tr); trace_bprintk_enable = 1; } @@ -61,7 +122,6 @@ static void stop_bprintk_trace(struct trace_array *tr) { trace_bprintk_enable = 0; tracing_reset_online_cpus(tr); - put_btrace_metadata(); } static int init_bprintk_trace(struct trace_array *tr) @@ -81,7 +141,14 @@ static struct tracer bprintk_trace __read_mostly = { static __init int init_bprintk(void) { - return register_tracer(&bprintk_trace); + int ret = register_module_notifier(&module_trace_bprintk_format_nb); + if (ret) + return ret; + + ret = register_tracer(&bprintk_trace); + if (ret) + unregister_module_notifier(&module_trace_bprintk_format_nb); + return ret; } device_initcall(init_bprintk); -- cgit v1.2.3-71-gd317 From 769b0441f438c4bb4872cb8560eb6fe51bcc09ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Mar 2009 17:21:49 +0100 Subject: tracing/core: drop the old trace_printk() implementation in favour of trace_bprintk() Impact: faster and lighter tracing Now that we have trace_bprintk(), which is faster, consumes less memory than trace_printk() and serves the same purpose, we can drop the old implementation in favour of the binary one from trace_bprintk(). This means we move the whole implementation of trace_bprintk() to trace_printk(), so the API doesn't change, except that we must now use trace_seq_bprintf() to print the TRACE_PRINT entries. Some changes result from this: - Previously, trace_bprintk depended on a single tracer and couldn't work without it. This tracer has been dropped, and the whole implementation of trace_printk() (like the module format management) is now integrated in the tracing core (comes with CONFIG_TRACING), though we keep the file trace_printk.c (previously trace_bprintk.c) where we can find the module management. Thus we don't bloat trace.c. - Change some parts to use trace_seq_bprintf() to print TRACE_PRINT entries. - Slightly change the trace_printk/trace_vprintk macros to support non-builtin format constants, and fix 'const' qualifier warnings. But this is all transparent to developers. - etc...
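For the read-out side mentioned above, a condensed sketch of what an output handler now does (it mirrors the trace_output.c hunk further down; the function name is invented and the ip/symbol prefix handling is trimmed):

static enum print_line_t render_print_entry(struct trace_iterator *iter)
{
	struct trace_seq *s = &iter->seq;
	struct print_entry *field;

	trace_assign_type(field, iter->ent);

	/* Expand the stored format against the saved argument words. */
	if (!trace_seq_bprintf(s, field->fmt, field->buf))
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}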
V2: - Rebase against last changes - Fix mispell on the changelog V3: - Rebase against last changes (moving trace_printk() to kernel.h) Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 25 ----- include/linux/kernel.h | 34 +++++- include/linux/module.h | 2 +- kernel/trace/Kconfig | 7 +- kernel/trace/Makefile | 2 +- kernel/trace/trace.c | 212 ++++++++++------------------------- kernel/trace/trace.h | 14 +-- kernel/trace/trace_bprintk.c | 154 ------------------------- kernel/trace/trace_functions_graph.c | 6 +- kernel/trace/trace_mmiotrace.c | 9 +- kernel/trace/trace_output.c | 70 ++---------- kernel/trace/trace_output.h | 2 + kernel/trace/trace_printk.c | 138 +++++++++++++++++++++++ 13 files changed, 262 insertions(+), 413 deletions(-) delete mode 100644 kernel/trace/trace_bprintk.c create mode 100644 kernel/trace/trace_printk.c (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1cc8ca453a9b..e1583f2639b0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,31 +223,6 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); -#ifdef CONFIG_TRACE_BPRINTK -extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int __trace_bprintk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); - -static inline void ____trace_bprintk_check_format(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); -static inline void ____trace_bprintk_check_format(const char *fmt, ...) {} -#define __trace_bprintk_check_format(fmt, args...) \ -do { \ - if (0) \ - ____trace_bprintk_check_format(fmt, ##args); \ -} while (0) - -#define trace_bprintk(fmt, args...) \ -do { \ - static char *__attribute__((section("__trace_bprintk_fmt"))) \ - trace_bprintk_fmt = fmt; \ - __trace_bprintk_check_format(fmt, ##args); \ - __trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args); \ -} while (0) -#else -#define trace_bprintk trace_printk -#endif - /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7aef15c4645e..4e726b9a71ec 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -423,6 +423,16 @@ extern void ftrace_off_permanent(void); extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +static inline void __attribute__ ((format (printf, 1, 2))) +____trace_printk_check_format(const char *fmt, ...) +{ +} +#define __trace_printk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_printk_check_format(fmt, ##args); \ +} while (0) + /** * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing @@ -439,13 +449,31 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * Please refrain from leaving trace_printks scattered around in * your code. */ -# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) + +#define trace_printk(fmt, args...) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __trace_printk_check_format(fmt, ##args); \ + __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ +} while (0) + extern int __trace_printk(unsigned long ip, const char *fmt, ...) 
__attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) + +#define ftrace_vprintk(fmt, vargs) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ +} while (0) + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); + extern void ftrace_dump(void); #else static inline void @@ -467,7 +495,7 @@ ftrace_vprintk(const char *fmt, va_list ap) return 0; } static inline void ftrace_dump(void) { } -#endif +#endif /* CONFIG_TRACING */ /* * Display an IP address in readable format. diff --git a/include/linux/module.h b/include/linux/module.h index 8cbec972d8e7..22d9878e868c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -329,7 +329,7 @@ struct module unsigned int num_tracepoints; #endif -#ifdef CONFIG_TRACE_BPRINTK +#ifdef CONFIG_TRACING const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; #endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ad8d3617d0a6..8e4a2a61cd75 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -52,6 +52,7 @@ config TRACING select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS select NOP_TRACER + select BINARY_PRINTF # # Minimum requirements an architecture has to meet for us to @@ -97,12 +98,6 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. -config TRACE_BPRINTK - bool "Binary printk for tracing" - default y - depends on TRACING - select BINARY_PRINTF - config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46557ef4c379..c7a2943796eb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,7 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o -obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o +obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46b3cd7a5752..cc94f8642485 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1169,6 +1169,67 @@ void trace_graph_return(struct ftrace_graph_ret *trace) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/** + * trace_vprintk - write binary msg to tracing buffer + * + */ +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct print_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = 
sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vprintk); + enum trace_file_type { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, @@ -1564,7 +1625,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%s", field->buf); + ret = trace_seq_bprintf(s, field->fmt, field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -3714,155 +3775,6 @@ static __init int tracer_init_debugfs(void) return 0; } -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) -{ - static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; - static char trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - int cpu, len = 0, size, pc; - struct print_entry *entry; - unsigned long irq_flags; - - if (tracing_disabled || tracing_selftest_running) - return 0; - - pc = preempt_count(); - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - pause_graph_tracing(); - raw_local_irq_save(irq_flags); - __raw_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - len = min(len, TRACE_BUF_SIZE-1); - trace_buf[len] = 0; - - size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->depth = depth; - - memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event); - - out_unlock: - __raw_spin_unlock(&trace_buf_lock); - raw_local_irq_restore(irq_flags); - unpause_graph_tracing(); - out: - preempt_enable_notrace(); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vprintk); - -int __trace_printk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_printk); - -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) -{ - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); -} -EXPORT_SYMBOL_GPL(__ftrace_vprintk); - -/** - * trace_vbprintk - write binary msg to tracing buffer - * - * Caller must insure @fmt are valid when msg is in tracing buffer. 
- */ -int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) -{ - static DEFINE_SPINLOCK(trace_buf_lock); - static u32 trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - struct bprintk_entry *entry; - unsigned long flags; - int resched; - int cpu, len = 0, size, pc; - - if (tracing_disabled || !trace_bprintk_enable) - return 0; - - pc = preempt_count(); - resched = ftrace_preempt_disable(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - spin_lock_irqsave(&trace_buf_lock, flags); - len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - if (len > TRACE_BUF_SIZE || len < 0) - goto out_unlock; - - size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, trace_buf, sizeof(u32) * len); - ring_buffer_unlock_commit(tr->buffer, event); - -out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); - -out: - ftrace_preempt_enable(resched); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vbprintk); - -int __trace_bprintk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!fmt) - return 0; - - va_start(ap, fmt); - ret = trace_vbprintk(ip, fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_bprintk); - static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0f5077f8f957..6140922392c8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,7 +20,6 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, - TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -120,16 +119,10 @@ struct userstack_entry { */ struct print_entry { struct trace_entry ent; - unsigned long ip; + unsigned long ip; int depth; - char buf[]; -}; - -struct bprintk_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; + const char *fmt; + u32 buf[]; }; #ifdef CONFIG_TRACE_BPRINTK extern int trace_bprintk_enable; @@ -296,7 +289,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ - IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c deleted file mode 100644 index f4c245a5cd33..000000000000 --- a/kernel/trace/trace_bprintk.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * trace binary printk - * - * Copyright (C) 2008 Lai Jiangshan - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -#ifdef CONFIG_MODULES - -/* binary printk basic */ -static DEFINE_MUTEX(btrace_mutex); -/* - * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt - * which are queued on trace_bprintk_fmt_list. 
- */ -static LIST_HEAD(trace_bprintk_fmt_list); - -struct trace_bprintk_fmt { - struct list_head list; - char fmt[0]; -}; - - -static inline void lock_btrace(void) -{ - mutex_lock(&btrace_mutex); -} - -static inline void unlock_btrace(void) -{ - mutex_unlock(&btrace_mutex); -} - - -static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) -{ - struct trace_bprintk_fmt *pos; - list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { - if (!strcmp(pos->fmt, fmt)) - return pos; - } - return NULL; -} - -static -void hold_module_trace_bprintk_format(const char **start, const char **end) -{ - const char **iter; - lock_btrace(); - for (iter = start; iter < end; iter++) { - struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); - if (tb_fmt) { - *iter = tb_fmt->fmt; - continue; - } - - tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) - + strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(tb_fmt->fmt, *iter); - *iter = tb_fmt->fmt; - } else - *iter = NULL; - } - unlock_btrace(); -} - -static int module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - if (mod->num_trace_bprintk_fmt) { - const char **start = mod->trace_bprintk_fmt_start; - const char **end = start + mod->num_trace_bprintk_fmt; - - if (val == MODULE_STATE_COMING) - hold_module_trace_bprintk_format(start, end); - } - return 0; -} - -#else /* !CONFIG_MODULES */ -__init static int -module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} -#endif /* CONFIG_MODULES */ - - -__initdata_or_module static -struct notifier_block module_trace_bprintk_format_nb = { - .notifier_call = module_trace_bprintk_format_notify, -}; - -/* events tracer */ -int trace_bprintk_enable; - -static void start_bprintk_trace(struct trace_array *tr) -{ - tracing_reset_online_cpus(tr); - trace_bprintk_enable = 1; -} - -static void stop_bprintk_trace(struct trace_array *tr) -{ - trace_bprintk_enable = 0; - tracing_reset_online_cpus(tr); -} - -static int init_bprintk_trace(struct trace_array *tr) -{ - start_bprintk_trace(tr); - return 0; -} - -static struct tracer bprintk_trace __read_mostly = -{ - .name = "events", - .init = init_bprintk_trace, - .reset = stop_bprintk_trace, - .start = start_bprintk_trace, - .stop = stop_bprintk_trace, -}; - -static __init int init_bprintk(void) -{ - int ret = register_module_notifier(&module_trace_bprintk_format_nb); - if (ret) - return ret; - - ret = register_tracer(&bprintk_trace); - if (ret) - unregister_module_notifier(&module_trace_bprintk_format_nb); - return ret; -} - -device_initcall(init_bprintk); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e527f2f66c73..453ebd3b636e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -742,7 +742,11 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, } /* The comment */ - ret = trace_seq_printf(s, "/* %s", trace->buf); + ret = trace_seq_printf(s, "/* "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, trace->fmt, trace->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c401b908e805..23e346a734ca 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,15 +254,18 @@ static enum print_line_t mmio_print_mark(struct 
trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; - const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); + ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, print->fmt, print->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4ab71201862e..ef8fd661b217 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,8 +53,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } -static int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { int len = (PAGE_SIZE - 1) - s->len; int ret; @@ -834,54 +833,12 @@ static struct trace_event trace_user_stack_event = { }; /* TRACE_PRINT */ -static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags) -{ - struct print_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_printf(s, ": %s", field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) -{ - struct print_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, - .trace = trace_print_print, - .raw = trace_print_raw, -}; - -/* TRACE_BPRINTK */ static enum print_line_t -trace_bprintk_print(struct trace_iterator *iter, int flags) +trace_print_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; + struct print_entry *field; trace_assign_type(field, entry); @@ -900,14 +857,13 @@ trace_bprintk_print(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t -trace_bprintk_raw(struct trace_iterator *iter, int flags) + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) { - struct trace_entry *entry = iter->ent; + struct print_entry *field; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!trace_seq_printf(s, ": %lx : ", field->ip)) goto partial; @@ -921,12 +877,11 @@ trace_bprintk_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_bprintk_event = { - .type = TRACE_BPRINTK, - .trace = trace_bprintk_print, - .raw = trace_bprintk_raw, - .hex = trace_nop_print, - .binary = trace_nop_print, + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .trace = trace_print_print, + .raw = trace_print_raw, }; static struct trace_event *events[] __initdata = { @@ -937,7 +892,6 
@@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, - &trace_bprintk_event, NULL }; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 8a34d688ed63..3b90e6ade1aa 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -18,6 +18,8 @@ struct trace_event { extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c new file mode 100644 index 000000000000..a50aea22e929 --- /dev/null +++ b/kernel/trace/trace_printk.c @@ -0,0 +1,138 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#ifdef CONFIG_MODULES + +/* + * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. + */ +static LIST_HEAD(trace_bprintk_fmt_list); + +/* serialize accesses to trace_bprintk_fmt_list */ +static DEFINE_MUTEX(btrace_mutex); + +struct trace_bprintk_fmt { + struct list_head list; + char fmt[0]; +}; + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) +{ + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; +} + +static +void hold_module_trace_bprintk_format(const char **start, const char **end) +{ + const char **iter; + + mutex_lock(&btrace_mutex); + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) + + strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(tb_fmt->fmt, *iter); + *iter = tb_fmt->fmt; + } else + *iter = NULL; + } + mutex_unlock(&btrace_mutex); +} + +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + +int __trace_printk(unsigned long ip, const char *fmt, ...) 
+ { + int ret; + va_list ap; + + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) + { + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + + +static __init int init_trace_printk(void) +{ + return register_module_notifier(&module_trace_bprintk_format_nb); +} + +early_initcall(init_trace_printk); -- cgit v1.2.3-71-gd317 From ab96ddec7213004b632d24dc2cdcd2df5f16f50b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 7 Mar 2009 13:39:22 -0800 Subject: Input: serio - fix protocol number for TouchIT213 Protocol 0x37 has been reserved for iNexio devices and Sahara was supposed to get 0x38. Reported-by: Claudio Nieder Signed-off-by: Dmitry Torokhov --- include/linux/serio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serio.h b/include/linux/serio.h index 1bcb357a01a1..e0417e4d3f15 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -212,7 +212,7 @@ static inline void serio_unpin_driver(struct serio *serio) #define SERIO_FUJITSU 0x35 #define SERIO_ZHENHUA 0x36 #define SERIO_INEXIO 0x37 -#define SERIO_TOUCHIT213 0x37 +#define SERIO_TOUCHIT213 0x38 #define SERIO_W8001 0x39 #endif -- cgit v1.2.3-71-gd317 From 7bffc23e56e92c14b787bf4d95249a32085bfed5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Mar 2009 10:11:36 +0100 Subject: tracing: optimize trace_printk() Impact: micro-optimization trace_printk() does this unconditionally: trace_printk_fmt = fmt; Where trace_printk_fmt is an entry into a global array. This is very SMP-unfriendly. So only write it once per bootup. Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4e726b9a71ec..7742798c9208 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -454,7 +454,10 @@ do { \ do { \ static const char *trace_printk_fmt \ __attribute__((section("__trace_printk_fmt"))); \ - trace_printk_fmt = fmt; \ + \ + if (!trace_printk_fmt) \ + trace_printk_fmt = fmt; \ + \ __trace_printk_check_format(fmt, ##args); \ __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ } while (0) @@ -467,7 +470,10 @@ __trace_printk(unsigned long ip, const char *fmt, ...) do { \ static const char *trace_printk_fmt \ __attribute__((section("__trace_printk_fmt"))); \ - trace_printk_fmt = fmt; \ + \ + if (!trace_printk_fmt) \ + trace_printk_fmt = fmt; \ + \ __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ } while (0) -- cgit v1.2.3-71-gd317 From 129f8ae9b1b5be94517da76009ea956e89104ce8 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 9 Mar 2009 15:07:33 -0400 Subject: Revert "[CPUFREQ] Disable sysfs ui for p4-clockmod." This reverts commit e088e4c9cdb618675874becb91b2fd581ee707e6. Removing the sysfs interface for p4-clockmod was flagged as a regression in bug 12826. Course of action: - Find out the remaining causes of overheating, and fix them if possible. 
ACPI should be doing the right thing automatically. If it isn't, we need to fix that. - mark p4-clockmod ui as deprecated - try again with the removal in six months. It's not really feasible to printk about the deprecation, because it needs to happen at all the sysfs entry points, which means adding a lot of strcmp("p4-clockmod".. calls to the core, which.. bleuch. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 - drivers/cpufreq/cpufreq.c | 51 +++++++++++-------------------- include/linux/cpufreq.h | 1 - 3 files changed, 18 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index b585e04cbc9e..3178c3acd97e 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = { .name = "p4-clockmod", .owner = THIS_MODULE, .attr = p4clockmod_attr, - .hide_interface = 1, }; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b55cb67435bd..d6daf3c507d3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -754,11 +754,6 @@ static struct kobj_type ktype_cpufreq = { .release = cpufreq_sysfs_release, }; -static struct kobj_type ktype_empty_cpufreq = { - .sysfs_ops = &sysfs_ops, - .release = cpufreq_sysfs_release, -}; - /** * cpufreq_add_dev - add a CPU device @@ -892,36 +887,26 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); /* prepare interface data */ - if (!cpufreq_driver->hide_interface) { - ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, - &sys_dev->kobj, "cpufreq"); + ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj, + "cpufreq"); + if (ret) + goto err_out_driver_exit; + + /* set up files for this cpu device */ + drv_attr = cpufreq_driver->attr; + while ((drv_attr) && (*drv_attr)) { + ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr)); if (ret) goto err_out_driver_exit; - - /* set up files for this cpu device */ - drv_attr = cpufreq_driver->attr; - while ((drv_attr) && (*drv_attr)) { - ret = sysfs_create_file(&policy->kobj, - &((*drv_attr)->attr)); - if (ret) - goto err_out_driver_exit; - drv_attr++; - } - if (cpufreq_driver->get) { - ret = sysfs_create_file(&policy->kobj, - &cpuinfo_cur_freq.attr); - if (ret) - goto err_out_driver_exit; - } - if (cpufreq_driver->target) { - ret = sysfs_create_file(&policy->kobj, - &scaling_cur_freq.attr); - if (ret) - goto err_out_driver_exit; - } - } else { - ret = kobject_init_and_add(&policy->kobj, &ktype_empty_cpufreq, - &sys_dev->kobj, "cpufreq"); + drv_attr++; + } + if (cpufreq_driver->get) { + ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr); + if (ret) + goto err_out_driver_exit; + } + if (cpufreq_driver->target) { + ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); if (ret) goto err_out_driver_exit; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 384b38d3e8e2..161042746afc 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -234,7 +234,6 @@ struct cpufreq_driver { int (*suspend) (struct cpufreq_policy *policy, pm_message_t pmsg); int (*resume) (struct cpufreq_policy *policy); struct freq_attr **attr; - bool hide_interface; }; /* flags */ -- cgit v1.2.3-71-gd317 From 2939b0469d04ba9ac791aca9a81625d7eb50662b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 15:47:18 -0400 
Subject: tracing: replace TP with TP_ Impact: clean up The macros TPPROTO, TPARGS, TPFMT, TPRAWFMT, and TPCMD all look a bit ugly. This patch adds an underscore to their names. Signed-off-by: Steven Rostedt --- Documentation/tracepoints.txt | 8 +-- include/linux/tracepoint.h | 6 +-- include/trace/block.h | 70 ++++++++++++------------ include/trace/irq_event_types.h | 16 +++--- include/trace/lockdep_event_types.h | 24 ++++----- include/trace/power.h | 12 ++--- include/trace/sched_event_types.h | 98 +++++++++++++++++----------------- include/trace/workqueue.h | 16 +++--- kernel/trace/trace_event_types.h | 28 +++++----- kernel/trace/trace_events_stage_2.h | 6 +-- kernel/trace/trace_events_stage_3.h | 8 +-- kernel/trace/trace_export.c | 8 +-- kernel/trace/trace_format.h | 55 ------------------- samples/tracepoints/tp-samples-trace.h | 8 +-- 14 files changed, 154 insertions(+), 209 deletions(-) delete mode 100644 kernel/trace/trace_format.h (limited to 'include/linux') diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 6f0a044f5b5e..4ff43c6de299 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -45,8 +45,8 @@ In include/trace/subsys.h : #include DECLARE_TRACE(subsys_eventname, - TPPROTO(int firstarg, struct task_struct *p), - TPARGS(firstarg, p)); + TP_PROTO(int firstarg, struct task_struct *p), + TP_ARGS(firstarg, p)); In subsys/file.c (where the tracing statement must be added) : @@ -66,10 +66,10 @@ Where : - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the +- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the function called by this tracepoint. -- TPARGS(firstarg, p) are the parameters names, same as found in the +- TP_ARGS(firstarg, p) are the parameters names, same as found in the prototype. Connecting a function (probe) to a tracepoint is done by providing a diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 152b2f03fb86..3bcc3e171443 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -31,8 +31,8 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ -#define TPPROTO(args...) args -#define TPARGS(args...) args +#define TP_PROTO(args...) args +#define TP_ARGS(args...) 
args #ifdef CONFIG_TRACEPOINTS @@ -65,7 +65,7 @@ struct tracepoint { { \ if (unlikely(__tracepoint_##name.state)) \ __DO_TRACE(&__tracepoint_##name, \ - TPPROTO(proto), TPARGS(args)); \ + TP_PROTO(proto), TP_ARGS(args)); \ } \ static inline int register_trace_##name(void (*probe)(proto)) \ { \ diff --git a/include/trace/block.h b/include/trace/block.h index 25c6a1fd5b77..25b7068b819e 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -5,72 +5,72 @@ #include DECLARE_TRACE(block_rq_abort, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_insert, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_issue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_requeue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_complete, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_bio_bounce, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_complete, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_backmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_frontmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_queue, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_getrq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_sleeprq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_plug, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_timer, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_io, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_split, - TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), - TPARGS(q, bio, pdu)); + TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), + TP_ARGS(q, bio, pdu)); DECLARE_TRACE(block_remap, - TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from, sector_t to), - TPARGS(q, bio, dev, from, to)); + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from, sector_t to), + 
TP_ARGS(q, bio, dev, from, to)); #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 65850bc5ea06..0147d9eef5f4 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -9,25 +9,25 @@ #define TRACE_SYSTEM irq TRACE_EVENT_FORMAT(irq_handler_entry, - TPPROTO(int irq, struct irqaction *action), - TPARGS(irq, action), - TPFMT("irq=%d handler=%s", irq, action->name), + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) ), - TPRAWFMT("irq %d") + TP_RAW_FMT("irq %d") ); TRACE_EVENT_FORMAT(irq_handler_exit, - TPPROTO(int irq, struct irqaction *action, int ret), - TPARGS(irq, action, ret), - TPFMT("irq=%d handler=%s return=%s", + TP_PROTO(int irq, struct irqaction *action, int ret), + TP_ARGS(irq, action, ret), + TP_FMT("irq=%d handler=%s return=%s", irq, action->name, ret ? "handled" : "unhandled"), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("irq %d ret %d") + TP_RAW_FMT("irq %d ret %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index f713d74a82b4..1f00e8b3543e 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -10,32 +10,32 @@ #ifdef CONFIG_LOCKDEP TRACE_FORMAT(lock_acquire, - TPPROTO(struct lockdep_map *lock, unsigned int subclass, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), - TPARGS(lock, subclass, trylock, read, check, next_lock, ip), - TPFMT("%s%s%s", trylock ? "try " : "", + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", read ? 
"read " : "", lock->name) ); TRACE_FORMAT(lock_release, - TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TPARGS(lock, nested, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) ); #ifdef CONFIG_LOCK_STAT TRACE_FORMAT(lock_contended, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); TRACE_FORMAT(lock_acquired, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); #endif diff --git a/include/trace/power.h b/include/trace/power.h index 38aca537e497..ef204666e983 100644 --- a/include/trace/power.h +++ b/include/trace/power.h @@ -18,15 +18,15 @@ struct power_trace { }; DECLARE_TRACE(power_start, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_mark, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_end, - TPPROTO(struct power_trace *it), - TPARGS(it)); + TP_PROTO(struct power_trace *it), + TP_ARGS(it)); #endif /* _TRACE_POWER_H */ diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a6de5c1601a0..71b14828a957 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -9,143 +9,143 @@ #define TRACE_SYSTEM sched TRACE_EVENT_FORMAT(sched_kthread_stop, - TPPROTO(struct task_struct *t), - TPARGS(t), - TPFMT("task %s:%d", t->comm, t->pid), + TP_PROTO(struct task_struct *t), + TP_ARGS(t), + TP_FMT("task %s:%d", t->comm, t->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, t->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_kthread_stop_ret, - TPPROTO(int ret), - TPARGS(ret), - TPFMT("ret=%d", ret), + TP_PROTO(int ret), + TP_ARGS(ret), + TP_FMT("ret=%d", ret), TRACE_STRUCT( TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("ret=%d") + TP_RAW_FMT("ret=%d") ); TRACE_EVENT_FORMAT(sched_wait_task, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct rq *rq, struct task_struct *p), + TP_ARGS(rq, p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d %s", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d %s", p->comm, p->pid, success ? "succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d", p->comm, p->pid, success ? 
"succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_switch, - TPPROTO(struct rq *rq, struct task_struct *prev, + TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), - TPARGS(rq, prev, next), - TPFMT("task %s:%d ==> %s:%d", + TP_ARGS(rq, prev, next), + TP_FMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, - TPCMD(memcpy(TRACE_ENTRY->next_comm, + TP_CMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), - TPRAWFMT("prev %d:%d ==> next %s:%d:%d") + TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") ); TRACE_EVENT_FORMAT(sched_migrate_task, - TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - TPARGS(p, orig_cpu, dest_cpu), - TPFMT("task %s:%d from: %d to: %d", + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TP_ARGS(p, orig_cpu, dest_cpu), + TP_FMT("task %s:%d from: %d to: %d", p->comm, p->pid, orig_cpu, dest_cpu), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, orig_cpu, orig_cpu) TRACE_FIELD(int, dest_cpu, dest_cpu) ), - TPRAWFMT("task %d from: %d to: %d") + TP_RAW_FMT("task %d from: %d to: %d") ); TRACE_EVENT_FORMAT(sched_process_free, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_exit, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_wait, - TPPROTO(struct pid *pid), - TPARGS(pid), - TPFMT("pid %d", pid_nr(pid)), + TP_PROTO(struct pid *pid), + TP_ARGS(pid), + TP_FMT("pid %d", pid_nr(pid)), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, pid_nr(pid)) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_fork, - TPPROTO(struct task_struct *parent, struct task_struct *child), - TPARGS(parent, child), - TPFMT("parent %s:%d child %s:%d", + TP_PROTO(struct task_struct *parent, struct task_struct *child), + TP_ARGS(parent, child), + TP_FMT("parent %s:%d child %s:%d", parent->comm, parent->pid, child->comm, child->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, parent, parent->pid) TRACE_FIELD(pid_t, child, child->pid) ), - TPRAWFMT("parent %d child %d") + TP_RAW_FMT("parent %d child %d") ); TRACE_EVENT_FORMAT(sched_signal_send, - TPPROTO(int sig, struct task_struct *p), - TPARGS(sig, p), - TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid), + TP_PROTO(int sig, struct task_struct *p), + TP_ARGS(sig, p), + TP_FMT("sig: %d task %s:%d", sig, p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(int, sig, sig) TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("sig: %d task %d") + TP_RAW_FMT("sig: %d task %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h index 867829df4571..7626523deeba 100644 --- a/include/trace/workqueue.h +++ b/include/trace/workqueue.h @@ -6,20 +6,20 @@ 
#include DECLARE_TRACE(workqueue_insertion, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); DECLARE_TRACE(workqueue_execution, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); /* Trace the creation of one workqueue thread on a cpu */ DECLARE_TRACE(workqueue_creation, - TPPROTO(struct task_struct *wq_thread, int cpu), - TPARGS(wq_thread, cpu)); + TP_PROTO(struct task_struct *wq_thread, int cpu), + TP_ARGS(wq_thread, cpu)); DECLARE_TRACE(workqueue_destruction, - TPPROTO(struct task_struct *wq_thread), - TPARGS(wq_thread)); + TP_PROTO(struct task_struct *wq_thread), + TP_ARGS(wq_thread)); #endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fb4eba166433..d94179aa1fc2 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -10,7 +10,7 @@ TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned long, parent_ip, parent_ip) ), - TPRAWFMT(" %lx <-- %lx") + TP_RAW_FMT(" %lx <-- %lx") ); TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, @@ -19,7 +19,7 @@ TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, TRACE_FIELD(unsigned long, graph_ent.func, func) TRACE_FIELD(int, graph_ent.depth, depth) ), - TPRAWFMT("--> %lx (%d)") + TP_RAW_FMT("--> %lx (%d)") ); TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, @@ -28,7 +28,7 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, TRACE_FIELD(unsigned long, ret.func, func) TRACE_FIELD(int, ret.depth, depth) ), - TPRAWFMT("<-- %lx (%d)") + TP_RAW_FMT("<-- %lx (%d)") ); TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, @@ -41,7 +41,7 @@ TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, @@ -54,7 +54,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, @@ -63,7 +63,7 @@ TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, TRACE_FIELD(unsigned long, arg2, arg2) TRACE_FIELD(unsigned long, arg3, arg3) ), - TPRAWFMT("(%08lx) (%08lx) (%08lx)") + TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") ); /* @@ -83,7 +83,7 @@ TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -98,7 +98,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> 
(%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -108,7 +108,7 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD_ZERO_CHAR(buf) ), - TPRAWFMT("%08lx (%d) %s") + TP_RAW_FMT("%08lx (%d) %s") ); TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, @@ -118,7 +118,7 @@ TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) TRACE_FIELD(char, correct, correct) ), - TPRAWFMT("%u:%s:%s (%u)") + TP_RAW_FMT("%u:%s:%s (%u)") ); TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, @@ -126,7 +126,7 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_FIELD(u64, from, from) TRACE_FIELD(u64, to, to) ), - TPRAWFMT("from: %llx to: %llx") + TP_RAW_FMT("from: %llx to: %llx") ); TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, @@ -136,7 +136,7 @@ TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), - TPRAWFMT("%llx->%llx type:%u state:%u") + TP_RAW_FMT("%llx->%llx type:%u state:%u") ); TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, @@ -149,7 +149,7 @@ TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) TRACE_FIELD(int, node, node) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" " flags:%x node:%d") ); @@ -159,7 +159,7 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, TRACE_FIELD(unsigned long, call_site, call_site) TRACE_FIELD(const void *, ptr, ptr) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p") + TP_RAW_FMT("type:%u call_site:%lx ptr:%p") ); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index d24a97e74aea..8e2e0f56c2a8 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -20,7 +20,7 @@ * * field = (typeof(field))entry; * - * ret = trace_seq_printf(s, "%s", "\n"); + * ret = trace_seq_printf(s, "%s", "\n"); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -44,8 +44,8 @@ field->item, -#undef TPRAWFMT -#define TPRAWFMT(args...) args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 2c8d76c7dbed..557ca52bcdcd 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -106,8 +106,8 @@ * */ -#undef TPFMT -#define TPFMT(fmt, args...) fmt "\n", ##args +#undef TP_FMT +#define TP_FMT(fmt, args...) fmt "\n", ##args #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ @@ -152,8 +152,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7162ab49d05d..e62bc10f8103 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -26,8 +26,8 @@ return 0; -#undef TPRAWFMT -#define TPRAWFMT(args...) 
args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ @@ -57,8 +57,8 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h deleted file mode 100644 index 97e59a9c82ea..000000000000 --- a/kernel/trace/trace_format.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " size:%d; offset:%d;\n", - * sizeof(field.type), - * offsetof(struct ftrace_raw_##call, - * item)); - * - * } - */ - -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} - diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h index 01724e04c556..dffdc49878af 100644 --- a/samples/tracepoints/tp-samples-trace.h +++ b/samples/tracepoints/tp-samples-trace.h @@ -5,9 +5,9 @@ #include DECLARE_TRACE(subsys_event, - TPPROTO(struct inode *inode, struct file *file), - TPARGS(inode, file)); + TP_PROTO(struct inode *inode, struct file *file), + TP_ARGS(inode, file)); DECLARE_TRACE(subsys_eventb, - TPPROTO(void), - TPARGS()); + TP_PROTO(void), + TP_ARGS()); #endif -- cgit v1.2.3-71-gd317 From da4d03020c2af32f73e8bfbab0a66620d85bb9bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 17:14:30 -0400 Subject: tracing: new format for specialized trace points Impact: clean up and enhancement The TRACE_EVENT_FORMAT macro looks quite ugly and is limited in its ability to save data as well as to print the record out. Working with Ingo Molnar, we came up with a new format that is much more pleasing to the eye of C developers. This new macro is more C style than the old macro, and is more obvious to what it does. Here's the example. The only updated macro in this patch is the sched_switch trace point. 
The old method looked like this:

 TRACE_EVENT_FORMAT(sched_switch,
	TP_PROTO(struct rq *rq, struct task_struct *prev,
		 struct task_struct *next),
	TP_ARGS(rq, prev, next),
	TP_FMT("task %s:%d ==> %s:%d",
	       prev->comm, prev->pid, next->comm, next->pid),
	TRACE_STRUCT(
		TRACE_FIELD(pid_t, prev_pid, prev->pid)
		TRACE_FIELD(int, prev_prio, prev->prio)
		TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN],
				    next_comm,
				    TP_CMD(memcpy(TRACE_ENTRY->next_comm,
						  next->comm,
						  TASK_COMM_LEN)))
		TRACE_FIELD(pid_t, next_pid, next->pid)
		TRACE_FIELD(int, next_prio, next->prio)
	),
	TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d")
	);

The above method is hard to read and requires two format fields.

The new method:

 /*
  * Tracepoint for task switches, performed by the scheduler:
  *
  * (NOTE: the 'rq' argument is not used by generic trace events,
  *        but used by the latency tracer plugin. )
  */
 TRACE_EVENT(sched_switch,

	TP_PROTO(struct rq *rq, struct task_struct *prev,
		 struct task_struct *next),

	TP_ARGS(rq, prev, next),

	TP_STRUCT__entry(
		__array(	char,	prev_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	prev_pid	)
		__field(	int,	prev_prio	)
		__array(	char,	next_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	next_pid	)
		__field(	int,	next_prio	)
	),

	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
		__entry->next_comm, __entry->next_pid, __entry->next_prio),

	TP_fast_assign(
		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
		__entry->prev_pid	= prev->pid;
		__entry->prev_prio	= prev->prio;
		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
		__entry->next_pid	= next->pid;
		__entry->next_prio	= next->prio;
	)
 );

This macro is called TRACE_EVENT and is broken up into 5 parts:

 TP_PROTO:	   the prototype of the trace point
 TP_ARGS:	   the arguments of the trace point
 TP_STRUCT__entry: the structure layout of the entry in the ring buffer
 TP_printk:	   the printk format
 TP_fast_assign:   the method used to write the entry into the ring buffer

The structure is the definition of how the event will be saved in the
ring buffer. The printk is used by the internal tracing in case of an
oops, and the kernel needs to print out the format of the record to the
console. The TP_printk thus gives a means to show the records in a
human-readable format. It is also used to print out the data from the
trace file. The TP_fast_assign is executed directly. It is basically like
a C function, where the __entry is the handle to the record.
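For illustration, a complete minimal event written with the same five sections
might look like the sketch below; the event name, arguments and fields are made
up for this example and are not part of the patch (note that at this point
TP_printk still comes before TP_fast_assign in the argument list):

 TRACE_EVENT(subsys_work_done,

	TP_PROTO(int id, int error),

	TP_ARGS(id, error),

	TP_STRUCT__entry(
		__field(	int,	id	)
		__field(	int,	error	)
	),

	TP_printk("id %d error %d", __entry->id, __entry->error),

	TP_fast_assign(
		__entry->id	= id;
		__entry->error	= error;
	)
 );

Since TRACE_EVENT expands to DECLARE_TRACE in include/linux/tracepoint.h, the
trace site simply calls trace_subsys_work_done(id, error) as with any other
tracepoint.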
Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 + include/trace/sched_event_types.h | 48 +++++++---- kernel/trace/trace.h | 5 -- kernel/trace/trace_event_types.h | 3 +- kernel/trace/trace_events.c | 159 +----------------------------------- kernel/trace/trace_events_stage_1.h | 28 ++++--- kernel/trace/trace_events_stage_2.h | 89 ++++++++++++++++---- kernel/trace/trace_events_stage_3.h | 34 +++----- kernel/trace/trace_export.c | 23 +++++- 9 files changed, 159 insertions(+), 233 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 3bcc3e171443..6b4f1bb3701e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -160,4 +160,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) +#define TRACE_EVENT(name, proto, args, struct, print, assign) \ + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 71b14828a957..aa77fb754038 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -62,25 +62,41 @@ TRACE_EVENT_FORMAT(sched_wakeup_new, TP_RAW_FMT("task %d success=%d") ); -TRACE_EVENT_FORMAT(sched_switch, +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_switch, + TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), + struct task_struct *next), + TP_ARGS(rq, prev, next), - TP_FMT("task %s:%d ==> %s:%d", - prev->comm, prev->pid, next->comm, next->pid), - TRACE_STRUCT( - TRACE_FIELD(pid_t, prev_pid, prev->pid) - TRACE_FIELD(int, prev_prio, prev->prio) - TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], - next_comm, - TP_CMD(memcpy(TRACE_ENTRY->next_comm, - next->comm, - TASK_COMM_LEN))) - TRACE_FIELD(pid_t, next_pid, next->pid) - TRACE_FIELD(int, next_prio, next->prio) + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) ), - TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") - ); + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ) +); TRACE_EVENT_FORMAT(sched_migrate_task, TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2bfb7d11fc17..c5e1d8865fe4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -751,12 +751,7 @@ struct ftrace_event_call { int (*regfunc)(void); void (*unregfunc)(void); int id; - struct dentry *raw_dir; - int raw_enabled; - int type; int (*raw_init)(void); - int (*raw_reg)(void); - void (*raw_unreg)(void); int (*show_format)(struct trace_seq *s); }; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index d94179aa1fc2..5cca4c978bde 100644 --- a/kernel/trace/trace_event_types.h +++ 
b/kernel/trace/trace_event_types.h @@ -106,9 +106,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD(char *, fmt, fmt) TRACE_FIELD_ZERO_CHAR(buf) ), - TP_RAW_FMT("%08lx (%d) %s") + TP_RAW_FMT("%08lx (%d) fmt:%p %s") ); TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fa32ca320767..1880a6438097 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -59,22 +59,12 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, call->enabled = 0; call->unregfunc(); } - if (call->raw_enabled) { - call->raw_enabled = 0; - call->raw_unreg(); - } break; case 1: - if (!call->enabled && - (call->type & TRACE_EVENT_TYPE_PRINTF)) { + if (!call->enabled) { call->enabled = 1; call->regfunc(); } - if (!call->raw_enabled && - (call->type & TRACE_EVENT_TYPE_RAW)) { - call->raw_enabled = 1; - call->raw_reg(); - } break; } } @@ -300,7 +290,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled || call->raw_enabled) + if (call->enabled) buf = "1\n"; else buf = "0\n"; @@ -346,107 +336,6 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } -static ssize_t -event_type_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[16]; - int r = 0; - - if (call->type & TRACE_EVENT_TYPE_PRINTF) - r += sprintf(buf, "printf\n"); - - if (call->type & TRACE_EVENT_TYPE_RAW) - r += sprintf(buf+r, "raw\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -event_type_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[64]; - - /* - * If there's only one type, we can't change it. - * And currently we always have printf type, and we - * may or may not have raw type. - * - * This is a redundant check, the file should be read - * only if this is the case anyway. - */ - - if (!call->raw_init) - return -EPERM; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - if (!strncmp(buf, "printf", 6) && - (!buf[6] || isspace(buf[6]))) { - - call->type = TRACE_EVENT_TYPE_PRINTF; - - /* - * If raw enabled, the disable it and enable - * printf type. - */ - if (call->raw_enabled) { - call->raw_enabled = 0; - call->raw_unreg(); - - call->enabled = 1; - call->regfunc(); - } - - } else if (!strncmp(buf, "raw", 3) && - (!buf[3] || isspace(buf[3]))) { - - call->type = TRACE_EVENT_TYPE_RAW; - - /* - * If printf enabled, the disable it and enable - * raw type. 
- */ - if (call->enabled) { - call->enabled = 0; - call->unregfunc(); - - call->raw_enabled = 1; - call->raw_reg(); - } - } else - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static ssize_t -event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[16]; - int r = 0; - - r += sprintf(buf, "printf\n"); - - if (call->raw_init) - r += sprintf(buf+r, "raw\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - #undef FIELD #define FIELD(type, name) \ #type, #name, (unsigned int)offsetof(typeof(field), name), \ @@ -470,6 +359,7 @@ static int trace_write_header(struct trace_seq *s) FIELD(int, pid), FIELD(int, tgid)); } + static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -527,13 +417,6 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = t_stop, }; -static const struct file_operations ftrace_avail_fops = { - .open = ftrace_event_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_seq_open, .read = seq_read, @@ -548,17 +431,6 @@ static const struct file_operations ftrace_enable_fops = { .write = event_enable_write, }; -static const struct file_operations ftrace_type_fops = { - .open = tracing_open_generic, - .read = event_type_read, - .write = event_type_write, -}; - -static const struct file_operations ftrace_available_types_fops = { - .open = tracing_open_generic, - .read = event_available_types_read, -}; - static const struct file_operations ftrace_event_format_fops = { .open = tracing_open_generic, .read = event_format_read, @@ -647,9 +519,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } - /* default the output to printf */ - call->type = TRACE_EVENT_TYPE_PRINTF; - call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " @@ -665,21 +534,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) "'%s/enable' entry\n", call->name); } - /* Only let type be writable, if we can change it */ - entry = debugfs_create_file("type", - call->raw_init ? 
0644 : 0444, - call->dir, call, - &ftrace_type_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/type' entry\n", call->name); - - entry = debugfs_create_file("available_types", 0444, call->dir, call, - &ftrace_available_types_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/available_types' entry\n", call->name); - /* A trace may not want to export its format */ if (!call->show_format) return 0; @@ -704,13 +558,6 @@ static __init int event_trace_init(void) if (!d_tracer) return 0; - entry = debugfs_create_file("available_events", 0444, d_tracer, - (void *)&show_event_seq_ops, - &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_events' entry\n"); - entry = debugfs_create_file("set_event", 0644, d_tracer, (void *)&show_set_event_seq_ops, &ftrace_set_event_fops); diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 3830a731424c..edfcbd3a0d1b 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -18,19 +18,23 @@ #define TRACE_FORMAT(call, proto, args, fmt) #undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name +#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) + +#undef __array +#define __array(type, item, len) type item[len]; -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +#undef __field +#define __field(type, item) type item; -#define TRACE_FIELD(type, item, assign) \ - type item; -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - type_item; +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, print, assign) \ + struct ftrace_raw_##name { \ + struct trace_entry ent; \ + tstruct \ + }; \ + static struct ftrace_event_call event_##name #include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 8e2e0f56c2a8..d91bf4c56661 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -32,23 +32,14 @@ * in binary. */ -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +#undef __entry +#define __entry field -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - field->item, +#undef TP_printk +#define TP_printk(fmt, args...) fmt "\n", args -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - field->item, - - -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ @@ -66,14 +57,76 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ \ field = (typeof(field))entry; \ \ - ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n"); \ + ret = trace_seq_printf(s, print); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ return TRACE_TYPE_HANDLED; \ } - + #include -#include "trace_format.h" +/* + * Setup the showing format of trace point. 
+ * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " size:%d; offset:%d;\n", + * sizeof(field.type), + * offsetof(struct ftrace_raw_##call, + * item)); + * + * } + */ + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __entry +#define __entry "REC" + +#undef TP_printk +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, func) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: " print); \ + \ + return ret; \ +} + #include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 41b82b93c9c7..8e398d864096 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -144,27 +144,15 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .unregfunc = ftrace_unreg_event_##call, \ } -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TP_CMD -#define TP_CMD(cmd...) 
cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY entry +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw) \ + TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - cmd; +#undef __entry +#define __entry entry -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ \ static struct ftrace_event_call event_##call; \ \ @@ -185,7 +173,7 @@ static void ftrace_raw_event_##call(proto) \ return; \ entry = ring_buffer_event_data(event); \ \ - tstruct; \ + assign; \ \ trace_current_buffer_unlock_commit(event, irq_flags, pc); \ } \ @@ -226,10 +214,8 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ .raw_init = ftrace_raw_init_event_##call, \ - .raw_reg = ftrace_raw_reg_event_##call, \ - .raw_unreg = ftrace_raw_unreg_event_##call, \ + .regfunc = ftrace_raw_reg_event_##call, \ + .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e62bc10f8103..23ae78430d58 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,7 +15,28 @@ #include "trace_output.h" -#include "trace_format.h" + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) args + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; #undef TRACE_FIELD_ZERO_CHAR #define TRACE_FIELD_ZERO_CHAR(item) \ -- cgit v1.2.3-71-gd317 From 157587d7ac555458da9f682e3250135e468470a6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 00:15:34 -0400 Subject: tracing: remove obsolete TRACE_EVENT_FORMAT macro Impact: clean up The TRACE_EVENT_FORMAT macro is no longer used by trace points and only the DECLARE_TRACE, TRACE_FORMAT or TRACE_EVENT macros should be used by them. Although the TRACE_EVENT_FORMAT macro is still used by the internal tracing utility, it should not be used in core kernel code. 
Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 --- include/trace/lockdep_event_types.h | 2 +- include/trace/sched_event_types.h | 2 +- kernel/trace/trace_events_stage_1.h | 3 --- kernel/trace/trace_events_stage_3.h | 6 +----- 5 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 6b4f1bb3701e..69b56988813d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,9 +157,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ - TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) - #define TRACE_EVENT(name, proto, args, struct, print, assign) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index 1f00e8b3543e..adccfcd2ec8f 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -1,5 +1,5 @@ -#ifndef TRACE_EVENT_FORMAT +#ifndef TRACE_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 0bbbf410e01f..fb37af672c88 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,6 +1,6 @@ /* use instead */ -#ifndef TRACE_EVENT_FORMAT +#ifndef TRACE_EVENT # error Do not include this file directly. # error Unless you know what you are doing. #endif diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index edfcbd3a0d1b..15e9bf965a18 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -17,9 +17,6 @@ #undef TRACE_FORMAT #define TRACE_FORMAT(call, proto, args, fmt) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) - #undef __array #define __array(type, item, len) type item[len]; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 8e398d864096..3ba55d4ab073 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -35,7 +35,7 @@ * } * * - * For those macros defined with TRACE_EVENT_FORMAT: + * For those macros defined with TRACE_EVENT: * * static struct ftrace_event_call event_; * @@ -144,10 +144,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .unregfunc = ftrace_unreg_event_##call, \ } -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw) \ - TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) - #undef __entry #define __entry entry -- cgit v1.2.3-71-gd317 From 6074d5b0a319fe8400ff079a3c289406ca024321 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: more flexibility for @dyn_size of pcpu_setup_first_chunk() Impact: cleanup, more flexibility for first chunk init Non-negative @dyn_size used to be allowed iff @unit_size wasn't auto. This restriction stemmed from implementation detail and made things a bit less intuitive. This patch allows @dyn_size to be specified regardless of @unit_size and swaps the positions of @dyn_size and @unit_size so that the parameter order makes more sense (static, reserved and dyn sizes followed by enclosing unit_size). 
While at it, add @unit_size >= PCPU_MIN_UNIT_SIZE sanity check. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 13 ++++++------- include/linux/percpu.h | 2 +- mm/percpu.c | 28 ++++++++++++++-------------- 3 files changed, 21 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index efa615f2bf43..e41c51f6ada1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -233,8 +233,8 @@ proceed: "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, dyn_size, vm.addr, NULL); + PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + PMD_SIZE, vm.addr, NULL); goto out_free_ar; enomem: @@ -315,9 +315,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); return pcpu_setup_first_chunk(pcpue_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, - pcpue_unit_size, dyn_size, - pcpue_ptr, NULL); + PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + pcpue_unit_size, pcpue_ptr, NULL); } /* @@ -375,8 +374,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pcpu4k_nr_static_pages, static_size); ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, - pcpu4k_populate_pte); + PERCPU_FIRST_CHUNK_RESERVE, -1, + -1, NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 54a968b4b924..fb455dcc59c7 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -107,7 +107,7 @@ typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t unit_size, ssize_t dyn_size, + ssize_t dyn_size, ssize_t unit_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); diff --git a/mm/percpu.c b/mm/percpu.c index c6f38a2aface..2f94661d3e36 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1027,8 +1027,8 @@ EXPORT_SYMBOL_GPL(free_percpu); * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -1053,14 +1053,14 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * + * @dyn_size, if non-negative, determines the number of bytes + * available for dynamic allocation in the first chunk. Specifying + * non-negative value makes percpu leave alone the area beyond + * @static_size + @reserved_size + @dyn_size. + * * @unit_size, if non-negative, specifies unit size and must be * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + @dyn_size. - * - * @dyn_size, if non-negative, limits the number of bytes available - * for dynamic allocation in the first chunk. Specifying non-negative - * value make percpu leave alone the area beyond @static_size + - * @reserved_size + @dyn_size. + * @reserved_size + if non-negative, @dyn_size. 
* * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -1083,12 +1083,14 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t unit_size, ssize_t dyn_size, + ssize_t dyn_size, ssize_t unit_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; + size_t size_sum = static_size + reserved_size + + (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; int nr_pages; @@ -1099,20 +1101,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); if (unit_size >= 0) { - BUG_ON(unit_size < static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); + BUG_ON(unit_size < size_sum); BUG_ON(unit_size & ~PAGE_MASK); - } else { - BUG_ON(dyn_size >= 0); + BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + } else BUG_ON(base_addr); - } BUG_ON(base_addr && populate_pte_fn); if (unit_size >= 0) pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size + reserved_size)); + PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; -- cgit v1.2.3-71-gd317 From 66c3a75772247c31feabefb724e082220a1ab060 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Mar 2009 16:27:48 +0900 Subject: percpu: generalize embedding first chunk setup helper Impact: code reorganization Separate out embedding first chunk setup helper from x86 embedding first chunk allocator and put it in mm/percpu.c. This will be used by the default percpu first chunk allocator and possibly by other archs. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 54 +++----------------------- include/linux/percpu.h | 4 ++ mm/percpu.c | 86 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e41c51f6ada1..400331b50a53 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -257,31 +257,13 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Embedding allocator * * The first chunk is sized to just contain the static area plus - * module and dynamic reserves, and allocated as a contiguous area - * using bootmem allocator and used as-is without being mapped into - * vmalloc area. This enables the first chunk to piggy back on the - * linear physical PMD mapping and doesn't add any additional pressure - * to TLB. Note that if the needed size is smaller than the minimum - * unit size, the leftover is returned to the bootmem allocator. + * module and dynamic reserves and embedded into linear physical + * mapping so that it can use PMD mapping without additional TLB + * pressure. 
*/ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; - -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpue_size) - return NULL; - - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); -} - static ssize_t __init setup_pcpu_embed(size_t static_size) { - unsigned int cpu; - size_t dyn_size; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; /* * If large page isn't supported, there's no benefit in doing @@ -291,32 +273,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) if (!cpu_has_pse || pcpu_need_numa()) return -EINVAL; - /* allocate and copy */ - pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE); - if (!pcpue_ptr) - return -ENOMEM; - - for_each_possible_cpu(cpu) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); - } - - /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); + return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index fb455dcc59c7..ee5615d65211 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -111,6 +111,10 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); +extern ssize_t __init pcpu_embed_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, ssize_t unit_size); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index 2f94661d3e36..1aa5d8fbca12 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1238,3 +1238,89 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); return pcpu_unit_size; } + +/* + * Embedding first chunk setup helper. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpue_size) + return NULL; + + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); +} + +/** + * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. 
+ * + * If this function is used to setup the first chunk, it is allocated + * as a contiguous area using bootmem allocator and used as-is without + * being mapped into vmalloc area. This enables the first chunk to + * piggy back on the linear physical mapping which often uses larger + * page size. + * + * When @dyn_size is positive, dynamic area might be larger than + * specified to fill page alignment. Also, when @dyn_size is auto, + * @dyn_size does not fill the whole first chunk but only what's + * necessary for page alignment after static and reserved areas. + * + * If the needed size is smaller than the minimum or specified unit + * size, the leftover is returned to the bootmem allocator. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, ssize_t unit_size) +{ + unsigned int cpu; + + /* determine parameters and allocate */ + pcpue_size = PFN_ALIGN(static_size + reserved_size + + (dyn_size >= 0 ? dyn_size : 0)); + if (dyn_size != 0) + dyn_size = pcpue_size - static_size - reserved_size; + + if (unit_size >= 0) { + BUG_ON(unit_size < pcpue_size); + pcpue_unit_size = unit_size; + } else + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + + pcpue_ptr = __alloc_bootmem_nopanic( + num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!pcpue_ptr) + return -ENOMEM; + + /* return the leftover and copy */ + for_each_possible_cpu(cpu) { + void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + reserved_size, dyn_size, + pcpue_unit_size, pcpue_ptr, NULL); +} -- cgit v1.2.3-71-gd317 From 30a8fecc2d34f086df34fe2f2b926f080e002600 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 12:41:38 -0400 Subject: tracing: flip the TP_printk and TP_fast_assign in the TRACE_EVENT macro Impact: clean up In trying to stay consistant with the C style format in the TRACE_EVENT macro, it makes more sense to do the printk after the assigning of the variables. 
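With the arguments flipped, a TRACE_EVENT() definition reads in the order the data actually flows: declare the record, assign it, then print it. A minimal hypothetical event written against the new (tstruct, assign, print) ordering could look like the following; the mysys_foo name and its fields are invented purely for illustration:

TRACE_EVENT(mysys_foo,

	TP_PROTO(int id, unsigned long len),

	TP_ARGS(id, len),

	TP_STRUCT__entry(
		__field(	int,		id	)
		__field(	unsigned long,	len	)
	),

	TP_fast_assign(
		__entry->id  = id;
		__entry->len = len;
	),

	TP_printk("id=%d len=%lu", __entry->id, __entry->len)
);

The instrumentation site is unchanged: the probe is still placed with trace_mysys_foo(id, len); only the order of the macro arguments in the definition moves.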
Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- include/trace/irq_event_types.h | 8 +-- include/trace/sched_event_types.h | 102 ++++++++++++++++++------------------ kernel/trace/trace_events_stage_1.h | 2 +- kernel/trace/trace_events_stage_2.h | 4 +- kernel/trace/trace_events_stage_3.h | 2 +- 6 files changed, 60 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 69b56988813d..c7b09452514b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,7 +157,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT(name, proto, args, struct, print, assign) \ +#define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 43bcb74dd49f..214bb928fe9e 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -31,13 +31,13 @@ TRACE_EVENT(irq_handler_exit, __field( int, ret ) ), - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? "handled" : "unhandled"), - TP_fast_assign( __entry->irq = irq; __entry->ret = ret; - ) + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? "handled" : "unhandled") ); #undef TRACE_SYSTEM diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index fb37af672c88..63547dc1125f 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -22,12 +22,12 @@ TRACE_EVENT(sched_kthread_stop, __field( pid_t, pid ) ), - TP_printk("task %s:%d", __entry->comm, __entry->pid), - TP_fast_assign( memcpy(__entry->comm, t->comm, TASK_COMM_LEN); __entry->pid = t->pid; - ) + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) ); /* @@ -43,11 +43,11 @@ TRACE_EVENT(sched_kthread_stop_ret, __field( int, ret ) ), - TP_printk("ret %d", __entry->ret), - TP_fast_assign( __entry->ret = ret; - ) + ), + + TP_printk("ret %d", __entry->ret) ); /* @@ -68,14 +68,14 @@ TRACE_EVENT(sched_wait_task, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -97,16 +97,16 @@ TRACE_EVENT(sched_wakeup, __field( int, success ) ), - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; - ) + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) ); /* @@ -128,16 +128,16 @@ TRACE_EVENT(sched_wakeup_new, __field( int, success ) ), - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; - ) + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) ); /* @@ -162,10 +162,6 @@ TRACE_EVENT(sched_switch, __field( int, next_prio ) ), - TP_printk("task %s:%d 
[%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio), - TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; @@ -173,7 +169,11 @@ TRACE_EVENT(sched_switch, memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; - ) + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) ); /* @@ -193,17 +193,17 @@ TRACE_EVENT(sched_migrate_task, __field( int, dest_cpu ) ), - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->orig_cpu = orig_cpu; __entry->dest_cpu = dest_cpu; - ) + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) ); /* @@ -221,14 +221,14 @@ TRACE_EVENT(sched_process_free, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -246,14 +246,14 @@ TRACE_EVENT(sched_process_exit, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -271,14 +271,14 @@ TRACE_EVENT(sched_process_wait, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); __entry->prio = current->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -297,16 +297,16 @@ TRACE_EVENT(sched_process_fork, __field( pid_t, child_pid ) ), - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid), - TP_fast_assign( memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); __entry->parent_pid = parent->pid; memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); __entry->child_pid = child->pid; - ) + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid) ); /* @@ -324,14 +324,14 @@ TRACE_EVENT(sched_signal_send, __field( pid_t, pid ) ), - TP_printk("sig: %d task %s:%d", - __entry->sig, __entry->comm, __entry->pid), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->sig = sig; - ) + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) ); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 15e9bf965a18..82f68443c556 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -27,7 +27,7 @@ #define TP_STRUCT__entry(args...) 
args #undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ struct ftrace_raw_##name { \ struct trace_entry ent; \ tstruct \ diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index d91bf4c56661..1ad9f8d2fe45 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -39,7 +39,7 @@ #define TP_printk(fmt, args...) fmt "\n", args #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ @@ -115,7 +115,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #define TP_fast_assign(args...) args #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, func) \ +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ ftrace_format_##call(struct trace_seq *s) \ { \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 3ba55d4ab073..d6de06b9201a 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -148,7 +148,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define __entry entry #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ static struct ftrace_event_call event_##call; \ \ -- cgit v1.2.3-71-gd317 From 823f9124fb2e33eeb624d139978a52089f8a02ae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 12:58:51 -0400 Subject: tracing: document TRACE_EVENT macro in tracepoint.h Impact: clean up / comments Kosaki Motohiro asked about an explanation to the TRACE_EVENT macro. Ingo Molnar replied with a nice description. This patch takes the description that Ingo wrote (with some slight modifications) and adds it to the tracepoint.h file. Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 103 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c7b09452514b..119ece224c21 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,6 +157,109 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + +/* + * For use with the TRACE_EVENT macro: + * + * We define a tracepoint, its arguments, its printk format + * and its 'fast binay record' layout. + * + * Firstly, name your tracepoint via TRACE_EVENT(name : the + * 'subsystem_event' notation is fine. + * + * Think about this whole construct as the + * 'trace_sched_switch() function' from now on. + * + * + * TRACE_EVENT(sched_switch, + * + * * + * * A function has a regular function arguments + * * prototype, declare it via TP_PROTO(): + * * + * + * TP_PROTO(struct rq *rq, struct task_struct *prev, + * struct task_struct *next), + * + * * + * * Define the call signature of the 'function'. + * * (Design sidenote: we use this instead of a + * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.) + * * + * + * TP_ARGS(rq, prev, next), + * + * * + * * Fast binary tracing: define the trace record via + * * TP_STRUCT__entry(). 
You can think about it like a + * * regular C structure local variable definition. + * * + * * This is how the trace record is structured and will + * * be saved into the ring buffer. These are the fields + * * that will be exposed to user-space in + * * /debug/tracing/events/<*>/format. + * * + * * The declared 'local variable' is called '__entry' + * * + * * __field(pid_t, prev_prid) is equivalent to a standard declariton: + * * + * * pid_t prev_pid; + * * + * * __array(char, prev_comm, TASK_COMM_LEN) is equivalent to: + * * + * * char prev_comm[TASK_COMM_LEN]; + * * + * + * TP_STRUCT__entry( + * __array( char, prev_comm, TASK_COMM_LEN ) + * __field( pid_t, prev_pid ) + * __field( int, prev_prio ) + * __array( char, next_comm, TASK_COMM_LEN ) + * __field( pid_t, next_pid ) + * __field( int, next_prio ) + * ), + * + * * + * * Assign the entry into the trace record, by embedding + * * a full C statement block into TP_fast_assign(). You + * * can refer to the trace record as '__entry' - + * * otherwise you can put arbitrary C code in here. + * * + * * Note: this C code will execute every time a trace event + * * happens, on an active tracepoint. + * * + * + * TP_fast_assign( + * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + * __entry->prev_pid = prev->pid; + * __entry->prev_prio = prev->prio; + * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + * __entry->next_pid = next->pid; + * __entry->next_prio = next->prio; + * ) + * + * * + * * Formatted output of a trace record via TP_printk(). + * * This is how the tracepoint will appear under ftrace + * * plugins that make use of this tracepoint. + * * + * * (raw-binary tracing wont actually perform this step.) + * * + * + * TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + * __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + * __entry->next_comm, __entry->next_pid, __entry->next_prio), + * + * ); + * + * This macro construct is thus used for the regular printk format + * tracing setup, it is used to construct a function pointer based + * tracepoint callback (this is used by programmatic plugins and + * can also by used by generic instrumentation like SystemTap), and + * it is also used to expose a structured trace record in + * /debug/tracing/events/. + */ + #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -- cgit v1.2.3-71-gd317 From ef18012b248b47ec9a12c3a83ca5e99782d39c5d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 14:10:56 -0400 Subject: tracing: remove funky whitespace in the trace code Impact: clean up There existed a lot of 's in the tracing code. This patch removes them. 
Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 16 +++--- kernel/trace/blktrace.c | 10 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_events_stage_3.h | 98 ++++++++++++++++++------------------ kernel/trace/trace_export.c | 2 +- kernel/trace/trace_functions_graph.c | 6 +-- kernel/trace/trace_output.c | 14 +++--- kernel/trace/trace_workqueue.c | 6 +-- 9 files changed, 78 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 119ece224c21..d35a7ee7611f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -178,8 +178,8 @@ static inline void tracepoint_synchronize_unregister(void) * * prototype, declare it via TP_PROTO(): * * * - * TP_PROTO(struct rq *rq, struct task_struct *prev, - * struct task_struct *next), + * TP_PROTO(struct rq *rq, struct task_struct *prev, + * struct task_struct *next), * * * * * Define the call signature of the 'function'. @@ -187,7 +187,7 @@ static inline void tracepoint_synchronize_unregister(void) * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.) * * * - * TP_ARGS(rq, prev, next), + * TP_ARGS(rq, prev, next), * * * * * Fast binary tracing: define the trace record via @@ -229,13 +229,13 @@ static inline void tracepoint_synchronize_unregister(void) * * happens, on an active tracepoint. * * * - * TP_fast_assign( - * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - * __entry->prev_pid = prev->pid; - * __entry->prev_prio = prev->prio; + * TP_fast_assign( + * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + * __entry->prev_pid = prev->pid; + * __entry->prev_prio = prev->prio; * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); * __entry->next_pid = next->pid; - * __entry->next_prio = next->prio; + * __entry->next_prio = next->prio; * ) * * * diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e39679a72a3b..bec69d3678c1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -33,7 +33,7 @@ static struct trace_array *blk_tr; static int __read_mostly blk_tracer_enabled; /* Select an alternative, minimalistic output than the original one */ -#define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CLASSIC 0x1 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ @@ -564,7 +564,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop); /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device - * @cmd: the ioctl cmd + * @cmd: the ioctl cmd * @arg: the argument data, if any * **/ @@ -1128,9 +1128,9 @@ static void blk_tracer_reset(struct trace_array *tr) static struct { const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); + int (*print)(struct trace_seq *s, const struct trace_entry *ent); } what2act[] __read_mostly = { - [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, @@ -1229,7 +1229,7 @@ static struct tracer blk_tracer __read_mostly = { }; static struct trace_event trace_blk_event = { - .type = TRACE_BLK, + .type = TRACE_BLK, .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cc94f8642485..8c6a902db40a 100644 --- a/kernel/trace/trace.c +++ 
b/kernel/trace/trace.c @@ -799,7 +799,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index aaa0755268b9..ad8c22efff41 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -157,7 +157,7 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, static struct trace_event trace_branch_event = { - .type = TRACE_BRANCH, + .type = TRACE_BRANCH, .trace = trace_branch_print, }; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 6ee1de59f19d..ae2e323df0c7 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -5,23 +5,23 @@ * * static void ftrace_event_(proto) * { - * event_trace_printk(_RET_IP_, ": " ); + * event_trace_printk(_RET_IP_, ": " ); * } * * static int ftrace_reg_event_(void) * { - * int ret; + * int ret; * - * ret = register_trace_(ftrace_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; + * ret = register_trace_(ftrace_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; * } * * static void ftrace_unreg_event_(void) * { - * unregister_trace_(ftrace_event_); + * unregister_trace_(ftrace_event_); * } * * For those macros defined with TRACE_FORMAT: @@ -29,9 +29,9 @@ * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, + * .name = "", + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, * } * * @@ -41,66 +41,66 @@ * * static void ftrace_raw_event_(proto) * { - * struct ring_buffer_event *event; - * struct ftrace_raw_ *entry; <-- defined in stage 1 - * unsigned long irq_flags; - * int pc; - * - * local_save_flags(irq_flags); - * pc = preempt_count(); - * - * event = trace_current_buffer_lock_reserve(event_.id, - * sizeof(struct ftrace_raw_), - * irq_flags, pc); - * if (!event) - * return; - * entry = ring_buffer_event_data(event); - * - * ; <-- Here we assign the entries by the __field and + * struct ring_buffer_event *event; + * struct ftrace_raw_ *entry; <-- defined in stage 1 + * unsigned long irq_flags; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * event = trace_current_buffer_lock_reserve(event_.id, + * sizeof(struct ftrace_raw_), + * irq_flags, pc); + * if (!event) + * return; + * entry = ring_buffer_event_data(event); + * + * ; <-- Here we assign the entries by the __field and * __array macros. 
* - * trace_current_buffer_unlock_commit(event, irq_flags, pc); + * trace_current_buffer_unlock_commit(event, irq_flags, pc); * } * * static int ftrace_raw_reg_event_(void) * { - * int ret; + * int ret; * - * ret = register_trace_(ftrace_raw_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; + * ret = register_trace_(ftrace_raw_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; * } * * static void ftrace_unreg_event_(void) * { - * unregister_trace_(ftrace_raw_event_); + * unregister_trace_(ftrace_raw_event_); * } * * static struct trace_event ftrace_event_type_ = { - * .trace = ftrace_raw_output_, <-- stage 2 + * .trace = ftrace_raw_output_, <-- stage 2 * }; * * static int ftrace_raw_init_event_(void) * { - * int id; + * int id; * - * id = register_ftrace_event(&ftrace_event_type_); - * if (!id) - * return -ENODEV; - * event_.id = id; - * return 0; + * id = register_ftrace_event(&ftrace_event_type_); + * if (!id) + * return -ENODEV; + * event_.id = id; + * return 0; * } * * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", + * .name = "", * .system = "", - * .raw_init = ftrace_raw_init_event_, - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, + * .raw_init = ftrace_raw_init_event_, + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, * .show_format = ftrace_format_, * } * @@ -138,7 +138,7 @@ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ @@ -163,7 +163,7 @@ static void ftrace_raw_event_##call(proto) \ pc = preempt_count(); \ \ event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ + sizeof(struct ftrace_raw_##call), \ irq_flags, pc); \ if (!event) \ return; \ @@ -208,7 +208,7 @@ static int ftrace_raw_init_event_##call(void) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event_##call, \ .regfunc = ftrace_raw_reg_event_##call, \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 23ae78430d58..4d9952d3df50 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -94,7 +94,7 @@ ftrace_format_##call(struct trace_seq *s) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ .show_format = ftrace_format_##call, \ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 453ebd3b636e..d1493b853e41 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -841,12 +841,12 @@ static void graph_trace_close(struct trace_iterator *iter) } static struct tracer graph_trace __read_mostly = { - .name = "function_graph", + .name = "function_graph", .open = graph_trace_open, .close = graph_trace_close, .wait_pipe 
= poll_wait_pipe, - .init = graph_trace_init, - .reset = graph_trace_reset, + .init = graph_trace_init, + .reset = graph_trace_reset, .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ef8fd661b217..491832af9ba1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -565,7 +565,7 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) } static struct trace_event trace_fn_event = { - .type = TRACE_FN, + .type = TRACE_FN, .trace = trace_fn_trace, .raw = trace_fn_raw, .hex = trace_fn_hex, @@ -696,7 +696,7 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, } static struct trace_event trace_ctx_event = { - .type = TRACE_CTX, + .type = TRACE_CTX, .trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, @@ -704,7 +704,7 @@ static struct trace_event trace_ctx_event = { }; static struct trace_event trace_wake_event = { - .type = TRACE_WAKE, + .type = TRACE_WAKE, .trace = trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, @@ -759,7 +759,7 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter, } static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, + .type = TRACE_SPECIAL, .trace = trace_special_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -796,7 +796,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, } static struct trace_event trace_stack_event = { - .type = TRACE_STACK, + .type = TRACE_STACK, .trace = trace_stack_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -825,7 +825,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, } static struct trace_event trace_user_stack_event = { - .type = TRACE_USER_STACK, + .type = TRACE_USER_STACK, .trace = trace_user_stack_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -879,7 +879,7 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) static struct trace_event trace_print_event = { - .type = TRACE_PRINT, + .type = TRACE_PRINT, .trace = trace_print_print, .raw = trace_print_raw, }; diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 4664990fe9c5..e542483df623 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -19,14 +19,14 @@ struct cpu_workqueue_stats { /* Useful to know if we print the cpu headers */ bool first_entry; int cpu; - pid_t pid; + pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ - atomic_t inserted; + atomic_t inserted; /* * Don't need to be atomic, works are serialized in a single workqueue thread * on a single CPU. */ - unsigned int executed; + unsigned int executed; }; /* List of workqueue threads on one cpu */ -- cgit v1.2.3-71-gd317 From ae46141ff08f1965b17c531b571953c39ce8b9e2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:18 -0400 Subject: NFSv3: Fix posix ACL code Fix a memory leak due to allocation in the XDR layer. In cases where the RPC call needs to be retransmitted, we end up allocating new pages without clearing the old ones. Fix this by moving the allocation into nfs3_proc_setacls(). Also fix an issue discovered by Kevin Rudd, whereby the amount of memory reserved for the acls in the xdr_buf->head was miscalculated, and causing corruption. 
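The heart of the fix is computing the encoded ACL length up front with nfsacl_size() and allocating extra pages only when that length no longer fits in the inline XDR buffer. A rough, illustrative-only sketch of the page-count calculation, with a 4k page and the inline buffer size hard-coded to mirror the new NFS_ACL_INLINE_BUFSIZE definition:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT		12				/* 4k pages, for illustration */
#define NFS_ACL_INLINE_BUFSIZE	((2 * (2 + 3 * 5)) << 2)	/* mirrors the new define */

/* Pages needed once the encoded ACLs spill past the inline buffer. */
static unsigned int acl_pages_needed(size_t len)
{
	if (len <= NFS_ACL_INLINE_BUFSIZE)
		return 0;
	return 1 + ((len - 1) >> PAGE_SHIFT);
}

int main(void)
{
	printf("%u\n", acl_pages_needed(100));		/* fits inline: 0 pages */
	printf("%u\n", acl_pages_needed(5000));		/* spills over: 2 pages */
	return 0;
}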
Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 27 +++++++++++++++++++++------ fs/nfs/nfs3xdr.c | 34 +++++++++++++--------------------- include/linux/nfs_xdr.h | 2 ++ include/linux/nfsacl.h | 3 +++ 4 files changed, 39 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index cef62557c87d..6bbf0e6daad2 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr fattr; - struct page *pages[NFSACL_MAXPAGES] = { }; + struct page *pages[NFSACL_MAXPAGES]; struct nfs3_setaclargs args = { .inode = inode, .mask = NFS_ACL, @@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status, count; + int status; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, if (S_ISDIR(inode->i_mode)) { args.mask |= NFS_DFACL; args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); } dprintk("NFS call setacl\n"); @@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); - /* pages may have been allocated at the xdr layer. */ - for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) - __free_page(args.pages[count]); - switch (status) { case 0: status = nfs_refresh_inode(inode, &fattr); @@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, case -ENOTSUPP: status = -EOPNOTSUPP; } +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } out: return status; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 11cdddec1432..6cdeacffde46 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -82,8 +82,10 @@ #define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) #define ACL3_getaclargs_sz (NFS3_fh_sz+1) -#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) -#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) /* @@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_setaclargs *args) { struct xdr_buf *buf = &req->rq_snd_buf; - unsigned int base, len_in_head, len = nfsacl_size( - (args->mask & NFS_ACL) ? args->acl_access : NULL, - (args->mask & NFS_DFACL) ? args->acl_default : NULL); - int count, err; + unsigned int base; + int err; p = xdr_encode_fhandle(p, NFS_FH(args->inode)); *p++ = htonl(args->mask); - base = (char *)p - (char *)buf->head->iov_base; - /* put as much of the acls into head as possible. 
*/ - len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); - len -= len_in_head; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); - - for (count = 0; (count << PAGE_SHIFT) < len; count++) { - args->pages[count] = alloc_page(GFP_KERNEL); - if (!args->pages[count]) { - while (count) - __free_page(args->pages[--count]); - return -ENOMEM; - } - } - xdr_encode_pages(buf, args->pages, 0, len); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + base = req->rq_slen; + + if (args->npages != 0) + xdr_encode_pages(buf, args->pages, 0, args->len); + else + req->rq_slen += args->len; err = nfsacl_encode(buf, base, args->inode, (args->mask & NFS_ACL) ? diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index a550b528319f..2e5f00066afd 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -406,6 +406,8 @@ struct nfs3_setaclargs { int mask; struct posix_acl * acl_access; struct posix_acl * acl_default; + size_t len; + unsigned int npages; struct page ** pages; }; diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h index 54487a99beb8..43011b69297c 100644 --- a/include/linux/nfsacl.h +++ b/include/linux/nfsacl.h @@ -37,6 +37,9 @@ #define NFSACL_MAXPAGES ((2*(8+12*NFS_ACL_MAX_ENTRIES) + PAGE_SIZE-1) \ >> PAGE_SHIFT) +#define NFS_ACL_MAX_ENTRIES_INLINE (5) +#define NFS_ACL_INLINE_BUFSIZE ((2*(2+3*NFS_ACL_MAX_ENTRIES_INLINE)) << 2) + static inline unsigned int nfsacl_size(struct posix_acl *acl_access, struct posix_acl *acl_default) { -- cgit v1.2.3-71-gd317 From 78851e1aa4c3b796d5f0bb11b445016726302b44 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Mar 2009 20:33:19 -0400 Subject: NLM: Shrink the IPv4-only version of nlm_cmp_addr() Clean up/micro-optimatization: Make the AF_INET-only version of nlm_cmp_addr() smaller. This matches the style of nlm_privileged_requester(), and makes the AF_INET-only version of nlm_cmp_addr() nearly the same size as it was before IPv6 support. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/lockd/lockd.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index aa6fe7026de7..51855dfd8adb 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -346,6 +346,7 @@ static inline int __nlm_cmp_addr4(const struct sockaddr *sap1, return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { @@ -353,6 +354,13 @@ static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); } +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + return 0; +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ /* * Compare two host addresses -- cgit v1.2.3-71-gd317 From aaf1e176fa9a96fe1eea33b710684bba066aedc1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Mar 2009 10:55:15 +0000 Subject: ASoC: Add initial driver for the WM8400 CODEC The WM8400 is a highly integrated audio CODEC and power management unit intended for mobile multimedia application. 
This driver supports the primary audio CODEC features, including: - 1W speaker driver - Fully differential headphone output - Up to 4 differential microphone inputs Signed-off-by: Mark Brown --- include/linux/mfd/wm8400-audio.h | 1 + sound/soc/codecs/Kconfig | 4 + sound/soc/codecs/Makefile | 2 + sound/soc/codecs/wm8400.c | 1479 ++++++++++++++++++++++++++++++++++++++ sound/soc/codecs/wm8400.h | 62 ++ 5 files changed, 1548 insertions(+) create mode 100644 sound/soc/codecs/wm8400.c create mode 100644 sound/soc/codecs/wm8400.h (limited to 'include/linux') diff --git a/include/linux/mfd/wm8400-audio.h b/include/linux/mfd/wm8400-audio.h index b6640e018046..e06ed3eb1d0a 100644 --- a/include/linux/mfd/wm8400-audio.h +++ b/include/linux/mfd/wm8400-audio.h @@ -1181,6 +1181,7 @@ #define WM8400_FLL_OUTDIV_SHIFT 0 /* FLL_OUTDIV - [2:0] */ #define WM8400_FLL_OUTDIV_WIDTH 3 /* FLL_OUTDIV - [2:0] */ +struct wm8400; void wm8400_reset_codec_reg_cache(struct wm8400 *wm8400); #endif diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index a1af311e7f06..b6c7f7a01cb0 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -26,6 +26,7 @@ config SND_SOC_ALL_CODECS select SND_SOC_UDA134X select SND_SOC_UDA1380 if I2C select SND_SOC_WM8350 if MFD_WM8350 + select SND_SOC_WM8400 if MFD_WM8400 select SND_SOC_WM8510 if SND_SOC_I2C_AND_SPI select SND_SOC_WM8580 if I2C select SND_SOC_WM8728 if SND_SOC_I2C_AND_SPI @@ -110,6 +111,9 @@ config SND_SOC_UDA1380 config SND_SOC_WM8350 tristate +config SND_SOC_WM8400 + tristate + config SND_SOC_WM8510 tristate diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index 4717c3c99040..030d2454725f 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -14,6 +14,7 @@ snd-soc-twl4030-objs := twl4030.o snd-soc-uda134x-objs := uda134x.o snd-soc-uda1380-objs := uda1380.o snd-soc-wm8350-objs := wm8350.o +snd-soc-wm8400-objs := wm8400.o snd-soc-wm8510-objs := wm8510.o snd-soc-wm8580-objs := wm8580.o snd-soc-wm8728-objs := wm8728.o @@ -44,6 +45,7 @@ obj-$(CONFIG_SND_SOC_TWL4030) += snd-soc-twl4030.o obj-$(CONFIG_SND_SOC_UDA134X) += snd-soc-uda134x.o obj-$(CONFIG_SND_SOC_UDA1380) += snd-soc-uda1380.o obj-$(CONFIG_SND_SOC_WM8350) += snd-soc-wm8350.o +obj-$(CONFIG_SND_SOC_WM8400) += snd-soc-wm8400.o obj-$(CONFIG_SND_SOC_WM8510) += snd-soc-wm8510.o obj-$(CONFIG_SND_SOC_WM8580) += snd-soc-wm8580.o obj-$(CONFIG_SND_SOC_WM8728) += snd-soc-wm8728.o diff --git a/sound/soc/codecs/wm8400.c b/sound/soc/codecs/wm8400.c new file mode 100644 index 000000000000..9cb73d9d023d --- /dev/null +++ b/sound/soc/codecs/wm8400.c @@ -0,0 +1,1479 @@ +/* + * wm8400.c -- WM8400 ALSA Soc Audio driver + * + * Copyright 2008, 2009 Wolfson Microelectronics PLC. + * Author: Mark Brown + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "wm8400.h" + +/* Fake register for internal state */ +#define WM8400_INTDRIVBITS (WM8400_REGISTER_COUNT + 1) +#define WM8400_INMIXL_PWR 0 +#define WM8400_AINLMUX_PWR 1 +#define WM8400_INMIXR_PWR 2 +#define WM8400_AINRMUX_PWR 3 + +static struct regulator_bulk_data power[] = { + { + .supply = "I2S1VDD", + }, + { + .supply = "I2S2VDD", + }, + { + .supply = "DCVDD", + }, + { + .supply = "FLLVDD", + }, + { + .supply = "HPVDD", + }, + { + .supply = "SPKVDD", + }, +}; + +/* codec private data */ +struct wm8400_priv { + struct snd_soc_codec codec; + struct wm8400 *wm8400; + u16 fake_register; + unsigned int sysclk; + unsigned int pcmclk; + struct work_struct work; +}; + +static inline unsigned int wm8400_read(struct snd_soc_codec *codec, + unsigned int reg) +{ + struct wm8400_priv *wm8400 = codec->private_data; + + if (reg == WM8400_INTDRIVBITS) + return wm8400->fake_register; + else + return wm8400_reg_read(wm8400->wm8400, reg); +} + +/* + * write to the wm8400 register space + */ +static int wm8400_write(struct snd_soc_codec *codec, unsigned int reg, + unsigned int value) +{ + struct wm8400_priv *wm8400 = codec->private_data; + + if (reg == WM8400_INTDRIVBITS) { + wm8400->fake_register = value; + return 0; + } else + return wm8400_set_bits(wm8400->wm8400, reg, 0xffff, value); +} + +static void wm8400_codec_reset(struct snd_soc_codec *codec) +{ + struct wm8400_priv *wm8400 = codec->private_data; + + wm8400_reset_codec_reg_cache(wm8400->wm8400); +} + +static const DECLARE_TLV_DB_LINEAR(rec_mix_tlv, -1500, 600); + +static const DECLARE_TLV_DB_LINEAR(in_pga_tlv, -1650, 3000); + +static const DECLARE_TLV_DB_LINEAR(out_mix_tlv, -2100, 0); + +static const DECLARE_TLV_DB_LINEAR(out_pga_tlv, -7300, 600); + +static const DECLARE_TLV_DB_LINEAR(out_omix_tlv, -600, 0); + +static const DECLARE_TLV_DB_LINEAR(out_dac_tlv, -7163, 0); + +static const DECLARE_TLV_DB_LINEAR(in_adc_tlv, -7163, 1763); + +static const DECLARE_TLV_DB_LINEAR(out_sidetone_tlv, -3600, 0); + +static int wm8400_outpga_put_volsw_vu(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + struct soc_mixer_control *mc = + (struct soc_mixer_control *)kcontrol->private_value; + int reg = mc->reg; + int ret; + u16 val; + + ret = snd_soc_put_volsw(kcontrol, ucontrol); + if (ret < 0) + return ret; + + /* now hit the volume update bits (always bit 8) */ + val = wm8400_read(codec, reg); + return wm8400_write(codec, reg, val | 0x0100); +} + +#define WM8400_OUTPGA_SINGLE_R_TLV(xname, reg, shift, max, invert, tlv_array) \ +{ .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = (xname), \ + .access = SNDRV_CTL_ELEM_ACCESS_TLV_READ |\ + SNDRV_CTL_ELEM_ACCESS_READWRITE,\ + .tlv.p = (tlv_array), \ + .info = snd_soc_info_volsw, \ + .get = snd_soc_get_volsw, .put = wm8400_outpga_put_volsw_vu, \ + .private_value = SOC_SINGLE_VALUE(reg, shift, max, invert) } + + +static const char *wm8400_digital_sidetone[] = + {"None", "Left ADC", "Right ADC", "Reserved"}; + +static const struct soc_enum wm8400_left_digital_sidetone_enum = +SOC_ENUM_SINGLE(WM8400_DIGITAL_SIDE_TONE, + WM8400_ADC_TO_DACL_SHIFT, 2, wm8400_digital_sidetone); + +static const struct soc_enum wm8400_right_digital_sidetone_enum = +SOC_ENUM_SINGLE(WM8400_DIGITAL_SIDE_TONE, + WM8400_ADC_TO_DACR_SHIFT, 2, wm8400_digital_sidetone); + +static 
const char *wm8400_adcmode[] = + {"Hi-fi mode", "Voice mode 1", "Voice mode 2", "Voice mode 3"}; + +static const struct soc_enum wm8400_right_adcmode_enum = +SOC_ENUM_SINGLE(WM8400_ADC_CTRL, WM8400_ADC_HPF_CUT_SHIFT, 3, wm8400_adcmode); + +static const struct snd_kcontrol_new wm8400_snd_controls[] = { +/* INMIXL */ +SOC_SINGLE("LIN12 PGA Boost", WM8400_INPUT_MIXER3, WM8400_L12MNBST_SHIFT, + 1, 0), +SOC_SINGLE("LIN34 PGA Boost", WM8400_INPUT_MIXER3, WM8400_L34MNBST_SHIFT, + 1, 0), +/* INMIXR */ +SOC_SINGLE("RIN12 PGA Boost", WM8400_INPUT_MIXER3, WM8400_R12MNBST_SHIFT, + 1, 0), +SOC_SINGLE("RIN34 PGA Boost", WM8400_INPUT_MIXER3, WM8400_R34MNBST_SHIFT, + 1, 0), + +/* LOMIX */ +SOC_SINGLE_TLV("LOMIX LIN3 Bypass Volume", WM8400_OUTPUT_MIXER3, + WM8400_LLI3LOVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("LOMIX RIN12 PGA Bypass Volume", WM8400_OUTPUT_MIXER3, + WM8400_LR12LOVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("LOMIX LIN12 PGA Bypass Volume", WM8400_OUTPUT_MIXER3, + WM8400_LL12LOVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("LOMIX RIN3 Bypass Volume", WM8400_OUTPUT_MIXER5, + WM8400_LRI3LOVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("LOMIX AINRMUX Bypass Volume", WM8400_OUTPUT_MIXER5, + WM8400_LRBLOVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("LOMIX AINLMUX Bypass Volume", WM8400_OUTPUT_MIXER5, + WM8400_LRBLOVOL_SHIFT, 7, 0, out_mix_tlv), + +/* ROMIX */ +SOC_SINGLE_TLV("ROMIX RIN3 Bypass Volume", WM8400_OUTPUT_MIXER4, + WM8400_RRI3ROVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("ROMIX LIN12 PGA Bypass Volume", WM8400_OUTPUT_MIXER4, + WM8400_RL12ROVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("ROMIX RIN12 PGA Bypass Volume", WM8400_OUTPUT_MIXER4, + WM8400_RR12ROVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("ROMIX LIN3 Bypass Volume", WM8400_OUTPUT_MIXER6, + WM8400_RLI3ROVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("ROMIX AINLMUX Bypass Volume", WM8400_OUTPUT_MIXER6, + WM8400_RLBROVOL_SHIFT, 7, 0, out_mix_tlv), +SOC_SINGLE_TLV("ROMIX AINRMUX Bypass Volume", WM8400_OUTPUT_MIXER6, + WM8400_RRBROVOL_SHIFT, 7, 0, out_mix_tlv), + +/* LOUT */ +WM8400_OUTPGA_SINGLE_R_TLV("LOUT Volume", WM8400_LEFT_OUTPUT_VOLUME, + WM8400_LOUTVOL_SHIFT, WM8400_LOUTVOL_MASK, 0, out_pga_tlv), +SOC_SINGLE("LOUT ZC", WM8400_LEFT_OUTPUT_VOLUME, WM8400_LOZC_SHIFT, 1, 0), + +/* ROUT */ +WM8400_OUTPGA_SINGLE_R_TLV("ROUT Volume", WM8400_RIGHT_OUTPUT_VOLUME, + WM8400_ROUTVOL_SHIFT, WM8400_ROUTVOL_MASK, 0, out_pga_tlv), +SOC_SINGLE("ROUT ZC", WM8400_RIGHT_OUTPUT_VOLUME, WM8400_ROZC_SHIFT, 1, 0), + +/* LOPGA */ +WM8400_OUTPGA_SINGLE_R_TLV("LOPGA Volume", WM8400_LEFT_OPGA_VOLUME, + WM8400_LOPGAVOL_SHIFT, WM8400_LOPGAVOL_MASK, 0, out_pga_tlv), +SOC_SINGLE("LOPGA ZC Switch", WM8400_LEFT_OPGA_VOLUME, + WM8400_LOPGAZC_SHIFT, 1, 0), + +/* ROPGA */ +WM8400_OUTPGA_SINGLE_R_TLV("ROPGA Volume", WM8400_RIGHT_OPGA_VOLUME, + WM8400_ROPGAVOL_SHIFT, WM8400_ROPGAVOL_MASK, 0, out_pga_tlv), +SOC_SINGLE("ROPGA ZC Switch", WM8400_RIGHT_OPGA_VOLUME, + WM8400_ROPGAZC_SHIFT, 1, 0), + +SOC_SINGLE("LON Mute Switch", WM8400_LINE_OUTPUTS_VOLUME, + WM8400_LONMUTE_SHIFT, 1, 0), +SOC_SINGLE("LOP Mute Switch", WM8400_LINE_OUTPUTS_VOLUME, + WM8400_LOPMUTE_SHIFT, 1, 0), +SOC_SINGLE("LOP Attenuation Switch", WM8400_LINE_OUTPUTS_VOLUME, + WM8400_LOATTN_SHIFT, 1, 0), +SOC_SINGLE("RON Mute Switch", WM8400_LINE_OUTPUTS_VOLUME, + WM8400_RONMUTE_SHIFT, 1, 0), +SOC_SINGLE("ROP Mute Switch", WM8400_LINE_OUTPUTS_VOLUME, + WM8400_ROPMUTE_SHIFT, 1, 0), +SOC_SINGLE("ROP Attenuation Switch", WM8400_LINE_OUTPUTS_VOLUME, + WM8400_ROATTN_SHIFT, 1, 0), + 
+SOC_SINGLE("OUT3 Mute Switch", WM8400_OUT3_4_VOLUME, + WM8400_OUT3MUTE_SHIFT, 1, 0), +SOC_SINGLE("OUT3 Attenuation Switch", WM8400_OUT3_4_VOLUME, + WM8400_OUT3ATTN_SHIFT, 1, 0), + +SOC_SINGLE("OUT4 Mute Switch", WM8400_OUT3_4_VOLUME, + WM8400_OUT4MUTE_SHIFT, 1, 0), +SOC_SINGLE("OUT4 Attenuation Switch", WM8400_OUT3_4_VOLUME, + WM8400_OUT4ATTN_SHIFT, 1, 0), + +SOC_SINGLE("Speaker Mode Switch", WM8400_CLASSD1, + WM8400_CDMODE_SHIFT, 1, 0), + +SOC_SINGLE("Speaker Output Attenuation Volume", WM8400_SPEAKER_VOLUME, + WM8400_SPKATTN_SHIFT, WM8400_SPKATTN_MASK, 0), +SOC_SINGLE("Speaker DC Boost Volume", WM8400_CLASSD3, + WM8400_DCGAIN_SHIFT, 6, 0), +SOC_SINGLE("Speaker AC Boost Volume", WM8400_CLASSD3, + WM8400_ACGAIN_SHIFT, 6, 0), + +WM8400_OUTPGA_SINGLE_R_TLV("Left DAC Digital Volume", + WM8400_LEFT_DAC_DIGITAL_VOLUME, WM8400_DACL_VOL_SHIFT, + 127, 0, out_dac_tlv), + +WM8400_OUTPGA_SINGLE_R_TLV("Right DAC Digital Volume", + WM8400_RIGHT_DAC_DIGITAL_VOLUME, WM8400_DACR_VOL_SHIFT, + 127, 0, out_dac_tlv), + +SOC_ENUM("Left Digital Sidetone", wm8400_left_digital_sidetone_enum), +SOC_ENUM("Right Digital Sidetone", wm8400_right_digital_sidetone_enum), + +SOC_SINGLE_TLV("Left Digital Sidetone Volume", WM8400_DIGITAL_SIDE_TONE, + WM8400_ADCL_DAC_SVOL_SHIFT, 15, 0, out_sidetone_tlv), +SOC_SINGLE_TLV("Right Digital Sidetone Volume", WM8400_DIGITAL_SIDE_TONE, + WM8400_ADCR_DAC_SVOL_SHIFT, 15, 0, out_sidetone_tlv), + +SOC_SINGLE("ADC Digital High Pass Filter Switch", WM8400_ADC_CTRL, + WM8400_ADC_HPF_ENA_SHIFT, 1, 0), + +SOC_ENUM("ADC HPF Mode", wm8400_right_adcmode_enum), + +WM8400_OUTPGA_SINGLE_R_TLV("Left ADC Digital Volume", + WM8400_LEFT_ADC_DIGITAL_VOLUME, + WM8400_ADCL_VOL_SHIFT, + WM8400_ADCL_VOL_MASK, + 0, + in_adc_tlv), + +WM8400_OUTPGA_SINGLE_R_TLV("Right ADC Digital Volume", + WM8400_RIGHT_ADC_DIGITAL_VOLUME, + WM8400_ADCR_VOL_SHIFT, + WM8400_ADCR_VOL_MASK, + 0, + in_adc_tlv), + +WM8400_OUTPGA_SINGLE_R_TLV("LIN12 Volume", + WM8400_LEFT_LINE_INPUT_1_2_VOLUME, + WM8400_LIN12VOL_SHIFT, + WM8400_LIN12VOL_MASK, + 0, + in_pga_tlv), + +SOC_SINGLE("LIN12 ZC Switch", WM8400_LEFT_LINE_INPUT_1_2_VOLUME, + WM8400_LI12ZC_SHIFT, 1, 0), + +SOC_SINGLE("LIN12 Mute Switch", WM8400_LEFT_LINE_INPUT_1_2_VOLUME, + WM8400_LI12MUTE_SHIFT, 1, 0), + +WM8400_OUTPGA_SINGLE_R_TLV("LIN34 Volume", + WM8400_LEFT_LINE_INPUT_3_4_VOLUME, + WM8400_LIN34VOL_SHIFT, + WM8400_LIN34VOL_MASK, + 0, + in_pga_tlv), + +SOC_SINGLE("LIN34 ZC Switch", WM8400_LEFT_LINE_INPUT_3_4_VOLUME, + WM8400_LI34ZC_SHIFT, 1, 0), + +SOC_SINGLE("LIN34 Mute Switch", WM8400_LEFT_LINE_INPUT_3_4_VOLUME, + WM8400_LI34MUTE_SHIFT, 1, 0), + +WM8400_OUTPGA_SINGLE_R_TLV("RIN12 Volume", + WM8400_RIGHT_LINE_INPUT_1_2_VOLUME, + WM8400_RIN12VOL_SHIFT, + WM8400_RIN12VOL_MASK, + 0, + in_pga_tlv), + +SOC_SINGLE("RIN12 ZC Switch", WM8400_RIGHT_LINE_INPUT_1_2_VOLUME, + WM8400_RI12ZC_SHIFT, 1, 0), + +SOC_SINGLE("RIN12 Mute Switch", WM8400_RIGHT_LINE_INPUT_1_2_VOLUME, + WM8400_RI12MUTE_SHIFT, 1, 0), + +WM8400_OUTPGA_SINGLE_R_TLV("RIN34 Volume", + WM8400_RIGHT_LINE_INPUT_3_4_VOLUME, + WM8400_RIN34VOL_SHIFT, + WM8400_RIN34VOL_MASK, + 0, + in_pga_tlv), + +SOC_SINGLE("RIN34 ZC Switch", WM8400_RIGHT_LINE_INPUT_3_4_VOLUME, + WM8400_RI34ZC_SHIFT, 1, 0), + +SOC_SINGLE("RIN34 Mute Switch", WM8400_RIGHT_LINE_INPUT_3_4_VOLUME, + WM8400_RI34MUTE_SHIFT, 1, 0), + +}; + +/* add non dapm controls */ +static int wm8400_add_controls(struct snd_soc_codec *codec) +{ + int err, i; + + for (i = 0; i < ARRAY_SIZE(wm8400_snd_controls); i++) { + err = snd_ctl_add(codec->card, + 
snd_soc_cnew(&wm8400_snd_controls[i],codec, + NULL)); + if (err < 0) + return err; + } + return 0; +} + +/* + * _DAPM_ Controls + */ + +static int inmixer_event (struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + u16 reg, fakepower; + + reg = wm8400_read(w->codec, WM8400_POWER_MANAGEMENT_2); + fakepower = wm8400_read(w->codec, WM8400_INTDRIVBITS); + + if (fakepower & ((1 << WM8400_INMIXL_PWR) | + (1 << WM8400_AINLMUX_PWR))) { + reg |= WM8400_AINL_ENA; + } else { + reg &= ~WM8400_AINL_ENA; + } + + if (fakepower & ((1 << WM8400_INMIXR_PWR) | + (1 << WM8400_AINRMUX_PWR))) { + reg |= WM8400_AINR_ENA; + } else { + reg &= ~WM8400_AINL_ENA; + } + wm8400_write(w->codec, WM8400_POWER_MANAGEMENT_2, reg); + + return 0; +} + +static int outmixer_event (struct snd_soc_dapm_widget *w, + struct snd_kcontrol * kcontrol, int event) +{ + struct soc_mixer_control *mc = + (struct soc_mixer_control *)kcontrol->private_value; + u32 reg_shift = mc->shift; + int ret = 0; + u16 reg; + + switch (reg_shift) { + case WM8400_SPEAKER_MIXER | (WM8400_LDSPK << 8) : + reg = wm8400_read(w->codec, WM8400_OUTPUT_MIXER1); + if (reg & WM8400_LDLO) { + printk(KERN_WARNING + "Cannot set as Output Mixer 1 LDLO Set\n"); + ret = -1; + } + break; + case WM8400_SPEAKER_MIXER | (WM8400_RDSPK << 8): + reg = wm8400_read(w->codec, WM8400_OUTPUT_MIXER2); + if (reg & WM8400_RDRO) { + printk(KERN_WARNING + "Cannot set as Output Mixer 2 RDRO Set\n"); + ret = -1; + } + break; + case WM8400_OUTPUT_MIXER1 | (WM8400_LDLO << 8): + reg = wm8400_read(w->codec, WM8400_SPEAKER_MIXER); + if (reg & WM8400_LDSPK) { + printk(KERN_WARNING + "Cannot set as Speaker Mixer LDSPK Set\n"); + ret = -1; + } + break; + case WM8400_OUTPUT_MIXER2 | (WM8400_RDRO << 8): + reg = wm8400_read(w->codec, WM8400_SPEAKER_MIXER); + if (reg & WM8400_RDSPK) { + printk(KERN_WARNING + "Cannot set as Speaker Mixer RDSPK Set\n"); + ret = -1; + } + break; + } + + return ret; +} + +/* INMIX dB values */ +static const unsigned int in_mix_tlv[] = { + TLV_DB_RANGE_HEAD(1), + 0,7, TLV_DB_LINEAR_ITEM(-1200, 600), +}; + +/* Left In PGA Connections */ +static const struct snd_kcontrol_new wm8400_dapm_lin12_pga_controls[] = { +SOC_DAPM_SINGLE("LIN1 Switch", WM8400_INPUT_MIXER2, WM8400_LMN1_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LIN2 Switch", WM8400_INPUT_MIXER2, WM8400_LMP2_SHIFT, 1, 0), +}; + +static const struct snd_kcontrol_new wm8400_dapm_lin34_pga_controls[] = { +SOC_DAPM_SINGLE("LIN3 Switch", WM8400_INPUT_MIXER2, WM8400_LMN3_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LIN4 Switch", WM8400_INPUT_MIXER2, WM8400_LMP4_SHIFT, 1, 0), +}; + +/* Right In PGA Connections */ +static const struct snd_kcontrol_new wm8400_dapm_rin12_pga_controls[] = { +SOC_DAPM_SINGLE("RIN1 Switch", WM8400_INPUT_MIXER2, WM8400_RMN1_SHIFT, 1, 0), +SOC_DAPM_SINGLE("RIN2 Switch", WM8400_INPUT_MIXER2, WM8400_RMP2_SHIFT, 1, 0), +}; + +static const struct snd_kcontrol_new wm8400_dapm_rin34_pga_controls[] = { +SOC_DAPM_SINGLE("RIN3 Switch", WM8400_INPUT_MIXER2, WM8400_RMN3_SHIFT, 1, 0), +SOC_DAPM_SINGLE("RIN4 Switch", WM8400_INPUT_MIXER2, WM8400_RMP4_SHIFT, 1, 0), +}; + +/* INMIXL */ +static const struct snd_kcontrol_new wm8400_dapm_inmixl_controls[] = { +SOC_DAPM_SINGLE_TLV("Record Left Volume", WM8400_INPUT_MIXER3, + WM8400_LDBVOL_SHIFT, WM8400_LDBVOL_MASK, 0, in_mix_tlv), +SOC_DAPM_SINGLE_TLV("LIN2 Volume", WM8400_INPUT_MIXER5, WM8400_LI2BVOL_SHIFT, + 7, 0, in_mix_tlv), +SOC_DAPM_SINGLE("LINPGA12 Switch", WM8400_INPUT_MIXER3, WM8400_L12MNB_SHIFT, + 1, 0), +SOC_DAPM_SINGLE("LINPGA34 Switch", 
WM8400_INPUT_MIXER3, WM8400_L34MNB_SHIFT, + 1, 0), +}; + +/* INMIXR */ +static const struct snd_kcontrol_new wm8400_dapm_inmixr_controls[] = { +SOC_DAPM_SINGLE_TLV("Record Right Volume", WM8400_INPUT_MIXER4, + WM8400_RDBVOL_SHIFT, WM8400_RDBVOL_MASK, 0, in_mix_tlv), +SOC_DAPM_SINGLE_TLV("RIN2 Volume", WM8400_INPUT_MIXER6, WM8400_RI2BVOL_SHIFT, + 7, 0, in_mix_tlv), +SOC_DAPM_SINGLE("RINPGA12 Switch", WM8400_INPUT_MIXER3, WM8400_L12MNB_SHIFT, + 1, 0), +SOC_DAPM_SINGLE("RINPGA34 Switch", WM8400_INPUT_MIXER3, WM8400_L34MNB_SHIFT, + 1, 0), +}; + +/* AINLMUX */ +static const char *wm8400_ainlmux[] = + {"INMIXL Mix", "RXVOICE Mix", "DIFFINL Mix"}; + +static const struct soc_enum wm8400_ainlmux_enum = +SOC_ENUM_SINGLE( WM8400_INPUT_MIXER1, WM8400_AINLMODE_SHIFT, + ARRAY_SIZE(wm8400_ainlmux), wm8400_ainlmux); + +static const struct snd_kcontrol_new wm8400_dapm_ainlmux_controls = +SOC_DAPM_ENUM("Route", wm8400_ainlmux_enum); + +/* DIFFINL */ + +/* AINRMUX */ +static const char *wm8400_ainrmux[] = + {"INMIXR Mix", "RXVOICE Mix", "DIFFINR Mix"}; + +static const struct soc_enum wm8400_ainrmux_enum = +SOC_ENUM_SINGLE( WM8400_INPUT_MIXER1, WM8400_AINRMODE_SHIFT, + ARRAY_SIZE(wm8400_ainrmux), wm8400_ainrmux); + +static const struct snd_kcontrol_new wm8400_dapm_ainrmux_controls = +SOC_DAPM_ENUM("Route", wm8400_ainrmux_enum); + +/* RXVOICE */ +static const struct snd_kcontrol_new wm8400_dapm_rxvoice_controls[] = { +SOC_DAPM_SINGLE_TLV("LIN4/RXN", WM8400_INPUT_MIXER5, WM8400_LR4BVOL_SHIFT, + WM8400_LR4BVOL_MASK, 0, in_mix_tlv), +SOC_DAPM_SINGLE_TLV("RIN4/RXP", WM8400_INPUT_MIXER6, WM8400_RL4BVOL_SHIFT, + WM8400_RL4BVOL_MASK, 0, in_mix_tlv), +}; + +/* LOMIX */ +static const struct snd_kcontrol_new wm8400_dapm_lomix_controls[] = { +SOC_DAPM_SINGLE("LOMIX Right ADC Bypass Switch", WM8400_OUTPUT_MIXER1, + WM8400_LRBLO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LOMIX Left ADC Bypass Switch", WM8400_OUTPUT_MIXER1, + WM8400_LLBLO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LOMIX RIN3 Bypass Switch", WM8400_OUTPUT_MIXER1, + WM8400_LRI3LO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LOMIX LIN3 Bypass Switch", WM8400_OUTPUT_MIXER1, + WM8400_LLI3LO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LOMIX RIN12 PGA Bypass Switch", WM8400_OUTPUT_MIXER1, + WM8400_LR12LO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LOMIX LIN12 PGA Bypass Switch", WM8400_OUTPUT_MIXER1, + WM8400_LL12LO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LOMIX Left DAC Switch", WM8400_OUTPUT_MIXER1, + WM8400_LDLO_SHIFT, 1, 0), +}; + +/* ROMIX */ +static const struct snd_kcontrol_new wm8400_dapm_romix_controls[] = { +SOC_DAPM_SINGLE("ROMIX Left ADC Bypass Switch", WM8400_OUTPUT_MIXER2, + WM8400_RLBRO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("ROMIX Right ADC Bypass Switch", WM8400_OUTPUT_MIXER2, + WM8400_RRBRO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("ROMIX LIN3 Bypass Switch", WM8400_OUTPUT_MIXER2, + WM8400_RLI3RO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("ROMIX RIN3 Bypass Switch", WM8400_OUTPUT_MIXER2, + WM8400_RRI3RO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("ROMIX LIN12 PGA Bypass Switch", WM8400_OUTPUT_MIXER2, + WM8400_RL12RO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("ROMIX RIN12 PGA Bypass Switch", WM8400_OUTPUT_MIXER2, + WM8400_RR12RO_SHIFT, 1, 0), +SOC_DAPM_SINGLE("ROMIX Right DAC Switch", WM8400_OUTPUT_MIXER2, + WM8400_RDRO_SHIFT, 1, 0), +}; + +/* LONMIX */ +static const struct snd_kcontrol_new wm8400_dapm_lonmix_controls[] = { +SOC_DAPM_SINGLE("LONMIX Left Mixer PGA Switch", WM8400_LINE_MIXER1, + WM8400_LLOPGALON_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LONMIX Right Mixer PGA Switch", WM8400_LINE_MIXER1, + WM8400_LROPGALON_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LONMIX Inverted LOP 
Switch", WM8400_LINE_MIXER1, + WM8400_LOPLON_SHIFT, 1, 0), +}; + +/* LOPMIX */ +static const struct snd_kcontrol_new wm8400_dapm_lopmix_controls[] = { +SOC_DAPM_SINGLE("LOPMIX Right Mic Bypass Switch", WM8400_LINE_MIXER1, + WM8400_LR12LOP_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LOPMIX Left Mic Bypass Switch", WM8400_LINE_MIXER1, + WM8400_LL12LOP_SHIFT, 1, 0), +SOC_DAPM_SINGLE("LOPMIX Left Mixer PGA Switch", WM8400_LINE_MIXER1, + WM8400_LLOPGALOP_SHIFT, 1, 0), +}; + +/* RONMIX */ +static const struct snd_kcontrol_new wm8400_dapm_ronmix_controls[] = { +SOC_DAPM_SINGLE("RONMIX Right Mixer PGA Switch", WM8400_LINE_MIXER2, + WM8400_RROPGARON_SHIFT, 1, 0), +SOC_DAPM_SINGLE("RONMIX Left Mixer PGA Switch", WM8400_LINE_MIXER2, + WM8400_RLOPGARON_SHIFT, 1, 0), +SOC_DAPM_SINGLE("RONMIX Inverted ROP Switch", WM8400_LINE_MIXER2, + WM8400_ROPRON_SHIFT, 1, 0), +}; + +/* ROPMIX */ +static const struct snd_kcontrol_new wm8400_dapm_ropmix_controls[] = { +SOC_DAPM_SINGLE("ROPMIX Left Mic Bypass Switch", WM8400_LINE_MIXER2, + WM8400_RL12ROP_SHIFT, 1, 0), +SOC_DAPM_SINGLE("ROPMIX Right Mic Bypass Switch", WM8400_LINE_MIXER2, + WM8400_RR12ROP_SHIFT, 1, 0), +SOC_DAPM_SINGLE("ROPMIX Right Mixer PGA Switch", WM8400_LINE_MIXER2, + WM8400_RROPGAROP_SHIFT, 1, 0), +}; + +/* OUT3MIX */ +static const struct snd_kcontrol_new wm8400_dapm_out3mix_controls[] = { +SOC_DAPM_SINGLE("OUT3MIX LIN4/RXP Bypass Switch", WM8400_OUT3_4_MIXER, + WM8400_LI4O3_SHIFT, 1, 0), +SOC_DAPM_SINGLE("OUT3MIX Left Out PGA Switch", WM8400_OUT3_4_MIXER, + WM8400_LPGAO3_SHIFT, 1, 0), +}; + +/* OUT4MIX */ +static const struct snd_kcontrol_new wm8400_dapm_out4mix_controls[] = { +SOC_DAPM_SINGLE("OUT4MIX Right Out PGA Switch", WM8400_OUT3_4_MIXER, + WM8400_RPGAO4_SHIFT, 1, 0), +SOC_DAPM_SINGLE("OUT4MIX RIN4/RXP Bypass Switch", WM8400_OUT3_4_MIXER, + WM8400_RI4O4_SHIFT, 1, 0), +}; + +/* SPKMIX */ +static const struct snd_kcontrol_new wm8400_dapm_spkmix_controls[] = { +SOC_DAPM_SINGLE("SPKMIX LIN2 Bypass Switch", WM8400_SPEAKER_MIXER, + WM8400_LI2SPK_SHIFT, 1, 0), +SOC_DAPM_SINGLE("SPKMIX LADC Bypass Switch", WM8400_SPEAKER_MIXER, + WM8400_LB2SPK_SHIFT, 1, 0), +SOC_DAPM_SINGLE("SPKMIX Left Mixer PGA Switch", WM8400_SPEAKER_MIXER, + WM8400_LOPGASPK_SHIFT, 1, 0), +SOC_DAPM_SINGLE("SPKMIX Left DAC Switch", WM8400_SPEAKER_MIXER, + WM8400_LDSPK_SHIFT, 1, 0), +SOC_DAPM_SINGLE("SPKMIX Right DAC Switch", WM8400_SPEAKER_MIXER, + WM8400_RDSPK_SHIFT, 1, 0), +SOC_DAPM_SINGLE("SPKMIX Right Mixer PGA Switch", WM8400_SPEAKER_MIXER, + WM8400_ROPGASPK_SHIFT, 1, 0), +SOC_DAPM_SINGLE("SPKMIX RADC Bypass Switch", WM8400_SPEAKER_MIXER, + WM8400_RL12ROP_SHIFT, 1, 0), +SOC_DAPM_SINGLE("SPKMIX RIN2 Bypass Switch", WM8400_SPEAKER_MIXER, + WM8400_RI2SPK_SHIFT, 1, 0), +}; + +static const struct snd_soc_dapm_widget wm8400_dapm_widgets[] = { +/* Input Side */ +/* Input Lines */ +SND_SOC_DAPM_INPUT("LIN1"), +SND_SOC_DAPM_INPUT("LIN2"), +SND_SOC_DAPM_INPUT("LIN3"), +SND_SOC_DAPM_INPUT("LIN4/RXN"), +SND_SOC_DAPM_INPUT("RIN3"), +SND_SOC_DAPM_INPUT("RIN4/RXP"), +SND_SOC_DAPM_INPUT("RIN1"), +SND_SOC_DAPM_INPUT("RIN2"), +SND_SOC_DAPM_INPUT("Internal ADC Source"), + +/* DACs */ +SND_SOC_DAPM_ADC("Left ADC", "Left Capture", WM8400_POWER_MANAGEMENT_2, + WM8400_ADCL_ENA_SHIFT, 0), +SND_SOC_DAPM_ADC("Right ADC", "Right Capture", WM8400_POWER_MANAGEMENT_2, + WM8400_ADCR_ENA_SHIFT, 0), + +/* Input PGAs */ +SND_SOC_DAPM_MIXER("LIN12 PGA", WM8400_POWER_MANAGEMENT_2, + WM8400_LIN12_ENA_SHIFT, + 0, &wm8400_dapm_lin12_pga_controls[0], + ARRAY_SIZE(wm8400_dapm_lin12_pga_controls)), +SND_SOC_DAPM_MIXER("LIN34 
PGA", WM8400_POWER_MANAGEMENT_2, + WM8400_LIN34_ENA_SHIFT, + 0, &wm8400_dapm_lin34_pga_controls[0], + ARRAY_SIZE(wm8400_dapm_lin34_pga_controls)), +SND_SOC_DAPM_MIXER("RIN12 PGA", WM8400_POWER_MANAGEMENT_2, + WM8400_RIN12_ENA_SHIFT, + 0, &wm8400_dapm_rin12_pga_controls[0], + ARRAY_SIZE(wm8400_dapm_rin12_pga_controls)), +SND_SOC_DAPM_MIXER("RIN34 PGA", WM8400_POWER_MANAGEMENT_2, + WM8400_RIN34_ENA_SHIFT, + 0, &wm8400_dapm_rin34_pga_controls[0], + ARRAY_SIZE(wm8400_dapm_rin34_pga_controls)), + +/* INMIXL */ +SND_SOC_DAPM_MIXER_E("INMIXL", WM8400_INTDRIVBITS, WM8400_INMIXL_PWR, 0, + &wm8400_dapm_inmixl_controls[0], + ARRAY_SIZE(wm8400_dapm_inmixl_controls), + inmixer_event, SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + +/* AINLMUX */ +SND_SOC_DAPM_MUX_E("AILNMUX", WM8400_INTDRIVBITS, WM8400_AINLMUX_PWR, 0, + &wm8400_dapm_ainlmux_controls, inmixer_event, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + +/* INMIXR */ +SND_SOC_DAPM_MIXER_E("INMIXR", WM8400_INTDRIVBITS, WM8400_INMIXR_PWR, 0, + &wm8400_dapm_inmixr_controls[0], + ARRAY_SIZE(wm8400_dapm_inmixr_controls), + inmixer_event, SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + +/* AINRMUX */ +SND_SOC_DAPM_MUX_E("AIRNMUX", WM8400_INTDRIVBITS, WM8400_AINRMUX_PWR, 0, + &wm8400_dapm_ainrmux_controls, inmixer_event, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + +/* Output Side */ +/* DACs */ +SND_SOC_DAPM_DAC("Left DAC", "Left Playback", WM8400_POWER_MANAGEMENT_3, + WM8400_DACL_ENA_SHIFT, 0), +SND_SOC_DAPM_DAC("Right DAC", "Right Playback", WM8400_POWER_MANAGEMENT_3, + WM8400_DACR_ENA_SHIFT, 0), + +/* LOMIX */ +SND_SOC_DAPM_MIXER_E("LOMIX", WM8400_POWER_MANAGEMENT_3, + WM8400_LOMIX_ENA_SHIFT, + 0, &wm8400_dapm_lomix_controls[0], + ARRAY_SIZE(wm8400_dapm_lomix_controls), + outmixer_event, SND_SOC_DAPM_PRE_REG), + +/* LONMIX */ +SND_SOC_DAPM_MIXER("LONMIX", WM8400_POWER_MANAGEMENT_3, WM8400_LON_ENA_SHIFT, + 0, &wm8400_dapm_lonmix_controls[0], + ARRAY_SIZE(wm8400_dapm_lonmix_controls)), + +/* LOPMIX */ +SND_SOC_DAPM_MIXER("LOPMIX", WM8400_POWER_MANAGEMENT_3, WM8400_LOP_ENA_SHIFT, + 0, &wm8400_dapm_lopmix_controls[0], + ARRAY_SIZE(wm8400_dapm_lopmix_controls)), + +/* OUT3MIX */ +SND_SOC_DAPM_MIXER("OUT3MIX", WM8400_POWER_MANAGEMENT_1, WM8400_OUT3_ENA_SHIFT, + 0, &wm8400_dapm_out3mix_controls[0], + ARRAY_SIZE(wm8400_dapm_out3mix_controls)), + +/* SPKMIX */ +SND_SOC_DAPM_MIXER_E("SPKMIX", WM8400_POWER_MANAGEMENT_1, WM8400_SPK_ENA_SHIFT, + 0, &wm8400_dapm_spkmix_controls[0], + ARRAY_SIZE(wm8400_dapm_spkmix_controls), outmixer_event, + SND_SOC_DAPM_PRE_REG), + +/* OUT4MIX */ +SND_SOC_DAPM_MIXER("OUT4MIX", WM8400_POWER_MANAGEMENT_1, WM8400_OUT4_ENA_SHIFT, + 0, &wm8400_dapm_out4mix_controls[0], + ARRAY_SIZE(wm8400_dapm_out4mix_controls)), + +/* ROPMIX */ +SND_SOC_DAPM_MIXER("ROPMIX", WM8400_POWER_MANAGEMENT_3, WM8400_ROP_ENA_SHIFT, + 0, &wm8400_dapm_ropmix_controls[0], + ARRAY_SIZE(wm8400_dapm_ropmix_controls)), + +/* RONMIX */ +SND_SOC_DAPM_MIXER("RONMIX", WM8400_POWER_MANAGEMENT_3, WM8400_RON_ENA_SHIFT, + 0, &wm8400_dapm_ronmix_controls[0], + ARRAY_SIZE(wm8400_dapm_ronmix_controls)), + +/* ROMIX */ +SND_SOC_DAPM_MIXER_E("ROMIX", WM8400_POWER_MANAGEMENT_3, + WM8400_ROMIX_ENA_SHIFT, + 0, &wm8400_dapm_romix_controls[0], + ARRAY_SIZE(wm8400_dapm_romix_controls), + outmixer_event, SND_SOC_DAPM_PRE_REG), + +/* LOUT PGA */ +SND_SOC_DAPM_PGA("LOUT PGA", WM8400_POWER_MANAGEMENT_1, WM8400_LOUT_ENA_SHIFT, + 0, NULL, 0), + +/* ROUT PGA */ +SND_SOC_DAPM_PGA("ROUT PGA", WM8400_POWER_MANAGEMENT_1, WM8400_ROUT_ENA_SHIFT, + 0, NULL, 0), + +/* LOPGA */ 
+SND_SOC_DAPM_PGA("LOPGA", WM8400_POWER_MANAGEMENT_3, WM8400_LOPGA_ENA_SHIFT, 0, + NULL, 0), + +/* ROPGA */ +SND_SOC_DAPM_PGA("ROPGA", WM8400_POWER_MANAGEMENT_3, WM8400_ROPGA_ENA_SHIFT, 0, + NULL, 0), + +/* MICBIAS */ +SND_SOC_DAPM_MICBIAS("MICBIAS", WM8400_POWER_MANAGEMENT_1, + WM8400_MIC1BIAS_ENA_SHIFT, 0), + +SND_SOC_DAPM_OUTPUT("LON"), +SND_SOC_DAPM_OUTPUT("LOP"), +SND_SOC_DAPM_OUTPUT("OUT3"), +SND_SOC_DAPM_OUTPUT("LOUT"), +SND_SOC_DAPM_OUTPUT("SPKN"), +SND_SOC_DAPM_OUTPUT("SPKP"), +SND_SOC_DAPM_OUTPUT("ROUT"), +SND_SOC_DAPM_OUTPUT("OUT4"), +SND_SOC_DAPM_OUTPUT("ROP"), +SND_SOC_DAPM_OUTPUT("RON"), + +SND_SOC_DAPM_OUTPUT("Internal DAC Sink"), +}; + +static const struct snd_soc_dapm_route audio_map[] = { + /* Make DACs turn on when playing even if not mixed into any outputs */ + {"Internal DAC Sink", NULL, "Left DAC"}, + {"Internal DAC Sink", NULL, "Right DAC"}, + + /* Make ADCs turn on when recording + * even if not mixed from any inputs */ + {"Left ADC", NULL, "Internal ADC Source"}, + {"Right ADC", NULL, "Internal ADC Source"}, + + /* Input Side */ + /* LIN12 PGA */ + {"LIN12 PGA", "LIN1 Switch", "LIN1"}, + {"LIN12 PGA", "LIN2 Switch", "LIN2"}, + /* LIN34 PGA */ + {"LIN34 PGA", "LIN3 Switch", "LIN3"}, + {"LIN34 PGA", "LIN4 Switch", "LIN4/RXN"}, + /* INMIXL */ + {"INMIXL", "Record Left Volume", "LOMIX"}, + {"INMIXL", "LIN2 Volume", "LIN2"}, + {"INMIXL", "LINPGA12 Switch", "LIN12 PGA"}, + {"INMIXL", "LINPGA34 Switch", "LIN34 PGA"}, + /* AILNMUX */ + {"AILNMUX", "INMIXL Mix", "INMIXL"}, + {"AILNMUX", "DIFFINL Mix", "LIN12 PGA"}, + {"AILNMUX", "DIFFINL Mix", "LIN34 PGA"}, + {"AILNMUX", "RXVOICE Mix", "LIN4/RXN"}, + {"AILNMUX", "RXVOICE Mix", "RIN4/RXP"}, + /* ADC */ + {"Left ADC", NULL, "AILNMUX"}, + + /* RIN12 PGA */ + {"RIN12 PGA", "RIN1 Switch", "RIN1"}, + {"RIN12 PGA", "RIN2 Switch", "RIN2"}, + /* RIN34 PGA */ + {"RIN34 PGA", "RIN3 Switch", "RIN3"}, + {"RIN34 PGA", "RIN4 Switch", "RIN4/RXP"}, + /* INMIXL */ + {"INMIXR", "Record Right Volume", "ROMIX"}, + {"INMIXR", "RIN2 Volume", "RIN2"}, + {"INMIXR", "RINPGA12 Switch", "RIN12 PGA"}, + {"INMIXR", "RINPGA34 Switch", "RIN34 PGA"}, + /* AIRNMUX */ + {"AIRNMUX", "INMIXR Mix", "INMIXR"}, + {"AIRNMUX", "DIFFINR Mix", "RIN12 PGA"}, + {"AIRNMUX", "DIFFINR Mix", "RIN34 PGA"}, + {"AIRNMUX", "RXVOICE Mix", "LIN4/RXN"}, + {"AIRNMUX", "RXVOICE Mix", "RIN4/RXP"}, + /* ADC */ + {"Right ADC", NULL, "AIRNMUX"}, + + /* LOMIX */ + {"LOMIX", "LOMIX RIN3 Bypass Switch", "RIN3"}, + {"LOMIX", "LOMIX LIN3 Bypass Switch", "LIN3"}, + {"LOMIX", "LOMIX LIN12 PGA Bypass Switch", "LIN12 PGA"}, + {"LOMIX", "LOMIX RIN12 PGA Bypass Switch", "RIN12 PGA"}, + {"LOMIX", "LOMIX Right ADC Bypass Switch", "AIRNMUX"}, + {"LOMIX", "LOMIX Left ADC Bypass Switch", "AILNMUX"}, + {"LOMIX", "LOMIX Left DAC Switch", "Left DAC"}, + + /* ROMIX */ + {"ROMIX", "ROMIX RIN3 Bypass Switch", "RIN3"}, + {"ROMIX", "ROMIX LIN3 Bypass Switch", "LIN3"}, + {"ROMIX", "ROMIX LIN12 PGA Bypass Switch", "LIN12 PGA"}, + {"ROMIX", "ROMIX RIN12 PGA Bypass Switch", "RIN12 PGA"}, + {"ROMIX", "ROMIX Right ADC Bypass Switch", "AIRNMUX"}, + {"ROMIX", "ROMIX Left ADC Bypass Switch", "AILNMUX"}, + {"ROMIX", "ROMIX Right DAC Switch", "Right DAC"}, + + /* SPKMIX */ + {"SPKMIX", "SPKMIX LIN2 Bypass Switch", "LIN2"}, + {"SPKMIX", "SPKMIX RIN2 Bypass Switch", "RIN2"}, + {"SPKMIX", "SPKMIX LADC Bypass Switch", "AILNMUX"}, + {"SPKMIX", "SPKMIX RADC Bypass Switch", "AIRNMUX"}, + {"SPKMIX", "SPKMIX Left Mixer PGA Switch", "LOPGA"}, + {"SPKMIX", "SPKMIX Right Mixer PGA Switch", "ROPGA"}, + {"SPKMIX", "SPKMIX Right DAC 
Switch", "Right DAC"}, + {"SPKMIX", "SPKMIX Left DAC Switch", "Right DAC"}, + + /* LONMIX */ + {"LONMIX", "LONMIX Left Mixer PGA Switch", "LOPGA"}, + {"LONMIX", "LONMIX Right Mixer PGA Switch", "ROPGA"}, + {"LONMIX", "LONMIX Inverted LOP Switch", "LOPMIX"}, + + /* LOPMIX */ + {"LOPMIX", "LOPMIX Right Mic Bypass Switch", "RIN12 PGA"}, + {"LOPMIX", "LOPMIX Left Mic Bypass Switch", "LIN12 PGA"}, + {"LOPMIX", "LOPMIX Left Mixer PGA Switch", "LOPGA"}, + + /* OUT3MIX */ + {"OUT3MIX", "OUT3MIX LIN4/RXP Bypass Switch", "LIN4/RXN"}, + {"OUT3MIX", "OUT3MIX Left Out PGA Switch", "LOPGA"}, + + /* OUT4MIX */ + {"OUT4MIX", "OUT4MIX Right Out PGA Switch", "ROPGA"}, + {"OUT4MIX", "OUT4MIX RIN4/RXP Bypass Switch", "RIN4/RXP"}, + + /* RONMIX */ + {"RONMIX", "RONMIX Right Mixer PGA Switch", "ROPGA"}, + {"RONMIX", "RONMIX Left Mixer PGA Switch", "LOPGA"}, + {"RONMIX", "RONMIX Inverted ROP Switch", "ROPMIX"}, + + /* ROPMIX */ + {"ROPMIX", "ROPMIX Left Mic Bypass Switch", "LIN12 PGA"}, + {"ROPMIX", "ROPMIX Right Mic Bypass Switch", "RIN12 PGA"}, + {"ROPMIX", "ROPMIX Right Mixer PGA Switch", "ROPGA"}, + + /* Out Mixer PGAs */ + {"LOPGA", NULL, "LOMIX"}, + {"ROPGA", NULL, "ROMIX"}, + + {"LOUT PGA", NULL, "LOMIX"}, + {"ROUT PGA", NULL, "ROMIX"}, + + /* Output Pins */ + {"LON", NULL, "LONMIX"}, + {"LOP", NULL, "LOPMIX"}, + {"OUT3", NULL, "OUT3MIX"}, + {"LOUT", NULL, "LOUT PGA"}, + {"SPKN", NULL, "SPKMIX"}, + {"ROUT", NULL, "ROUT PGA"}, + {"OUT4", NULL, "OUT4MIX"}, + {"ROP", NULL, "ROPMIX"}, + {"RON", NULL, "RONMIX"}, +}; + +static int wm8400_add_widgets(struct snd_soc_codec *codec) +{ + snd_soc_dapm_new_controls(codec, wm8400_dapm_widgets, + ARRAY_SIZE(wm8400_dapm_widgets)); + + snd_soc_dapm_add_routes(codec, audio_map, ARRAY_SIZE(audio_map)); + + snd_soc_dapm_new_widgets(codec); + return 0; +} + +/* + * Clock after FLL and dividers + */ +static int wm8400_set_dai_sysclk(struct snd_soc_dai *codec_dai, + int clk_id, unsigned int freq, int dir) +{ + struct snd_soc_codec *codec = codec_dai->codec; + struct wm8400_priv *wm8400 = codec->private_data; + + wm8400->sysclk = freq; + return 0; +} + +/* + * Sets ADC and Voice DAC format. 
+ */ +static int wm8400_set_dai_fmt(struct snd_soc_dai *codec_dai, + unsigned int fmt) +{ + struct snd_soc_codec *codec = codec_dai->codec; + u16 audio1, audio3; + + audio1 = wm8400_read(codec, WM8400_AUDIO_INTERFACE_1); + audio3 = wm8400_read(codec, WM8400_AUDIO_INTERFACE_3); + + /* set master/slave audio interface */ + switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { + case SND_SOC_DAIFMT_CBS_CFS: + audio3 &= ~WM8400_AIF_MSTR1; + break; + case SND_SOC_DAIFMT_CBM_CFM: + audio3 |= WM8400_AIF_MSTR1; + break; + default: + return -EINVAL; + } + + audio1 &= ~WM8400_AIF_FMT_MASK; + + /* interface format */ + switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { + case SND_SOC_DAIFMT_I2S: + audio1 |= WM8400_AIF_FMT_I2S; + audio1 &= ~WM8400_AIF_LRCLK_INV; + break; + case SND_SOC_DAIFMT_RIGHT_J: + audio1 |= WM8400_AIF_FMT_RIGHTJ; + audio1 &= ~WM8400_AIF_LRCLK_INV; + break; + case SND_SOC_DAIFMT_LEFT_J: + audio1 |= WM8400_AIF_FMT_LEFTJ; + audio1 &= ~WM8400_AIF_LRCLK_INV; + break; + case SND_SOC_DAIFMT_DSP_A: + audio1 |= WM8400_AIF_FMT_DSP; + audio1 &= ~WM8400_AIF_LRCLK_INV; + break; + case SND_SOC_DAIFMT_DSP_B: + audio1 |= WM8400_AIF_FMT_DSP | WM8400_AIF_LRCLK_INV; + break; + default: + return -EINVAL; + } + + wm8400_write(codec, WM8400_AUDIO_INTERFACE_1, audio1); + wm8400_write(codec, WM8400_AUDIO_INTERFACE_3, audio3); + return 0; +} + +static int wm8400_set_dai_clkdiv(struct snd_soc_dai *codec_dai, + int div_id, int div) +{ + struct snd_soc_codec *codec = codec_dai->codec; + u16 reg; + + switch (div_id) { + case WM8400_MCLK_DIV: + reg = wm8400_read(codec, WM8400_CLOCKING_2) & + ~WM8400_MCLK_DIV_MASK; + wm8400_write(codec, WM8400_CLOCKING_2, reg | div); + break; + case WM8400_DACCLK_DIV: + reg = wm8400_read(codec, WM8400_CLOCKING_2) & + ~WM8400_DAC_CLKDIV_MASK; + wm8400_write(codec, WM8400_CLOCKING_2, reg | div); + break; + case WM8400_ADCCLK_DIV: + reg = wm8400_read(codec, WM8400_CLOCKING_2) & + ~WM8400_ADC_CLKDIV_MASK; + wm8400_write(codec, WM8400_CLOCKING_2, reg | div); + break; + case WM8400_BCLK_DIV: + reg = wm8400_read(codec, WM8400_CLOCKING_1) & + ~WM8400_BCLK_DIV_MASK; + wm8400_write(codec, WM8400_CLOCKING_1, reg | div); + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Set PCM DAI bit size and sample rate. 
+ */ +static int wm8400_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct snd_soc_pcm_runtime *rtd = substream->private_data; + struct snd_soc_device *socdev = rtd->socdev; + struct snd_soc_codec *codec = socdev->card->codec; + u16 audio1 = wm8400_read(codec, WM8400_AUDIO_INTERFACE_1); + + audio1 &= ~WM8400_AIF_WL_MASK; + /* bit size */ + switch (params_format(params)) { + case SNDRV_PCM_FORMAT_S16_LE: + break; + case SNDRV_PCM_FORMAT_S20_3LE: + audio1 |= WM8400_AIF_WL_20BITS; + break; + case SNDRV_PCM_FORMAT_S24_LE: + audio1 |= WM8400_AIF_WL_24BITS; + break; + case SNDRV_PCM_FORMAT_S32_LE: + audio1 |= WM8400_AIF_WL_32BITS; + break; + } + + wm8400_write(codec, WM8400_AUDIO_INTERFACE_1, audio1); + return 0; +} + +static int wm8400_mute(struct snd_soc_dai *dai, int mute) +{ + struct snd_soc_codec *codec = dai->codec; + u16 val = wm8400_read(codec, WM8400_DAC_CTRL) & ~WM8400_DAC_MUTE; + + if (mute) + wm8400_write(codec, WM8400_DAC_CTRL, val | WM8400_DAC_MUTE); + else + wm8400_write(codec, WM8400_DAC_CTRL, val); + + return 0; +} + +/* TODO: set bias for best performance at standby */ +static int wm8400_set_bias_level(struct snd_soc_codec *codec, + enum snd_soc_bias_level level) +{ + struct wm8400_priv *wm8400 = codec->private_data; + u16 val; + int ret; + + switch (level) { + case SND_SOC_BIAS_ON: + break; + + case SND_SOC_BIAS_PREPARE: + /* VMID=2*50k */ + val = wm8400_read(codec, WM8400_POWER_MANAGEMENT_1) & + ~WM8400_VMID_MODE_MASK; + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, val | 0x2); + break; + + case SND_SOC_BIAS_STANDBY: + if (codec->bias_level == SND_SOC_BIAS_OFF) { + ret = regulator_bulk_enable(ARRAY_SIZE(power), + &power[0]); + if (ret != 0) { + dev_err(wm8400->wm8400->dev, + "Failed to enable regulators: %d\n", + ret); + return ret; + } + + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, + WM8400_CODEC_ENA | WM8400_SYSCLK_ENA); + + /* Enable all output discharge bits */ + wm8400_write(codec, WM8400_ANTIPOP1, WM8400_DIS_LLINE | + WM8400_DIS_RLINE | WM8400_DIS_OUT3 | + WM8400_DIS_OUT4 | WM8400_DIS_LOUT | + WM8400_DIS_ROUT); + + /* Enable POBCTRL, SOFT_ST, VMIDTOG and BUFDCOPEN */ + wm8400_write(codec, WM8400_ANTIPOP2, WM8400_SOFTST | + WM8400_BUFDCOPEN | WM8400_POBCTRL); + + msleep(500); + + /* Enable outputs */ + val = wm8400_read(codec, WM8400_POWER_MANAGEMENT_1); + val |= WM8400_SPK_ENA | WM8400_OUT3_ENA | + WM8400_OUT4_ENA | WM8400_LOUT_ENA | + WM8400_ROUT_ENA; + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, val); + + /* disable all output discharge bits */ + wm8400_write(codec, WM8400_ANTIPOP1, 0); + + /* Enable VREF & VMID at 2x50k */ + val |= 0x2 | WM8400_VREF_ENA; + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, val); + + msleep(600); + + /* Enable BUFIOEN */ + wm8400_write(codec, WM8400_ANTIPOP2, WM8400_SOFTST | + WM8400_BUFDCOPEN | WM8400_POBCTRL | + WM8400_BUFIOEN); + + /* Disable outputs */ + val &= ~(WM8400_SPK_ENA | WM8400_OUT3_ENA | + WM8400_OUT4_ENA | WM8400_LOUT_ENA | + WM8400_ROUT_ENA); + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, val); + + /* disable POBCTRL, SOFT_ST and BUFDCOPEN */ + wm8400_write(codec, WM8400_ANTIPOP2, WM8400_BUFIOEN); + } + + /* VMID=2*300k */ + val = wm8400_read(codec, WM8400_POWER_MANAGEMENT_1) & + ~WM8400_VMID_MODE_MASK; + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, val | 0x4); + break; + + case SND_SOC_BIAS_OFF: + /* Enable POBCTRL and SOFT_ST */ + wm8400_write(codec, WM8400_ANTIPOP2, WM8400_SOFTST | + WM8400_POBCTRL | WM8400_BUFIOEN); + + /* Enable 
POBCTRL, SOFT_ST and BUFDCOPEN */ + wm8400_write(codec, WM8400_ANTIPOP2, WM8400_SOFTST | + WM8400_BUFDCOPEN | WM8400_POBCTRL | + WM8400_BUFIOEN); + + /* mute DAC */ + val = wm8400_read(codec, WM8400_DAC_CTRL); + wm8400_write(codec, WM8400_DAC_CTRL, val | WM8400_DAC_MUTE); + + /* Enable any disabled outputs */ + val = wm8400_read(codec, WM8400_POWER_MANAGEMENT_1); + val |= WM8400_SPK_ENA | WM8400_OUT3_ENA | + WM8400_OUT4_ENA | WM8400_LOUT_ENA | + WM8400_ROUT_ENA; + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, val); + + /* Disable VMID */ + val &= ~WM8400_VMID_MODE_MASK; + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, val); + + msleep(300); + + /* Enable all output discharge bits */ + wm8400_write(codec, WM8400_ANTIPOP1, WM8400_DIS_LLINE | + WM8400_DIS_RLINE | WM8400_DIS_OUT3 | + WM8400_DIS_OUT4 | WM8400_DIS_LOUT | + WM8400_DIS_ROUT); + + /* Disable VREF */ + val &= ~WM8400_VREF_ENA; + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, val); + + /* disable POBCTRL, SOFT_ST and BUFDCOPEN */ + wm8400_write(codec, WM8400_ANTIPOP2, 0x0); + + ret = regulator_bulk_disable(ARRAY_SIZE(power), + &power[0]); + if (ret != 0) + return ret; + + break; + } + + codec->bias_level = level; + return 0; +} + +#define WM8400_RATES SNDRV_PCM_RATE_8000_96000 + +#define WM8400_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S20_3LE |\ + SNDRV_PCM_FMTBIT_S24_LE) + +/* + * The WM8400 supports 2 different and mutually exclusive DAI + * configurations. + * + * 1. ADC/DAC on Primary Interface + * 2. ADC on Primary Interface/DAC on secondary + */ +struct snd_soc_dai wm8400_dai = { +/* ADC/DAC on primary */ + .name = "WM8400 ADC/DAC Primary", + .id = 1, + .playback = { + .stream_name = "Playback", + .channels_min = 1, + .channels_max = 2, + .rates = WM8400_RATES, + .formats = WM8400_FORMATS, + }, + .capture = { + .stream_name = "Capture", + .channels_min = 1, + .channels_max = 2, + .rates = WM8400_RATES, + .formats = WM8400_FORMATS, + }, + .ops = { + .hw_params = wm8400_hw_params, + .digital_mute = wm8400_mute, + .set_fmt = wm8400_set_dai_fmt, + .set_clkdiv = wm8400_set_dai_clkdiv, + .set_sysclk = wm8400_set_dai_sysclk, + }, +}; +EXPORT_SYMBOL_GPL(wm8400_dai); + +static int wm8400_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct snd_soc_device *socdev = platform_get_drvdata(pdev); + struct snd_soc_codec *codec = socdev->card->codec; + + wm8400_set_bias_level(codec, SND_SOC_BIAS_OFF); + + return 0; +} + +static int wm8400_resume(struct platform_device *pdev) +{ + struct snd_soc_device *socdev = platform_get_drvdata(pdev); + struct snd_soc_codec *codec = socdev->card->codec; + + wm8400_set_bias_level(codec, SND_SOC_BIAS_STANDBY); + + return 0; +} + +static struct snd_soc_codec *wm8400_codec; + +static int wm8400_probe(struct platform_device *pdev) +{ + struct snd_soc_device *socdev = platform_get_drvdata(pdev); + struct snd_soc_codec *codec; + int ret; + + if (!wm8400_codec) { + dev_err(&pdev->dev, "wm8400 not yet discovered\n"); + return -ENODEV; + } + codec = wm8400_codec; + + socdev->card->codec = codec; + + /* register pcms */ + ret = snd_soc_new_pcms(socdev, SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1); + if (ret < 0) { + dev_err(&pdev->dev, "failed to create pcms\n"); + goto pcm_err; + } + + wm8400_add_controls(codec); + wm8400_add_widgets(codec); + + ret = snd_soc_init_card(socdev); + if (ret < 0) { + dev_err(&pdev->dev, "failed to register card\n"); + goto card_err; + } + + return ret; + +card_err: + snd_soc_free_pcms(socdev); + snd_soc_dapm_free(socdev); +pcm_err: + return ret; +} + +/* power 
down chip */ +static int wm8400_remove(struct platform_device *pdev) +{ + struct snd_soc_device *socdev = platform_get_drvdata(pdev); + + snd_soc_free_pcms(socdev); + snd_soc_dapm_free(socdev); + + return 0; +} + +struct snd_soc_codec_device soc_codec_dev_wm8400 = { + .probe = wm8400_probe, + .remove = wm8400_remove, + .suspend = wm8400_suspend, + .resume = wm8400_resume, +}; + +static void wm8400_probe_deferred(struct work_struct *work) +{ + struct wm8400_priv *priv = container_of(work, struct wm8400_priv, + work); + struct snd_soc_codec *codec = &priv->codec; + int ret; + + /* charge output caps */ + wm8400_set_bias_level(codec, SND_SOC_BIAS_STANDBY); + + /* We're done, tell the subsystem. */ + ret = snd_soc_register_codec(codec); + if (ret != 0) { + dev_err(priv->wm8400->dev, + "Failed to register codec: %d\n", ret); + goto err; + } + + ret = snd_soc_register_dai(&wm8400_dai); + if (ret != 0) { + dev_err(priv->wm8400->dev, + "Failed to register DAI: %d\n", ret); + goto err_codec; + } + + return; + +err_codec: + snd_soc_unregister_codec(codec); +err: + wm8400_set_bias_level(codec, SND_SOC_BIAS_OFF); +} + +static int wm8400_codec_probe(struct platform_device *dev) +{ + struct wm8400_priv *priv; + int ret; + u16 reg; + struct snd_soc_codec *codec; + + priv = kzalloc(sizeof(struct wm8400_priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + codec = &priv->codec; + codec->private_data = priv; + codec->control_data = dev->dev.driver_data; + priv->wm8400 = dev->dev.driver_data; + + ret = regulator_bulk_get(priv->wm8400->dev, + ARRAY_SIZE(power), &power[0]); + if (ret != 0) { + dev_err(&dev->dev, "Failed to get regulators: %d\n", ret); + goto err; + } + + codec->dev = &dev->dev; + wm8400_dai.dev = &dev->dev; + + codec->name = "WM8400"; + codec->owner = THIS_MODULE; + codec->read = wm8400_read; + codec->write = wm8400_write; + codec->bias_level = SND_SOC_BIAS_OFF; + codec->set_bias_level = wm8400_set_bias_level; + codec->dai = &wm8400_dai; + codec->num_dai = 1; + codec->reg_cache_size = WM8400_REGISTER_COUNT; + mutex_init(&codec->mutex); + INIT_LIST_HEAD(&codec->dapm_widgets); + INIT_LIST_HEAD(&codec->dapm_paths); + INIT_WORK(&priv->work, wm8400_probe_deferred); + + wm8400_codec_reset(codec); + + reg = wm8400_read(codec, WM8400_POWER_MANAGEMENT_1); + wm8400_write(codec, WM8400_POWER_MANAGEMENT_1, reg | WM8400_CODEC_ENA); + + /* Latch volume update bits */ + reg = wm8400_read(codec, WM8400_LEFT_LINE_INPUT_1_2_VOLUME); + wm8400_write(codec, WM8400_LEFT_LINE_INPUT_1_2_VOLUME, + reg & WM8400_IPVU); + reg = wm8400_read(codec, WM8400_RIGHT_LINE_INPUT_1_2_VOLUME); + wm8400_write(codec, WM8400_RIGHT_LINE_INPUT_1_2_VOLUME, + reg & WM8400_IPVU); + + wm8400_write(codec, WM8400_LEFT_OUTPUT_VOLUME, 0x50 | (1<<8)); + wm8400_write(codec, WM8400_RIGHT_OUTPUT_VOLUME, 0x50 | (1<<8)); + + wm8400_codec = codec; + + if (!schedule_work(&priv->work)) { + ret = -EINVAL; + goto err_regulator; + } + + return 0; + +err_regulator: + wm8400_codec = NULL; + regulator_bulk_free(ARRAY_SIZE(power), power); +err: + kfree(priv); + return ret; +} + +static int __exit wm8400_codec_remove(struct platform_device *dev) +{ + struct wm8400_priv *priv = wm8400_codec->private_data; + u16 reg; + + snd_soc_unregister_dai(&wm8400_dai); + snd_soc_unregister_codec(wm8400_codec); + + reg = wm8400_read(wm8400_codec, WM8400_POWER_MANAGEMENT_1); + wm8400_write(wm8400_codec, WM8400_POWER_MANAGEMENT_1, + reg & (~WM8400_CODEC_ENA)); + + regulator_bulk_free(ARRAY_SIZE(power), power); + kfree(priv); + + wm8400_codec = NULL; + + return 0; +} + 
+static struct platform_driver wm8400_codec_driver = { + .driver = { + .name = "wm8400-codec", + .owner = THIS_MODULE, + }, + .probe = wm8400_codec_probe, + .remove = __exit_p(wm8400_codec_remove), +}; + +static int __init wm8400_codec_init(void) +{ + return platform_driver_register(&wm8400_codec_driver); +} +module_init(wm8400_codec_init); + +static void __exit wm8400_codec_exit(void) +{ + platform_driver_unregister(&wm8400_codec_driver); +} +module_exit(wm8400_codec_exit); + +EXPORT_SYMBOL_GPL(soc_codec_dev_wm8400); + +MODULE_DESCRIPTION("ASoC WM8400 driver"); +MODULE_AUTHOR("Mark Brown"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:wm8400-codec"); diff --git a/sound/soc/codecs/wm8400.h b/sound/soc/codecs/wm8400.h new file mode 100644 index 000000000000..79c5934d4776 --- /dev/null +++ b/sound/soc/codecs/wm8400.h @@ -0,0 +1,62 @@ +/* + * wm8400.h -- audio driver for WM8400 + * + * Copyright 2008 Wolfson Microelectronics PLC. + * Author: Mark Brown + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#ifndef _WM8400_CODEC_H +#define _WM8400_CODEC_H + +#define WM8400_MCLK_DIV 0 +#define WM8400_DACCLK_DIV 1 +#define WM8400_ADCCLK_DIV 2 +#define WM8400_BCLK_DIV 3 + +#define WM8400_MCLK_DIV_1 0x400 +#define WM8400_MCLK_DIV_2 0x800 + +#define WM8400_DAC_CLKDIV_1 0x00 +#define WM8400_DAC_CLKDIV_1_5 0x04 +#define WM8400_DAC_CLKDIV_2 0x08 +#define WM8400_DAC_CLKDIV_3 0x0c +#define WM8400_DAC_CLKDIV_4 0x10 +#define WM8400_DAC_CLKDIV_5_5 0x14 +#define WM8400_DAC_CLKDIV_6 0x18 + +#define WM8400_ADC_CLKDIV_1 0x00 +#define WM8400_ADC_CLKDIV_1_5 0x20 +#define WM8400_ADC_CLKDIV_2 0x40 +#define WM8400_ADC_CLKDIV_3 0x60 +#define WM8400_ADC_CLKDIV_4 0x80 +#define WM8400_ADC_CLKDIV_5_5 0xa0 +#define WM8400_ADC_CLKDIV_6 0xc0 + + +#define WM8400_BCLK_DIV_1 (0x0 << 1) +#define WM8400_BCLK_DIV_1_5 (0x1 << 1) +#define WM8400_BCLK_DIV_2 (0x2 << 1) +#define WM8400_BCLK_DIV_3 (0x3 << 1) +#define WM8400_BCLK_DIV_4 (0x4 << 1) +#define WM8400_BCLK_DIV_5_5 (0x5 << 1) +#define WM8400_BCLK_DIV_6 (0x6 << 1) +#define WM8400_BCLK_DIV_8 (0x7 << 1) +#define WM8400_BCLK_DIV_11 (0x8 << 1) +#define WM8400_BCLK_DIV_12 (0x9 << 1) +#define WM8400_BCLK_DIV_16 (0xA << 1) +#define WM8400_BCLK_DIV_22 (0xB << 1) +#define WM8400_BCLK_DIV_24 (0xC << 1) +#define WM8400_BCLK_DIV_32 (0xD << 1) +#define WM8400_BCLK_DIV_44 (0xE << 1) +#define WM8400_BCLK_DIV_48 (0xF << 1) + +extern struct snd_soc_dai wm8400_dai; +extern struct snd_soc_codec_device soc_codec_dev_wm8400; + +#endif -- cgit v1.2.3-71-gd317 From 9e6e70f8d8b6698e0017c56b86525aabe9c7cd4c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:24 -0400 Subject: NFSv4: Support NFSv4 optional attributes in the struct nfs_fattr Currently, filling struct nfs_fattr is more or less an all or nothing operation, since NFSv2 and NFSv3 have only mandatory attributes. In NFSv4, some attributes are optional, and so we may simply not be able to fill in those fields. Furthermore, NFSv4 allows you to specify which attributes you are interested in retrieving, thus permitting you to optimise away retrieval of attributes that you know will no change... 
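For illustration only, the intended usage pattern is to test the relevant bit in fattr->valid before consuming a field, and to leave the field alone when the bit is clear; a minimal sketch using the flag names introduced by this patch:

	if (fattr->valid & NFS_ATTR_FATTR_SIZE)
		inode->i_size = nfs_size_to_loff_t(fattr->size);
	if (fattr->valid & NFS_ATTR_FATTR_MTIME)
		inode->i_mtime = fattr->mtime;
	/* attributes the server did not return are simply left untouched */
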
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 243 ++++++++++++++++++++++++++++++------------------ fs/nfs/nfs2xdr.c | 2 +- fs/nfs/nfs3xdr.c | 6 +- fs/nfs/nfs4xdr.c | 6 +- include/linux/nfs_xdr.h | 48 ++++++++-- 5 files changed, 202 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 268ce3a46220..b7656bd3706f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -249,13 +249,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR) == 0) + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) goto out_no_inode; - - if (!fattr->nlink) { - printk("NFS: Buggy server - nlink == 0!\n"); + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; - } hash = nfs_fattr_to_ino_t(fattr); @@ -291,7 +288,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) && fattr->size <= NFS_LIMIT_READDIRPLUS) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ - if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if ((fattr->valid & NFS_ATTR_FATTR_FSID) + && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else @@ -304,28 +302,45 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else init_special_inode(inode, inode->i_mode, fattr->rdev); + memset(&inode->i_atime, 0, sizeof(inode->i_atime)); + memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + nfsi->change_attr = 0; + inode->i_size = 0; + inode->i_nlink = 0; + inode->i_uid = -2; + inode->i_gid = -2; + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; - inode->i_atime = fattr->atime; - inode->i_mtime = fattr->mtime; - inode->i_ctime = fattr->ctime; - if (fattr->valid & NFS_ATTR_FATTR_V4) + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = fattr->atime; + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = fattr->mtime; + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = fattr->ctime; + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) nfsi->change_attr = fattr->change_attr; - inode->i_size = nfs_size_to_loff_t(fattr->size); - inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_SIZE) + inode->i_size = nfs_size_to_loff_t(fattr->size); + if (fattr->valid & NFS_ATTR_FATTR_NLINK) + inode->i_nlink = fattr->nlink; + if (fattr->valid & NFS_ATTR_FATTR_OWNER) + inode->i_uid = fattr->uid; + if (fattr->valid & NFS_ATTR_FATTR_GROUP) + inode->i_gid = fattr->gid; + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->access_cache = RB_ROOT; unlock_new_inode(inode); @@ -812,25 +827,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = 
NFS_I(inode); - if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && - nfsi->change_attr == fattr->pre_change_attr) { + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) + && (fattr->valid & NFS_ATTR_FATTR_CHANGE) + && nfsi->change_attr == fattr->pre_change_attr) { nfsi->change_attr = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; } /* If we have atomic WCC data, we may update some attributes */ - if ((fattr->valid & NFS_ATTR_WCC) != 0) { - if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) + if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) + && (fattr->valid & NFS_ATTR_FATTR_CTIME) + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + + if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) + && (fattr->valid & NFS_ATTR_FATTR_MTIME) + && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; - } - if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && - nfsi->npages == 0) - i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } + if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) + && (fattr->valid & NFS_ATTR_FATTR_SIZE) + && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) + && nfsi->npages == 0) + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } /** @@ -850,35 +871,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Has the inode gone and changed behind our back? */ - if (nfsi->fileid != fattr->fileid - || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + return -EIO; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - } - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ - if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->npages == 0) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } /* Have any file permissions changed? */ - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) - || inode->i_uid != fattr->uid - || inode->i_gid != fattr->gid) + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? 
*/ - if (inode->i_nlink != fattr->nlink) + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_ATTR; - if (!timespec_equal(&inode->i_atime, &fattr->atime)) + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) @@ -890,11 +915,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { + if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) + return 0; return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return 0; return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); } @@ -1030,20 +1059,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { - fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); + fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME); goto out_noforce; } - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && - (fattr->valid & NFS_ATTR_WCC_V4) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; - fattr->valid |= NFS_ATTR_WCC_V4; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } - if ((fattr->valid & NFS_ATTR_FATTR) != 0 && - (fattr->valid & NFS_ATTR_WCC) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->valid |= NFS_ATTR_FATTR_PRECTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->valid |= NFS_ATTR_FATTR_PREMTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { fattr->pre_size = i_size_read(inode); - fattr->valid |= NFS_ATTR_WCC; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); @@ -1075,18 +1115,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) __func__, inode->i_sb->s_id, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); - if (nfsi->fileid != fattr->fileid) + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) goto out_fileid; /* * Make sure the inode's type hasn't changed. */ - if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; server = NFS_SERVER(inode); /* Update the fsid? 
*/ - if (S_ISDIR(inode->i_mode) && + if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) server->fsid = fattr->fsid; @@ -1096,14 +1136,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); /* More cache consistency checks */ - if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + nfsi->change_attr = fattr->change_attr; + } + } + + if (fattr->valid & NFS_ATTR_FATTR_MTIME) { /* NFSv2/v3: Check if the mtime agrees */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { dprintk("NFS: mtime change on server for file %s/%ld\n", @@ -1111,7 +1164,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } + } + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; @@ -1122,59 +1178,66 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; nfs_force_lookup_revalidate(inode); } + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } - } else if (nfsi->change_attr != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); } /* Check if our cached file size is stale */ - new_isize = nfs_size_to_loff_t(fattr->size); - cur_isize = i_size_read(inode); - if (new_isize != cur_isize) { - /* Do we perhaps have any outstanding writes, or has - * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { - i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? 
*/ + if (nfsi->npages == 0 || new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + } + dprintk("NFS: isize change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); } - dprintk("NFS: isize change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); } - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - nfsi->change_attr = fattr->change_attr; + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || - inode->i_uid != fattr->uid || - inode->i_gid != fattr->gid) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - - if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) - invalid |= NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_MODE) { + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_mode = fattr->mode; + } + } + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { + if (inode->i_uid != fattr->uid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_uid = fattr->uid; + } + } + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { + if (inode->i_gid != fattr->gid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_gid = fattr->gid; + } } - inode->i_mode = fattr->mode; - inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + inode->i_nlink = fattr->nlink; + } + } - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 28bab67d1519..bea99992c302 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -136,7 +136,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->atime); p = xdr_decode_time(p, &fattr->mtime); p = xdr_decode_time(p, &fattr->ctime); - fattr->valid |= NFS_ATTR_FATTR; + fattr->valid |= NFS_ATTR_FATTR_V2; fattr->rdev = new_decode_dev(rdev); if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { fattr->type = NFFIFO; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 6cdeacffde46..c0f7d02aced9 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -177,7 +177,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time3(p, &fattr->ctime); /* Update the mode bits */ - fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); + fattr->valid |= NFS_ATTR_FATTR_V3; return p; } @@ -233,7 +233,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_hyper(p, &fattr->pre_size); p = xdr_decode_time3(p, &fattr->pre_mtime); p = xdr_decode_time3(p, 
&fattr->pre_ctime); - fattr->valid |= NFS_ATTR_WCC; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME; return p; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5f0ee3e2bd84..7d220da3db36 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3012,7 +3012,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) goto xdr_error; fattr->type = nfs_type2fmt[type].nfs2type; - fmode = nfs_type2fmt[type].mode; + fattr->mode = nfs_type2fmt[type].mode; if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) goto xdr_error; @@ -3026,7 +3026,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons struct nfs4_fs_locations, fattr))) != 0) goto xdr_error; - if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) + if ((status = decode_attr_mode(xdr, bitmap, &fmode)) != 0) goto xdr_error; fattr->mode |= fmode; if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) @@ -3050,7 +3050,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if (fattr->fileid == 0 && fileid != 0) fattr->fileid = fileid; if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) - fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; + fattr->valid = NFS_ATTR_FATTR_V4; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2e5f00066afd..b99295e07cdf 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -27,7 +27,7 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid } struct nfs_fattr { - unsigned short valid; /* which fields are valid */ + unsigned int valid; /* which fields are valid */ __u64 pre_size; /* pre_op_attr.size */ struct timespec pre_mtime; /* pre_op_attr.mtime */ struct timespec pre_ctime; /* pre_op_attr.ctime */ @@ -59,12 +59,46 @@ struct nfs_fattr { unsigned long gencount; }; -#define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ -#define NFS_ATTR_FATTR 0x0002 /* post-op attributes */ -#define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ -#define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */ -#define NFS_ATTR_WCC_V4 0x0010 /* pre-op change attribute */ -#define NFS_ATTR_FATTR_V4_REFERRAL 0x0020 /* NFSv4 referral */ +#define NFS_ATTR_FATTR_TYPE (1U << 0) +#define NFS_ATTR_FATTR_MODE (1U << 1) +#define NFS_ATTR_FATTR_NLINK (1U << 2) +#define NFS_ATTR_FATTR_OWNER (1U << 3) +#define NFS_ATTR_FATTR_GROUP (1U << 4) +#define NFS_ATTR_FATTR_RDEV (1U << 5) +#define NFS_ATTR_FATTR_SIZE (1U << 6) +#define NFS_ATTR_FATTR_PRESIZE (1U << 7) +#define NFS_ATTR_FATTR_BLOCKS_USED (1U << 8) +#define NFS_ATTR_FATTR_SPACE_USED (1U << 9) +#define NFS_ATTR_FATTR_FSID (1U << 10) +#define NFS_ATTR_FATTR_FILEID (1U << 11) +#define NFS_ATTR_FATTR_ATIME (1U << 12) +#define NFS_ATTR_FATTR_MTIME (1U << 13) +#define NFS_ATTR_FATTR_CTIME (1U << 14) +#define NFS_ATTR_FATTR_PREMTIME (1U << 15) +#define NFS_ATTR_FATTR_PRECTIME (1U << 16) +#define NFS_ATTR_FATTR_CHANGE (1U << 17) +#define NFS_ATTR_FATTR_PRECHANGE (1U << 18) +#define NFS_ATTR_FATTR_V4_REFERRAL (1U << 19) /* NFSv4 referral */ + +#define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \ + | NFS_ATTR_FATTR_MODE \ + | NFS_ATTR_FATTR_NLINK \ + | NFS_ATTR_FATTR_OWNER \ + | NFS_ATTR_FATTR_GROUP \ + | NFS_ATTR_FATTR_RDEV \ + | NFS_ATTR_FATTR_SIZE \ + | NFS_ATTR_FATTR_FSID \ + | NFS_ATTR_FATTR_FILEID \ 
+ | NFS_ATTR_FATTR_ATIME \ + | NFS_ATTR_FATTR_MTIME \ + | NFS_ATTR_FATTR_CTIME) +#define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \ + | NFS_ATTR_FATTR_BLOCKS_USED) +#define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \ + | NFS_ATTR_FATTR_SPACE_USED) +#define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \ + | NFS_ATTR_FATTR_SPACE_USED \ + | NFS_ATTR_FATTR_CHANGE) /* * Info on the file system -- cgit v1.2.3-71-gd317 From 1ca277d88dafdbc3c5a69d32590e7184b9af6371 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:25 -0400 Subject: NFS: Shrink the struct nfs_fattr We don't need the bitmap[] field anymore, since the 'valid' field tells us all we need to know about which attributes were filled in... Also move the pre-op attributes in order to improve the structure packing. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 3 --- include/linux/nfs_xdr.h | 7 +++---- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7d220da3db36..9f1df8361974 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3002,9 +3002,6 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto xdr_error; - fattr->bitmap[0] = bitmap[0]; - fattr->bitmap[1] = bitmap[1]; - if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto xdr_error; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b99295e07cdf..6013acb0131f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -28,9 +28,6 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid struct nfs_fattr { unsigned int valid; /* which fields are valid */ - __u64 pre_size; /* pre_op_attr.size */ - struct timespec pre_mtime; /* pre_op_attr.mtime */ - struct timespec pre_ctime; /* pre_op_attr.ctime */ enum nfs_ftype type; /* always use NFSv2 types */ __u32 mode; __u32 nlink; @@ -52,9 +49,11 @@ struct nfs_fattr { struct timespec atime; struct timespec mtime; struct timespec ctime; - __u32 bitmap[2]; /* NFSv4 returned attribute bitmap */ __u64 change_attr; /* NFSv4 change attribute */ __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ + __u64 pre_size; /* pre_op_attr.size */ + struct timespec pre_mtime; /* pre_op_attr.mtime */ + struct timespec pre_ctime; /* pre_op_attr.ctime */ unsigned long time_start; unsigned long gencount; }; -- cgit v1.2.3-71-gd317 From bca794785c2c12ecddeb09e70165b8ff80baa6ae Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:26 -0400 Subject: NFS: Fix the type of struct nfs_fattr->mode There is no point in using anything other than umode_t, since we copy the content pretty much directly into inode->i_mode. 
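For callers, the visible change is that tests against the old nfs_ftype values become standard mode-bit tests; roughly (as in the getroot.c hunks below):

	/* before */
	if (fattr.type != NFDIR)
		return -ENOTDIR;
	/* after */
	if (!S_ISDIR(fattr.mode))
		return -ENOTDIR;
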
Signed-off-by: Trond Myklebust --- fs/nfs/getroot.c | 4 ++-- fs/nfs/nfs2xdr.c | 7 +++---- fs/nfs/nfs3xdr.c | 31 +++++++++++++------------------ fs/nfs/nfs4xdr.c | 40 +++++++++++++++++++--------------------- include/linux/nfs_xdr.h | 3 +-- 5 files changed, 38 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b7c9b2df1f29..46177cb87064 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server, return ret; } - if (fattr.type != NFDIR) { + if (!S_ISDIR(fattr.mode)) { printk(KERN_ERR "nfs4_get_root:" " getroot encountered non-directory\n"); return -ENOTDIR; @@ -213,7 +213,7 @@ eat_dot_dir: return ret; } - if (fattr.type != NFDIR) { + if (!S_ISDIR(fattr.mode)) { printk(KERN_ERR "nfs4_get_root:" " lookupfh encountered non-directory\n"); return -ENOTDIR; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index bea99992c302..c862c9340f9a 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep) static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { - u32 rdev; - fattr->type = (enum nfs_ftype) ntohl(*p++); + u32 rdev, type; + type = ntohl(*p++); fattr->mode = ntohl(*p++); fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); @@ -138,8 +138,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->ctime); fattr->valid |= NFS_ATTR_FATTR_V2; fattr->rdev = new_decode_dev(rdev); - if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { - fattr->type = NFFIFO; + if (type == NFCHR && rdev == NFS2_FIFO_DEV) { fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index c0f7d02aced9..e6a1932c7110 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -91,19 +91,15 @@ /* * Map file type to S_IFMT bits */ -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFBAD } +static const umode_t nfs_type2fmt[] = { + [NF3BAD] = 0, + [NF3REG] = S_IFREG, + [NF3DIR] = S_IFDIR, + [NF3BLK] = S_IFBLK, + [NF3CHR] = S_IFCHR, + [NF3LNK] = S_IFLNK, + [NF3SOCK] = S_IFSOCK, + [NF3FIFO] = S_IFIFO, }; /* @@ -148,13 +144,12 @@ static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { unsigned int type, major, minor; - int fmode; + umode_t fmode; type = ntohl(*p++); - if (type >= NF3BAD) - type = NF3BAD; - fmode = nfs_type2fmt[type].mode; - fattr->type = nfs_type2fmt[type].nfs2type; + if (type > NF3FIFO) + type = NF3NON; + fmode = nfs_type2fmt[type]; fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9f1df8361974..c1906d2a226b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int); decode_lookup_maxsz + \ decode_fs_locations_maxsz) -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFNON }, - { 0, NFNON }, +static const umode_t nfs_type2fmt[] = { + [NF4BAD] = 0, + [NF4REG] = S_IFREG, + [NF4DIR] = S_IFDIR, + [NF4BLK] = S_IFBLK, + [NF4CHR] = S_IFCHR, + 
[NF4LNK] = S_IFLNK, + [NF4SOCK] = S_IFSOCK, + [NF4FIFO] = S_IFIFO, + [NF4ATTRDIR] = 0, + [NF4NAMEDATTR] = 0, }; struct compound_hdr { @@ -2173,7 +2170,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } bitmap[0] &= ~FATTR4_WORD0_TYPE; } - dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return 0; } @@ -2580,8 +2577,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return status; } -static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) { + uint32_t tmp; __be32 *p; *mode = 0; @@ -2589,8 +2587,8 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { READ_BUF(4); - READ32(*mode); - *mode &= ~S_IFMT; + READ32(tmp); + *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); @@ -2994,7 +2992,8 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons uint32_t attrlen, bitmap[2] = {0}, type; - int status, fmode = 0; + int status; + umode_t fmode = 0; uint64_t fileid; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3008,8 +3007,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) goto xdr_error; - fattr->type = nfs_type2fmt[type].nfs2type; - fattr->mode = nfs_type2fmt[type].mode; + fattr->mode = nfs_type2fmt[type]; if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) goto xdr_error; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6013acb0131f..0691b9c188d9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -28,8 +28,7 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid struct nfs_fattr { unsigned int valid; /* which fields are valid */ - enum nfs_ftype type; /* always use NFSv2 types */ - __u32 mode; + umode_t mode; __u32 nlink; __u32 uid; __u32 gid; -- cgit v1.2.3-71-gd317 From a65318bf3afc93ce49227e849d213799b072c5fd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:28 -0400 Subject: NFSv4: Simplify some cache consistency post-op GETATTRs Certain asynchronous operations such as write() do not expect (or care) that other metadata such as the file owner, mode, acls, ... change. All they want to do is update and/or check the change attribute, ctime, and mtime. By skipping the file owner and group update, we also avoid having to do a potential idmapper upcall for these asynchronous RPC calls. 
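Concretely, the per-server mask is derived from the server's supported attribute bitmask and then reduced to just the cache consistency attributes, along the lines of:

	memcpy(server->cache_consistency_bitmask, res.attr_bitmask,
	       sizeof(server->cache_consistency_bitmask));
	server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE;
	server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY;

Asynchronous operations such as CLOSE, REMOVE, WRITE and COMMIT then pass this reduced bitmask instead of server->attr_bitmask, as shown in the hunks below.
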
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 13 ++++++++----- include/linux/nfs_fs_sb.h | 5 +++++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index aa433d077945..101f5f4c304f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1439,7 +1439,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) if (calldata->arg.seqid == NULL) goto out_free_calldata; calldata->arg.fmode = 0; - calldata->arg.bitmask = server->attr_bitmask; + calldata->arg.bitmask = server->cache_consistency_bitmask; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; @@ -1600,6 +1600,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); + server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; + server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->acl_bitmask = res.acl_bitmask; } return status; @@ -2079,7 +2082,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - args->bitmask = server->attr_bitmask; + args->bitmask = server->cache_consistency_bitmask; res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; } @@ -2323,7 +2326,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pages = &page, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask, }; struct nfs4_readdir_res res; struct rpc_message msg = { @@ -2552,7 +2555,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; data->res.server = server; data->timestamp = jiffies; @@ -2575,7 +2578,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 9bb81aec91cf..29b1e40dce99 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -106,6 +106,11 @@ struct nfs_server { u32 attr_bitmask[2];/* V4 bitmask representing the set of attributes supported on this filesystem */ + u32 cache_consistency_bitmask[2]; + /* V4 bitmask representing the subset + of change attribute, size, ctime + and mtime attributes supported by + the server */ u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ -- cgit v1.2.3-71-gd317 From fb8a1f11b64e213d94dfa1cebb2a42a7b8c115c4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:29 -0400 Subject: NFS: cleanup - remove struct nfs_inode->ncommit Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 - fs/nfs/write.c | 25 ++++++++++++++++--------- include/linux/nfs_fs.h | 3 +-- 3 files changed, 17 
insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b7656bd3706f..00f116cdadc6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1345,7 +1345,6 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); - nfsi->ncommit = 0; nfsi->npages = 0; atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9f9845859fc1..1a999939fedf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -404,7 +404,6 @@ nfs_mark_request_commit(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - nfsi->ncommit++; set_bit(PG_CLEAN, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, @@ -523,6 +522,12 @@ static void nfs_cancel_commit_list(struct list_head *head) } } +static int +nfs_need_commit(struct nfs_inode *nfsi) +{ + return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); +} + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) /* * nfs_scan_commit - Scan an inode for commit requests @@ -538,16 +543,18 @@ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res = 0; - if (nfsi->ncommit != 0) { - res = nfs_scan_list(nfsi, dst, idx_start, npages, - NFS_PAGE_TAG_COMMIT); - nfsi->ncommit -= res; - } - return res; + if (!nfs_need_commit(nfsi)) + return 0; + + return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); } #else +static inline int nfs_need_commit(struct nfs_inode *nfsi) +{ + return 0; +} + static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { return 0; @@ -820,7 +827,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; - if (!NFS_I(inode)->ncommit) + if (!nfs_need_commit(NFS_I(inode))) data->args.stable = NFS_FILE_SYNC; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index db867b04ac3c..c9fecd3e8f0f 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -166,8 +166,7 @@ struct nfs_inode { */ struct radix_tree_root nfs_page_tree; - unsigned long ncommit, - npages; + unsigned long npages; /* Open contexts for shared mmap writes */ struct list_head open_files; -- cgit v1.2.3-71-gd317 From 72cb77f4a5ace37b12dcb47a0e8637a2c28ad881 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:10:30 -0400 Subject: NFS: Throttle page dirtying while we're flushing to disk The following patch is a combination of a patch by myself and Peter Staubach. Trond: If we allow other processes to dirty pages while a process is doing a consistency sync to disk, we can end up never making progress. Peter: Attached is a patch which addresses a continuing problem with the NFS client generating out of order WRITE requests. While this is compliant with all of the current protocol specifications, there are servers in the market which can not handle out of order WRITE requests very well. Also, this may lead to sub-optimal block allocations in the underlying file system on the server. This may cause the read throughputs to be reduced when reading the file from the server. Peter: There has been a lot of work recently done to address out of order issues on a systemic level. 
However, the NFS client is still susceptible to the problem. Out of order WRITE requests can occur when pdflush is in the middle of writing out pages while the process dirtying the pages calls generic_file_buffered_write which calls generic_perform_write which calls balance_dirty_pages_rate_limited which ends up calling writeback_inodes which ends up calling back into the NFS client to writes out dirty pages for the same file that pdflush happens to be working with. Signed-off-by: Peter Staubach [modification by Trond to merge the two similar patches] Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 9 +++++++++ fs/nfs/inode.c | 12 ++++++++++++ fs/nfs/internal.h | 1 + fs/nfs/nfs4proc.c | 10 +--------- fs/nfs/pagelist.c | 11 ----------- fs/nfs/write.c | 28 +++++++++++++++++++--------- include/linux/nfs_fs.h | 1 + 7 files changed, 43 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d2..404c19c866a7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -354,6 +354,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); + /* + * Prevent starvation issues if someone is doing a consistency + * sync-to-disk + */ + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + return ret; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 00f116cdadc6..c40adc5dd609 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -65,6 +65,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } +/** + * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +int nfs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 340ede8f608f..a55e69aa52e5 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -165,6 +165,7 @@ extern void nfs_clear_inode(struct inode *); extern void nfs4_clear_inode(struct inode *); #endif void nfs_zap_acl_cache(struct inode *inode); +extern int nfs_wait_bit_killable(void *word); /* super.c */ void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 101f5f4c304f..95f171e7e05a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start, KM_USER0); } -static int nfs4_wait_bit_killable(void *word) -{ - if (fatal_signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - static int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; @@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs4_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, TASK_KILLABLE); return res; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7f079209d70a..e2975939126a 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req) kref_put(&req->wb_kref, nfs_free_request); } -static int 
nfs_wait_bit_killable(void *word) -{ - int ret = 0; - - if (fatal_signal_pending(current)) - ret = -ERESTARTSYS; - else - schedule(); - return ret; -} - /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1a999939fedf..36fd35e0de83 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; + /* Stop dirtying of new pages while we sync */ + err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (err) + goto out_err; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); + + clear_bit_unlock(NFS_INO_FLUSHING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_FLUSHING); + if (err < 0) - return err; - if (pgio.pg_error < 0) - return pgio.pg_error; + goto out_err; + err = pgio.pg_error; + if (err < 0) + goto out_err; return 0; +out_err: + return err; } /* @@ -1432,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how) { struct writeback_control wbc = { .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_NONE, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, .for_writepages = 1, }; - int ret; - ret = __nfs_write_mapping(mapping, &wbc, how); - if (ret < 0) - return ret; - wbc.sync_mode = WB_SYNC_ALL; return __nfs_write_mapping(mapping, &wbc, how); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c9fecd3e8f0f..933bc261c0df 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -206,6 +206,7 @@ struct nfs_inode { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ +#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3-71-gd317 From 441e3e242903f9b190d5764bed73edb58f977413 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Wed, 11 Mar 2009 14:37:56 -0400 Subject: SUNRPC: dynamically load RPC transport modules on-demand Provide an api to attempt to load any necessary kernel RPC client transport module automatically. By convention, the desired module name is "xprt"+"transport name". For example, when NFS mounting with "-o proto=rdma", attempt to load the "xprtrdma" module. 
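To illustrate the convention (this snippet is not part of the patch), a caller that wants the RDMA transport can simply do

	if (xprt_load_transport("rdma") != 0)
		printk(KERN_WARNING "RPC: transport \"rdma\" not available\n");

which resolves to request_module("xprtrdma") unless an "rdma" transport class has already been registered, in which case it returns 0 without loading anything.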
Signed-off-by: Tom Talpey Cc: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 11fc71d50c1e..2b0d960603b9 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -235,6 +235,7 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 * */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); +int xprt_load_transport(const char *); void xprt_set_retrans_timeout_def(struct rpc_task *task); void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 62098d101a1f..d1afec640394 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,6 +151,37 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); +/** + * xprt_load_transport - load a transport implementation + * @transport_name: transport to load + * + * Returns: + * 0: transport successfully loaded + * -ENOENT: transport module not available + */ +int xprt_load_transport(const char *transport_name) +{ + struct xprt_class *t; + char module_name[sizeof t->name + 5]; + int result; + + result = 0; + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + if (strcmp(t->name, transport_name) == 0) { + spin_unlock(&xprt_list_lock); + goto out; + } + } + spin_unlock(&xprt_list_lock); + strcpy(module_name, "xprt"); + strncat(module_name, transport_name, sizeof t->name); + result = request_module(module_name); +out: + return result; +} +EXPORT_SYMBOL_GPL(xprt_load_transport); + /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport -- cgit v1.2.3-71-gd317 From 76e6eee03353f01bfca707d4dbb1f10a4ee27dc0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Mar 2009 14:35:43 -0600 Subject: cpumask: tsk_cpumask for accessing the struct task_struct's cpus_allowed. This allows us to change the representation (to a dangling bitmap or cpumask_var_t) without breaking all the callers: they can use tsk_cpumask() now and won't see a difference as the changes roll into linux-next. Signed-off-by: Rusty Russell --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8c216e057c94..011db2f4c94c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1419,6 +1419,9 @@ struct task_struct { #endif }; +/* Future-safe accessor for struct task_struct's cpus_allowed. */ +#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed) + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH -- cgit v1.2.3-71-gd317 From 45e575ab9bfada5a5ef1b6174f8e749b1ecf0864 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Mar 2009 14:35:44 -0600 Subject: cpumask: mm_cpumask for accessing the struct mm_struct's cpu_vm_mask. This allows us to change the representation (to a dangling bitmap or cpumask_var_t) without breaking all the callers: they can use mm_cpumask() now and won't see a difference as the changes roll into linux-next. 
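An illustrative caller conversion (hypothetical, not taken from either patch) shows the intent of the accessor:

	/* before: hard-codes the current representation of cpu_vm_mask */
	cpumask_clear_cpu(cpu, &mm->cpu_vm_mask);

	/* after: goes through the accessor, so cpu_vm_mask can later become
	 * a dangling bitmap or a cpumask_var_t without touching this caller */
	cpumask_clear_cpu(cpu, mm_cpumask(mm));

The tsk_cpumask() macro introduced in the previous patch plays the same role for task_struct's cpus_allowed.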
Signed-off-by: Rusty Russell --- include/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 92915e81443f..d84feb7bdbf0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -276,4 +276,7 @@ struct mm_struct { #endif }; +/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ +#define mm_cpumask(mm) (&(mm)->cpu_vm_mask) + #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3-71-gd317 From f21cfb258df6dd3ea0b3e56d75c7e994edb81b35 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 12 Mar 2009 21:05:42 +0900 Subject: irq: add remove_irq() for freeing of setup_irq() irqs Impact: add new API This patch adds a remove_irq() function for releasing interrupts requested with setup_irq(). Without this patch we have no way of releasing such interrupts since free_irq() today tries to kfree() the irqaction passed with setup_irq(). Signed-off-by: Magnus Damm LKML-Reference: <20090312120542.2926.56609.sendpatchset@rx1.opensource.se> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 1 + kernel/irq/manage.c | 39 ++++++++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index f899b502f186..56f9988362ec 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -236,6 +236,7 @@ typedef struct irq_desc irq_desc_t; #include extern int setup_irq(unsigned int irq, struct irqaction *new); +extern struct irqaction *remove_irq(unsigned int irq, void *dev_id); #ifdef CONFIG_GENERIC_HARDIRQS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 52ee17135092..8b069a7046e9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -551,20 +551,14 @@ int setup_irq(unsigned int irq, struct irqaction *act) } /** - * free_irq - free an interrupt + * remove_irq - free an interrupt * @irq: Interrupt line to free * @dev_id: Device identity to free * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. - * - * This function must not be called from interrupt context. + * Used to remove interrupts statically setup by the early boot process. */ -void free_irq(unsigned int irq, void *dev_id) + +struct irqaction *remove_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; @@ -573,7 +567,7 @@ void free_irq(unsigned int irq, void *dev_id) WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) - return; + return NULL; spin_lock_irqsave(&desc->lock, flags); @@ -589,7 +583,7 @@ void free_irq(unsigned int irq, void *dev_id) WARN(1, "Trying to free already-free IRQ %d\n", irq); spin_unlock_irqrestore(&desc->lock, flags); - return; + return NULL; } if (action->dev_id == dev_id) @@ -636,7 +630,26 @@ void free_irq(unsigned int irq, void *dev_id) local_irq_restore(flags); } #endif - kfree(action); + return action; +} + +/** + * free_irq - free an interrupt allocated with request_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove an interrupt handler. The handler is removed and if the + * interrupt line is no longer in use by any driver it is disabled. 
+ * On a shared IRQ the caller must ensure the interrupt is disabled + * on the card it drives before calling this function. The function + * does not return until any executing interrupts for this IRQ + * have completed. + * + * This function must not be called from interrupt context. + */ +void free_irq(unsigned int irq, void *dev_id) +{ + kfree(remove_irq(irq, dev_id)); } EXPORT_SYMBOL(free_irq); -- cgit v1.2.3-71-gd317 From cbf94f06824780183e4bba165c7c29d5c7bd9a51 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 12 Mar 2009 21:05:51 +0900 Subject: irq: match remove_irq() args with setup_irq() Modify remove_irq() to match setup_irq(). Signed-off-by: Magnus Damm LKML-Reference: <20090312120551.2926.43942.sendpatchset@rx1.opensource.se> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 2 +- kernel/irq/manage.c | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 56f9988362ec..737eafbc1f3d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -236,7 +236,7 @@ typedef struct irq_desc irq_desc_t; #include extern int setup_irq(unsigned int irq, struct irqaction *new); -extern struct irqaction *remove_irq(unsigned int irq, void *dev_id); +extern void remove_irq(unsigned int irq, struct irqaction *act); #ifdef CONFIG_GENERIC_HARDIRQS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8b069a7046e9..fc16570c9b46 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -550,15 +550,11 @@ int setup_irq(unsigned int irq, struct irqaction *act) return __setup_irq(irq, desc, act); } -/** - * remove_irq - free an interrupt - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Used to remove interrupts statically setup by the early boot process. + /* + * Internal function to unregister an irqaction - used to free + * regular and special interrupts that are part of the architecture. */ - -struct irqaction *remove_irq(unsigned int irq, void *dev_id) +static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; @@ -633,6 +629,18 @@ struct irqaction *remove_irq(unsigned int irq, void *dev_id) return action; } +/** + * remove_irq - free an interrupt + * @irq: Interrupt line to free + * @act: irqaction for the interrupt + * + * Used to remove interrupts statically setup by the early boot process. + */ +void remove_irq(unsigned int irq, struct irqaction *act) +{ + __free_irq(irq, act->dev_id); +} + /** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free @@ -649,7 +657,7 @@ struct irqaction *remove_irq(unsigned int irq, void *dev_id) */ void free_irq(unsigned int irq, void *dev_id) { - kfree(remove_irq(irq, dev_id)); + kfree(__free_irq(irq, dev_id)); } EXPORT_SYMBOL(free_irq); -- cgit v1.2.3-71-gd317 From 71969fd9e2c523d22bf1742eb31f1562247710eb Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 25 Jan 2009 16:50:02 +0200 Subject: [SCSI] major.h: char-major number for OSD device driver Allocate major 260 for osd. 
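For illustration only (a hypothetical sketch, not the open-osd driver's actual registration code), a character driver would claim the reserved major roughly like this:

	#include <linux/module.h>
	#include <linux/init.h>
	#include <linux/fs.h>
	#include <linux/major.h>

	static const struct file_operations osd_example_fops = {
		.owner	= THIS_MODULE,
	};

	static int __init osd_example_init(void)
	{
		/* claim char major 260, reserved below as SCSI_OSD_MAJOR */
		return register_chrdev(SCSI_OSD_MAJOR, "osd",
				       &osd_example_fops);
	}
	module_init(osd_example_init);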
Signed-off-by: Boaz Harrosh CC: Torben Mathiasen Signed-off-by: James Bottomley --- Documentation/devices.txt | 6 ++++++ include/linux/major.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 2be08240ee80..62254d4510c6 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -3145,6 +3145,12 @@ Your cooperation is appreciated. 1 = /dev/blockrom1 Second ROM card's translation layer interface ... +260 char OSD (Object-based-device) SCSI Device + 0 = /dev/osd0 First OSD Device + 1 = /dev/osd1 Second OSD Device + ... + 255 = /dev/osd255 256th OSD Device + **** ADDITIONAL /dev DIRECTORY ENTRIES This section details additional entries that should or may exist in diff --git a/include/linux/major.h b/include/linux/major.h index 88249452b935..058ec15dd060 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -171,5 +171,6 @@ #define VIOTAPE_MAJOR 230 #define BLOCK_EXT_MAJOR 259 +#define SCSI_OSD_MAJOR 260 /* open-osd's OSD scsi device */ #endif -- cgit v1.2.3-71-gd317 From 446c92b2901bedb3725d29b4e73def8aba623ffc Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 12 Mar 2009 18:03:16 +0100 Subject: [ARM] 5421/1: ftrace: fix crash due to tracing of __naked functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a fix for the following crash observed in 2.6.29-rc3: http://lkml.org/lkml/2009/1/29/150 On ARM it doesn't make sense to trace a naked function because then mcount is called without stack and frame pointer being set up and there is no chance to restore the lr register to the value before mcount was called. Reported-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke Cc: Abhishek Sagar Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Uwe Kleine-König Signed-off-by: Russell King --- arch/arm/kernel/fiq.c | 4 ++-- arch/arm/mm/copypage-feroceon.c | 2 +- arch/arm/mm/copypage-v3.c | 2 +- arch/arm/mm/copypage-v4mc.c | 2 +- arch/arm/mm/copypage-v4wb.c | 2 +- arch/arm/mm/copypage-v4wt.c | 2 +- arch/arm/mm/copypage-xsc3.c | 2 +- arch/arm/mm/copypage-xscale.c | 2 +- include/linux/compiler-gcc.h | 10 +++++++++- 9 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c index 36f81d967979..6ff7919613d7 100644 --- a/arch/arm/kernel/fiq.c +++ b/arch/arm/kernel/fiq.c @@ -88,7 +88,7 @@ void set_fiq_handler(void *start, unsigned int length) * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. 
*/ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void __naked set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -106,7 +106,7 @@ void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void __naked get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( diff --git a/arch/arm/mm/copypage-feroceon.c b/arch/arm/mm/copypage-feroceon.c index c3ba6a94da0c..70997d5bee2d 100644 --- a/arch/arm/mm/copypage-feroceon.c +++ b/arch/arm/mm/copypage-feroceon.c @@ -13,7 +13,7 @@ #include #include -static void __attribute__((naked)) +static void __naked feroceon_copy_user_page(void *kto, const void *kfrom) { asm("\ diff --git a/arch/arm/mm/copypage-v3.c b/arch/arm/mm/copypage-v3.c index 70ed96c8af8e..de9c06854ad7 100644 --- a/arch/arm/mm/copypage-v3.c +++ b/arch/arm/mm/copypage-v3.c @@ -15,7 +15,7 @@ * * FIXME: do we need to handle cache stuff... */ -static void __attribute__((naked)) +static void __naked v3_copy_user_page(void *kto, const void *kfrom) { asm("\n\ diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index 1601698b9800..7370a7142b04 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_highpage that does the right thing. */ -static void __attribute__((naked)) +static void __naked mc_copy_user_page(void *from, void *to) { asm volatile( diff --git a/arch/arm/mm/copypage-v4wb.c b/arch/arm/mm/copypage-v4wb.c index 3ec93dab7656..9ab098414227 100644 --- a/arch/arm/mm/copypage-v4wb.c +++ b/arch/arm/mm/copypage-v4wb.c @@ -22,7 +22,7 @@ * instruction. If your processor does not supply this, you have to write your * own copy_user_highpage that does the right thing. */ -static void __attribute__((naked)) +static void __naked v4wb_copy_user_page(void *kto, const void *kfrom) { asm("\ diff --git a/arch/arm/mm/copypage-v4wt.c b/arch/arm/mm/copypage-v4wt.c index 0f1188efae45..300efafd6643 100644 --- a/arch/arm/mm/copypage-v4wt.c +++ b/arch/arm/mm/copypage-v4wt.c @@ -20,7 +20,7 @@ * dirty data in the cache. However, we do have to ensure that * subsequent reads are up to date. */ -static void __attribute__((naked)) +static void __naked v4wt_copy_user_page(void *kto, const void *kfrom) { asm("\ diff --git a/arch/arm/mm/copypage-xsc3.c b/arch/arm/mm/copypage-xsc3.c index 39a994542cad..bc4525f5ab23 100644 --- a/arch/arm/mm/copypage-xsc3.c +++ b/arch/arm/mm/copypage-xsc3.c @@ -29,7 +29,7 @@ * if we eventually end up using our copied page. * */ -static void __attribute__((naked)) +static void __naked xsc3_mc_copy_user_page(void *kto, const void *kfrom) { asm("\ diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index d18f2397ee2d..76824d3e966a 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. 
*/ -static void __attribute__((naked)) +static void __naked mc_copy_user_page(void *from, void *to) { /* diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 1514d534deeb..a3ed7cb8ca34 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -52,7 +52,15 @@ #define __deprecated __attribute__((deprecated)) #define __packed __attribute__((packed)) #define __weak __attribute__((weak)) -#define __naked __attribute__((naked)) + +/* + * it doesn't make sense on ARM (currently the only user of __naked) to trace + * naked functions because then mcount is called without stack and frame pointer + * being set up and there is no chance to restore the lr register to the value + * before mcount was called. + */ +#define __naked __attribute__((naked)) notrace + #define __noreturn __attribute__((noreturn)) /* -- cgit v1.2.3-71-gd317 From d820ac4c2fa881079e6b689d2098adce337558ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 01:30:40 +0100 Subject: locking: rename trace_softirq_[enter|exit] => lockdep_softirq_[enter|exit] Impact: cleanup The naming clashes with upcoming softirq tracepoints, so rename the APIs to lockdep_*(). Requested-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 8 ++++---- kernel/softirq.c | 4 ++-- lib/locking-selftest.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 74bde13224c9..b02a3f1d46a0 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -24,8 +24,8 @@ # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) # define trace_hardirq_enter() do { current->hardirq_context++; } while (0) # define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define trace_softirq_enter() do { current->softirq_context++; } while (0) -# define trace_softirq_exit() do { current->softirq_context--; } while (0) +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) @@ -38,8 +38,8 @@ # define trace_softirqs_enabled(p) 0 # define trace_hardirq_enter() do { } while (0) # define trace_hardirq_exit() do { } while (0) -# define trace_softirq_enter() do { } while (0) -# define trace_softirq_exit() do { } while (0) +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) # define INIT_TRACE_IRQFLAGS #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 9041ea7948fe..08a030f85416 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void) account_system_vtime(current); __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); + lockdep_softirq_enter(); cpu = smp_processor_id(); restart: @@ -220,7 +220,7 @@ restart: if (pending) wakeup_softirqd(); - trace_softirq_exit(); + lockdep_softirq_exit(); account_system_vtime(current); _local_bh_enable(); diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 280332c1827c..619313ed6c46 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -157,11 +157,11 @@ static void init_shared_classes(void) #define SOFTIRQ_ENTER() \ local_bh_disable(); \ local_irq_disable(); \ - trace_softirq_enter(); \ + lockdep_softirq_enter(); \ WARN_ON(!in_softirq()); #define SOFTIRQ_EXIT() \ - 
trace_softirq_exit(); \ + lockdep_softirq_exit(); \ local_irq_enable(); \ local_bh_enable(); -- cgit v1.2.3-71-gd317 From 48ead02030f849d011259244bb4ea9b985479006 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 12 Mar 2009 18:24:49 +0100 Subject: tracing/core: bring back raw trace_printk for dynamic formats strings Impact: fix callsites with dynamic format strings Since its new binary implementation, trace_printk() internally uses static containers for the format strings on each callsites. But the value is assigned once at build time, which means that it can't take dynamic formats. So this patch unearthes the raw trace_printk implementation for the callers that will need trace_printk to be able to carry these dynamic format strings. The trace_printk() macro will use the appropriate implementation for each callsite. Most of the time however, the binary implementation will still be used. The other impact of this patch is that mmiotrace_printk() will use the old implementation because it calls the low level trace_vprintk and we can't guess here whether the format passed in it is dynamic or not. Some parts of this patch have been written by Steven Rostedt (most notably the part that chooses the appropriate implementation for each callsites). Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 40 +++++++++++------ kernel/trace/trace.c | 85 +++++++++++++++++++++++++++++++++--- kernel/trace/trace.h | 13 +++++- kernel/trace/trace_event_types.h | 11 ++++- kernel/trace/trace_functions_graph.c | 6 +-- kernel/trace/trace_mmiotrace.c | 7 +-- kernel/trace/trace_output.c | 57 +++++++++++++++++++++--- kernel/trace/trace_printk.c | 33 +++++++++++--- 8 files changed, 213 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7742798c9208..1daca3b062bb 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -452,31 +452,45 @@ do { \ #define trace_printk(fmt, args...) \ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ - \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ - \ __trace_printk_check_format(fmt, ##args); \ - __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(_THIS_IP_, fmt, ##args); \ } while (0) +extern int +__trace_bprintk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + extern int __trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement. + */ #define ftrace_vprintk(fmt, vargs) \ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ - \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? 
fmt : NULL; \ \ - __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + } else \ + __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ } while (0) +extern int +__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62a63b2b33dd..dbb077d8a172 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1179,10 +1179,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace) /** - * trace_vprintk - write binary msg to tracing buffer + * trace_vbprintk - write binary msg to tracing buffer * */ -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -1191,7 +1191,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct print_entry *entry; + struct bprint_entry *entry; unsigned long flags; int resched; int cpu, len = 0, size, pc; @@ -1219,7 +1219,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1240,6 +1240,60 @@ out: return len; } +EXPORT_SYMBOL_GPL(trace_vbprintk); + +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + static char trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; + + if (tracing_disabled || tracing_selftest_running) + return 0; + + pc = preempt_count(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + pause_graph_tracing(); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&trace_buf_lock); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + len = min(len, TRACE_BUF_SIZE-1); + trace_buf[len] = 0; + + size = sizeof(*entry) + len + 1; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + + memcpy(&entry->buf, trace_buf, len); + entry->buf[len] = 0; + ring_buffer_unlock_commit(tr->buffer, event); + + out_unlock: + __raw_spin_unlock(&trace_buf_lock); + raw_local_irq_restore(irq_flags); + unpause_graph_tracing(); + out: + preempt_enable_notrace(); + + return len; +} EXPORT_SYMBOL_GPL(trace_vprintk); enum trace_file_type { @@ -1628,6 +1682,22 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_bprintf(s, field->fmt, field->buf); + if (!ret) 
+ return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1637,7 +1707,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_bprintf(s, field->fmt, field->buf); + ret = trace_seq_printf(s, "%s", field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1702,6 +1772,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_bprintk_msg_only(iter); + if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 336324d717f8..cede1ab49d07 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINT, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -117,7 +118,7 @@ struct userstack_entry { /* * trace_printk entry: */ -struct print_entry { +struct bprint_entry { struct trace_entry ent; unsigned long ip; int depth; @@ -125,6 +126,13 @@ struct print_entry { u32 buf[]; }; +struct print_entry { + struct trace_entry ent; + unsigned long ip; + int depth; + char buf[]; +}; + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -286,6 +294,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ @@ -570,6 +579,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern long ns2usecs(cycle_t nsec); extern int +trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args); +extern int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5cca4c978bde..d0907d746425 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, +TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) @@ -112,6 +112,15 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TP_RAW_FMT("%08lx (%d) fmt:%p %s") ); +TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD_ZERO_CHAR(buf) + ), + TP_RAW_FMT("%08lx (%d) fmt:%p %s") +); + TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8566c14b3e9a..4c388607ed67 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ 
-684,7 +684,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct print_entry *trace, struct trace_seq *s, +print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { int i; @@ -781,8 +781,8 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter); } - case TRACE_PRINT: { - struct print_entry *field; + case TRACE_BPRINT: { + struct bprint_entry *field; trace_assign_type(field, entry); return print_graph_comment(field, s, entry, iter); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 23e346a734ca..f095916e477f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,6 +254,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); @@ -261,11 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_bprintf(s, print->fmt, print->buf); + ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 491832af9ba1..ea9d3b410c7a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -832,13 +832,13 @@ static struct trace_event trace_user_stack_event = { .binary = trace_special_bin, }; -/* TRACE_PRINT */ +/* TRACE_BPRINT */ static enum print_line_t -trace_print_print(struct trace_iterator *iter, int flags) +trace_bprint_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct print_entry *field; + struct bprint_entry *field; trace_assign_type(field, entry); @@ -858,9 +858,10 @@ trace_print_print(struct trace_iterator *iter, int flags) } -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t +trace_bprint_raw(struct trace_iterator *iter, int flags) { - struct print_entry *field; + struct bprint_entry *field; struct trace_seq *s = &iter->seq; trace_assign_type(field, iter->ent); @@ -878,12 +879,55 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) } +static struct trace_event trace_bprint_event = { + .type = TRACE_BPRINT, + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; + +/* TRACE_PRINT */ +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags) +{ + struct print_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) + 
goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + static struct trace_event trace_print_event = { - .type = TRACE_PRINT, + .type = TRACE_PRINT, .trace = trace_print_print, .raw = trace_print_raw, }; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -891,6 +935,7 @@ static struct trace_event *events[] __initdata = { &trace_special_event, &trace_stack_event, &trace_user_stack_event, + &trace_bprint_event, &trace_print_event, NULL }; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a50aea22e929..f307a11e2332 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -99,7 +99,7 @@ struct notifier_block module_trace_bprintk_format_nb = { .notifier_call = module_trace_bprintk_format_notify, }; -int __trace_printk(unsigned long ip, const char *fmt, ...) +int __trace_bprintk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -111,13 +111,13 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(__trace_printk); +EXPORT_SYMBOL_GPL(__trace_bprintk); -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) { if (unlikely(!fmt)) return 0; @@ -125,11 +125,34 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; + return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vbprintk); + +int __trace_printk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vprintk); - static __init int init_trace_printk(void) { return register_module_notifier(&module_trace_bprintk_format_nb); -- cgit v1.2.3-71-gd317 From 5d592b44b29a1d73e13d5c9e3426eed843bdc359 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 12 Mar 2009 14:33:36 -0400 Subject: tracing: tracepoints for softirq entry/exit - add softirq-to-name array Create a 'softirq_to_name' array, which is indexed by softirq #, so that we can easily convert between the softirq index # and its name, in order to get more meaningful output messages. LKML-Reference: <20090312183336.GB3352@redhat.com> Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt --- include/linux/interrupt.h | 5 +++++ kernel/softirq.c | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 472f11765f60..9b7e9d743476 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -258,6 +258,11 @@ enum NR_SOFTIRQS }; +/* map softirq index to softirq name. update 'softirq_to_name' in + * kernel/softirq.c when adding a new softirq. + */ +extern char *softirq_to_name[NR_SOFTIRQS]; + /* softirq mask and active fields moved to irq_cpustat_t in * asm/hardirq.h to get better cache usage. 
KAO */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 7571bcb71be4..9f90fdc039f4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -53,6 +53,12 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +char *softirq_to_name[NR_SOFTIRQS] = { + "HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ", + "BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ", + "RCU_SOFTIRQ" +}; + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency @@ -209,9 +215,10 @@ restart: h->action(h); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %p" + printk(KERN_ERR "huh, entered softirq %td %s %p" "with preempt_count %08x," " exited with %08x?\n", h - softirq_vec, + softirq_to_name[h - softirq_vec], h->action, prev_count, preempt_count()); preempt_count() = prev_count; } -- cgit v1.2.3-71-gd317 From 4bb9c5c02153dfc89a6c73a6f32091413805ad7d Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 12 Mar 2009 17:45:27 -0700 Subject: VM, x86, PAT: Change is_linear_pfn_mapping to not use vm_pgoff Impact: fix false positive PAT warnings - also fix VirtalBox hang Use of vma->vm_pgoff to identify the pfnmaps that are fully mapped at mmap time is broken. vm_pgoff is set by generic mmap code even for cases where drivers are setting up the mappings at the fault time. The problem was originally reported here: http://marc.info/?l=linux-kernel&m=123383810628583&w=2 Change is_linear_pfn_mapping logic to overload VM_INSERTPAGE flag along with VM_PFNMAP to mean full PFNMAP setup at mmap time. Problem also tracked at: http://bugzilla.kernel.org/show_bug.cgi?id=12800 Reported-by: Thomas Hellstrom Tested-by: Frans Pop Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha @intel.com> Cc: Nick Piggin Cc: "ebiederm@xmission.com" Cc: # only for 2.6.29.1, not .28 LKML-Reference: <20090313004527.GA7176@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 5 +++-- include/linux/mm.h | 15 +++++++++++++-- mm/memory.c | 6 ++++-- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e0ab173b6974..21bc1f787ae2 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -641,10 +641,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. + * reserve_pfn_range() doesn't support RAM pages. Maintain the current + * behavior with RAM pages by returning success. */ if (is_ram != 0) - return -EINVAL; + return 0; ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) diff --git a/include/linux/mm.h b/include/linux/mm.h index 065cdf8c09fb..3daa05feed9f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,7 +98,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ -#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ +#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it. 
Refer note in VM_PFNMAP_AT_MMAP below */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ @@ -126,6 +126,17 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +/* + * pfnmap vmas that are fully mapped at mmap time (not mapped on fault). + * Used by x86 PAT to identify such PFNMAP mappings and optimize their handling. + * Note VM_INSERTPAGE flag is overloaded here. i.e, + * VM_INSERTPAGE && !VM_PFNMAP implies + * The vma has had "vm_insert_page()" done on it + * VM_INSERTPAGE && VM_PFNMAP implies + * The vma is PFNMAP with full mapping at mmap time + */ +#define VM_PFNMAP_AT_MMAP (VM_INSERTPAGE | VM_PFNMAP) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. @@ -145,7 +156,7 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); + return ((vma->vm_flags & VM_PFNMAP_AT_MMAP) == VM_PFNMAP_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) diff --git a/mm/memory.c b/mm/memory.c index baa999e87cd2..d7df5babcba9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1665,9 +1665,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". */ - if (addr == vma->vm_start && end == vma->vm_end) + if (addr == vma->vm_start && end == vma->vm_end) { vma->vm_pgoff = pfn; - else if (is_cow_mapping(vma->vm_flags)) + vma->vm_flags |= VM_PFNMAP_AT_MMAP; + } else if (is_cow_mapping(vma->vm_flags)) return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; @@ -1679,6 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * needed from higher level routine calling unmap_vmas */ vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); + vma->vm_flags &= ~VM_PFNMAP_AT_MMAP; return -EINVAL; } -- cgit v1.2.3-71-gd317 From a70f730282019f487aa33a84e5ac9a5e89c5abd0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:46 +1030 Subject: cpumask: replace node_to_cpumask with cpumask_of_node. Impact: cleanup node_to_cpumask (and the blecherous node_to_cpumask_ptr which contained a declaration) are replaced now everyone implements cpumask_of_node. Signed-off-by: Rusty Russell --- drivers/base/node.c | 2 +- drivers/pci/pci-driver.c | 3 +-- include/linux/topology.h | 6 +----- mm/page_alloc.c | 6 +++--- mm/quicklist.c | 2 +- mm/slab.c | 2 +- mm/vmscan.c | 6 ++++-- net/sunrpc/svc.c | 3 +-- 8 files changed, 13 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index f8f578a71b25..40b809742a1c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -24,7 +24,7 @@ static struct sysdev_class node_class = { static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf) { struct node *node_dev = to_node(dev); - node_to_cpumask_ptr(mask, node_dev->sysdev.id); + const struct cpumask *mask = cpumask_of_node(node_dev->sysdev.id); int len; /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. 
*/ diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 93eac1423585..b522f883d674 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -212,10 +212,9 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, node = dev_to_node(&dev->dev); if (node >= 0) { int cpu; - node_to_cpumask_ptr(nodecpumask, node); get_online_cpus(); - cpu = cpumask_any_and(nodecpumask, cpu_online_mask); + cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else diff --git a/include/linux/topology.h b/include/linux/topology.h index a16b9e06f2e5..16b7d6896ce9 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -38,11 +38,7 @@ #endif #ifndef nr_cpus_node -#define nr_cpus_node(node) \ - ({ \ - node_to_cpumask_ptr(__tmp__, node); \ - cpus_weight(*__tmp__); \ - }) +#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif #define for_each_node_with_cpus(node) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c44ed49ca93..a92b0975b9a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2134,7 +2134,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) int n, val; int min_val = INT_MAX; int best_node = -1; - node_to_cpumask_ptr(tmp, 0); + const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -2155,8 +2155,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) val += (n < node); /* Give preference to headless and unused nodes */ - node_to_cpumask_ptr_next(tmp, n); - if (!cpus_empty(*tmp)) + tmp = cpumask_of_node(n); + if (!cpumask_empty(tmp)) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ diff --git a/mm/quicklist.c b/mm/quicklist.c index 8dbb6805ef35..e66d07d1b4ff 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -29,7 +29,7 @@ static unsigned long max_pages(unsigned long min_pages) int node = numa_node_id(); struct zone *zones = NODE_DATA(node)->node_zones; int num_cpus_on_node; - node_to_cpumask_ptr(cpumask_on_node, node); + const struct cpumask *cpumask_on_node = cpumask_of_node(node); node_free_pages = #ifdef CONFIG_ZONE_DMA diff --git a/mm/slab.c b/mm/slab.c index 4d00855629c4..2daaca0b4541 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1160,7 +1160,7 @@ static void __cpuinit cpuup_canceled(long cpu) struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); - node_to_cpumask_ptr(mask, node); + const struct cpumask *mask = cpumask_of_node(node); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; diff --git a/mm/vmscan.c b/mm/vmscan.c index 6177e3bcd66b..cc6135586b44 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1963,7 +1963,7 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; - node_to_cpumask_ptr(cpumask, pgdat->node_id); + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); @@ -2198,7 +2198,9 @@ static int __devinit cpu_callback(struct notifier_block *nfb, if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { for_each_node_state(nid, N_HIGH_MEMORY) { pg_data_t *pgdat = NODE_DATA(nid); - node_to_cpumask_ptr(mask, pgdat->node_id); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ diff --git 
a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c51fed4d1af1..3bdd5bffaca8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -317,8 +317,7 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) } case SVC_POOL_PERNODE: { - node_to_cpumask_ptr(nodecpumask, node); - set_cpus_allowed_ptr(task, nodecpumask); + set_cpus_allowed_ptr(task, cpumask_of_node(node)); break; } } -- cgit v1.2.3-71-gd317 From ee08c6eccb7d1295516f7cf420fddf7b14e9146f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Mar 2009 05:52:59 +0100 Subject: tracing/ftrace: syscall tracing infrastructure, basics Provide basic callbacks to do syscall tracing. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1236401580-5758-2-git-send-email-fweisbec@gmail.com> [ simplified it to a trace_printk() for now. ] Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 21 ++++++++ kernel/trace/Kconfig | 10 ++++ kernel/trace/Makefile | 1 + kernel/trace/trace.h | 2 + kernel/trace/trace_syscalls.c | 113 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 147 insertions(+) create mode 100644 kernel/trace/trace_syscalls.c (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e1583f2639b0..c146c1021a29 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -503,4 +503,25 @@ static inline void trace_hw_branch_oops(void) {} #endif /* CONFIG_HW_BRANCH_TRACER */ +/* + * A syscall entry in the ftrace syscalls array. + * + * @syscall_nr: syscall number + */ +struct syscall_trace_entry { + int syscall_nr; +}; + +#ifdef CONFIG_FTRACE_SYSCALLS +extern void start_ftrace_syscalls(void); +extern void stop_ftrace_syscalls(void); +extern void ftrace_syscall_enter(struct pt_regs *regs); +extern void ftrace_syscall_exit(struct pt_regs *regs); +#else +static inline void start_ftrace_syscalls(void) { } +static inline void stop_ftrace_syscalls(void) { } +static inline void ftrace_syscall_enter(struct pt_regs *regs) { } +static inline void ftrace_syscall_exit(struct pt_regs *regs) { } +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8e4a2a61cd75..95a0ad191f19 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -34,6 +34,9 @@ config HAVE_FTRACE_MCOUNT_RECORD config HAVE_HW_BRANCH_TRACER bool +config HAVE_FTRACE_SYSCALLS + bool + config TRACER_MAX_TRACE bool @@ -175,6 +178,13 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. +config FTRACE_SYSCALLS + bool "Trace syscalls" + depends on HAVE_FTRACE_SYSCALLS + select TRACING + help + Basic tracer to catch the syscall entry and exit events. 
+ config BOOT_TRACER bool "Trace boot initcalls" select TRACING diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c7a2943796eb..c3feea01c3e0 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -43,5 +43,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c5e1d8865fe4..3d49daae47dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -30,6 +30,8 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_SYSCALL_ENTER, + TRACE_SYSCALL_EXIT, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c new file mode 100644 index 000000000000..66cf97449af3 --- /dev/null +++ b/kernel/trace/trace_syscalls.c @@ -0,0 +1,113 @@ +#include +#include + +#include + +#include "trace_output.h" +#include "trace.h" + +static atomic_t refcount; + +void start_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_inc_return(&refcount) != 1) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_dec(&refcount); +} + +void stop_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_dec_return(&refcount)) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_inc(&refcount); +} + +void ftrace_syscall_enter(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d enter\n", syscall_nr); +} + +void ftrace_syscall_exit(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d exit\n", syscall_nr); +} + +static int init_syscall_tracer(struct trace_array *tr) +{ + start_ftrace_syscalls(); + + return 0; +} + +static void reset_syscall_tracer(struct trace_array *tr) +{ + stop_ftrace_syscalls(); +} + +static struct trace_event syscall_enter_event = { + .type = TRACE_SYSCALL_ENTER, +}; + +static struct trace_event syscall_exit_event = { + .type = TRACE_SYSCALL_EXIT, +}; + +static struct tracer syscall_tracer __read_mostly = { + .name = "syscall", + .init = init_syscall_tracer, + .reset = reset_syscall_tracer +}; + +__init int register_ftrace_syscalls(void) +{ + int ret; + + ret = register_ftrace_event(&syscall_enter_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_enter_event.type); + WARN_ON_ONCE(1); + } + + ret = register_ftrace_event(&syscall_exit_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_exit_event.type); + WARN_ON_ONCE(1); + } + + return register_tracer(&syscall_tracer); +} +device_initcall(register_ftrace_syscalls); -- cgit v1.2.3-71-gd317 From d1dedb52acd98bd5e13e1ff4c4d045d58bbd16fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:14:06 +0100 Subject: panic, smp: provide smp_send_stop() wrapper on UP too Impact: cleanup, no code changed Remove an ugly #ifdef CONFIG_SMP from panic(), by providing 
an smp_send_stop() wrapper on UP too. LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- include/linux/smp.h | 4 +++- kernel/panic.c | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 2d3bcb6b37ff..a69db820eed6 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -38,7 +38,7 @@ int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): - */ + */ /* * stops all CPUs but the current one: @@ -122,6 +122,8 @@ extern unsigned int setup_max_cpus; #else /* !SMP */ +static inline void smp_send_stop(void) { } + /* * These macros fold the SMP functionality into a single CPU system */ diff --git a/kernel/panic.c b/kernel/panic.c index 57fb005de546..ca75e819d0ea 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -85,14 +85,12 @@ NORET_TYPE void panic(const char * fmt, ...) */ crash_kexec(NULL); -#ifdef CONFIG_SMP /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic * situation. */ smp_send_stop(); -#endif atomic_notifier_call_chain(&panic_notifier_list, 0, buf); -- cgit v1.2.3-71-gd317 From e94142a67f8bad494c593f0a07c9fc2fbec98c0e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 13 Mar 2009 17:51:27 +0800 Subject: ftrace: remove struct list_head from struct dyn_ftrace Impact: save memory The struct dyn_ftrace table is very large, this patch will save about 50%. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <49BA2C9F.8020009@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c146c1021a29..9d598bbf28a6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,7 +145,6 @@ enum { }; struct dyn_ftrace { - struct list_head list; unsigned long ip; /* address of mcount call-site */ unsigned long flags; struct dyn_arch_ftrace arch; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf78a4c75c67..90d5729afeff 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -272,7 +272,7 @@ enum { static int ftrace_filtered; -static LIST_HEAD(ftrace_new_addrs); +static struct dyn_ftrace *ftrace_new_addrs; static DEFINE_MUTEX(ftrace_regex_lock); @@ -409,8 +409,8 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - - list_add(&rec->list, &ftrace_new_addrs); + rec->flags = (unsigned long)ftrace_new_addrs; + ftrace_new_addrs = rec; return rec; } @@ -716,19 +716,21 @@ unsigned long ftrace_update_tot_cnt; static int ftrace_update_code(struct module *mod) { - struct dyn_ftrace *p, *t; + struct dyn_ftrace *p; cycle_t start, stop; start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { + while (ftrace_new_addrs) { /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; - list_del_init(&p->list); + p = ftrace_new_addrs; + ftrace_new_addrs = (struct dyn_ftrace *)p->flags; + p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ if (ftrace_code_disable(mod, p)) { -- cgit v1.2.3-71-gd317 From 3dd3d46b78c22503957230ca5981849b7bb29b9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Mar 
2009 21:48:32 +0100 Subject: genirq: remove unused hw_irq_controller typedef hw_irq_controller is unused. Remove the typedef Impact: cleanup Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 737eafbc1f3d..7c07a09931db 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -226,7 +226,6 @@ irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) * Migration helpers for obsolete names, they will go away: */ #define hw_interrupt_type irq_chip -typedef struct irq_chip hw_irq_controller; #define no_irq_type no_irq_chip typedef struct irq_desc irq_desc_t; -- cgit v1.2.3-71-gd317 From bedd30d986a05e32dc3eab874e4b9ed8a38058bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Sep 2008 23:14:27 +0200 Subject: genirq: make irqreturn_t an enum Impact: cleanup Remove the 2.4 compabiliy cruft Signed-off-by: Thomas Gleixner Reviewed-by: Peter Zijlstra --- include/linux/irq.h | 4 ++-- include/linux/irqreturn.h | 28 ++++++++++------------------ 2 files changed, 12 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 7c07a09931db..19770923bcb0 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -280,7 +280,7 @@ static inline int irq_balancing_disabled(unsigned int irq) } /* Handle irq action chains: */ -extern int handle_IRQ_event(unsigned int irq, struct irqaction *action); +extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action); /* * Built-in IRQ handlers for various IRQ types, @@ -325,7 +325,7 @@ static inline void generic_handle_irq(unsigned int irq) /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(unsigned int irq, struct irq_desc *desc, - int action_ret); + irqreturn_t action_ret); /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h index 881883c2009d..c5584ca5b8c9 100644 --- a/include/linux/irqreturn.h +++ b/include/linux/irqreturn.h @@ -1,25 +1,17 @@ -/* irqreturn.h */ #ifndef _LINUX_IRQRETURN_H #define _LINUX_IRQRETURN_H -/* - * For 2.4.x compatibility, 2.4.x can use - * - * typedef void irqreturn_t; - * #define IRQ_NONE - * #define IRQ_HANDLED - * #define IRQ_RETVAL(x) - * - * To mix old-style and new-style irq handler returns. - * - * IRQ_NONE means we didn't handle it. - * IRQ_HANDLED means that we did have a valid interrupt and handled it. - * IRQ_RETVAL(x) selects on the two depending on x being non-zero (for handled) +/** + * enum irqreturn + * @IRQ_NONE interrupt was not from this device + * @IRQ_HANDLED interrupt was handled by this device */ -typedef int irqreturn_t; +enum irqreturn { + IRQ_NONE, + IRQ_HANDLED, +}; -#define IRQ_NONE (0) -#define IRQ_HANDLED (1) -#define IRQ_RETVAL(x) ((x) != 0) +typedef enum irqreturn irqreturn_t; +#define IRQ_RETVAL(x) ((x) != IRQ_NONE) #endif -- cgit v1.2.3-71-gd317 From a9d0a1a38352c4fb8946e73b3e42ba4ada29e733 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Mar 2009 16:58:16 +0100 Subject: genirq: add doc to struct irqaction Impact: documentation struct irqaction is not documented. Add kernel doc comments and add interrupt.h to the genirq docbook. 
Signed-off-by: Thomas Gleixner --- Documentation/DocBook/genericirq.tmpl | 1 + include/linux/interrupt.h | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl index 3a882d9a90a9..c671a0168096 100644 --- a/Documentation/DocBook/genericirq.tmpl +++ b/Documentation/DocBook/genericirq.tmpl @@ -440,6 +440,7 @@ desc->chip->end(); used in the generic IRQ layer. !Iinclude/linux/irq.h +!Iinclude/linux/interrupt.h diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 468e3a25a4a1..91658d076598 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -61,6 +61,17 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); +/** + * struct irqaction - per interrupt action descriptor + * @handler: interrupt handler function + * @flags: flags (see IRQF_* above) + * @mask: no comment as it is useless and about to be removed + * @name: name of the device + * @dev_id: cookie to identify the device + * @next: pointer to the next irqaction for shared interrupts + * @irq: interrupt number + * @dir: pointer to the proc/irq/NN/name entry + */ struct irqaction { irq_handler_t handler; unsigned long flags; -- cgit v1.2.3-71-gd317 From 082edb7bf443eb8eda15b482d16ad9dd8137ad24 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 23:43:37 +1030 Subject: numa, cpumask: move numa_node_id default implementation to topology.h Impact: cleanup, potential bugfix Not sure what changed to expose this, but clearly that numa_node_id() doesn't belong in mmzone.h (the inline in gfp.h is probably overkill, too). In file included from include/linux/topology.h:34, from arch/x86/mm/numa.c:2: /home/rusty/patches-cpumask/linux-2.6/arch/x86/include/asm/topology.h:64:1: warning: "numa_node_id" redefined In file included from include/linux/topology.h:32, from arch/x86/mm/numa.c:2: include/linux/mmzone.h:770:1: warning: this is the location of the previous definition Signed-off-by: Rusty Russell Cc: Mike Travis LKML-Reference: <200903132343.37661.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- include/linux/gfp.h | 1 + include/linux/mmzone.h | 6 ------ include/linux/topology.h | 5 +++++ 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index dd20cd78faa8..0bbc15f54536 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -4,6 +4,7 @@ #include #include #include +#include struct vm_area_struct; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1aca6cebbb78..e6aacf77986a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -764,12 +764,6 @@ extern int numa_zonelist_order_handler(struct ctl_table *, int, extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ -#include -/* Returns the number of the current Node. */ -#ifndef numa_node_id -#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) -#endif - #ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; diff --git a/include/linux/topology.h b/include/linux/topology.h index 16b7d6896ce9..7402c1a27c4f 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -196,4 +196,9 @@ int arch_update_cpu_topology(void); #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +/* Returns the number of the current Node. 
*/ +#ifndef numa_node_id +#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) +#endif + #endif /* _LINUX_TOPOLOGY_H */ -- cgit v1.2.3-71-gd317 From bed1ffca022cc876fb83161d26670e9b5d3cf36b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Mar 2009 15:42:11 +0100 Subject: tracing/syscalls: core infrastructure for syscalls tracing, enhancements Impact: new feature This adds the generic support for syscalls tracing. This is currently exploited through a devoted tracer but other tracing engines can use it. (They just have to play with {start,stop}_ftrace_syscalls() and use the display callbacks unless they want to override them.) The syscalls prototypes definitions are abused here to steal some metadata informations: - syscall name, param types, param names, number of params The syscall addr is not directly saved during this definition because we don't know if its prototype is available in the namespace. But we don't really need it. The arch has just to build a function able to resolve the syscall number to its metadata struct. The current tracer prints the syscall names, parameters names and values (and their types optionally). Currently the value is a raw hex but higher level values diplaying is on my TODO list. Signed-off-by: Frederic Weisbecker LKML-Reference: <1236955332-10133-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 11 ++- include/linux/ftrace.h | 14 +++- include/linux/syscalls.h | 60 +++++++++++++++- kernel/trace/trace.h | 17 +++++ kernel/trace/trace_syscalls.c | 146 +++++++++++++++++++++++++++++++++++--- 5 files changed, 234 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0e0f39be6c8b..d3bc3c86df6a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -77,6 +77,14 @@ #define TRACE_PRINTKS() #endif +#ifdef CONFIG_FTRACE_SYSCALLS +#define TRACE_SYSCALLS() VMLINUX_SYMBOL(__start_syscalls_metadata) = .; \ + *(__syscalls_metadata) \ + VMLINUX_SYMBOL(__stop_syscalls_metadata) = .; +#else +#define TRACE_SYSCALLS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -99,7 +107,8 @@ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ - FTRACE_EVENTS() + FTRACE_EVENTS() \ + TRACE_SYSCALLS() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c146c1021a29..6dc1c652447e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -506,13 +506,21 @@ static inline void trace_hw_branch_oops(void) {} /* * A syscall entry in the ftrace syscalls array. 
* - * @syscall_nr: syscall number + * @name: name of the syscall + * @nb_args: number of parameters it takes + * @types: list of types as strings + * @args: list of args as strings (args[i] matches types[i]) */ -struct syscall_trace_entry { - int syscall_nr; +struct syscall_metadata { + const char *name; + int nb_args; + const char **types; + const char **args; }; #ifdef CONFIG_FTRACE_SYSCALLS +extern void arch_init_ftrace_syscalls(void); +extern struct syscall_metadata *syscall_nr_to_meta(int nr); extern void start_ftrace_syscalls(void); extern void stop_ftrace_syscalls(void); extern void ftrace_syscall_enter(struct pt_regs *regs); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f9f900cfd066..0cff9bb80b02 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -65,6 +65,7 @@ struct old_linux_dirent; #include #include #include +#include #define __SC_DECL1(t1, a1) t1 a1 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) @@ -95,7 +96,46 @@ struct old_linux_dirent; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) +#ifdef CONFIG_FTRACE_SYSCALLS +#define __SC_STR_ADECL1(t, a) #a +#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) +#define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__) +#define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__) +#define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__) +#define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__) + +#define __SC_STR_TDECL1(t, a) #t +#define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__) +#define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__) +#define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__) +#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) +#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) + +#define SYSCALL_METADATA(sname, nb) \ + static const struct syscall_metadata __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("__syscalls_metadata"))) \ + __syscall_meta_##sname = { \ + .name = "sys"#sname, \ + .nb_args = nb, \ + .types = types_##sname, \ + .args = args_##sname, \ + } + +#define SYSCALL_DEFINE0(sname) \ + static const struct syscall_metadata __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("__syscalls_metadata"))) \ + __syscall_meta_##sname = { \ + .name = "sys_"#sname, \ + .nb_args = 0, \ + }; \ + asmlinkage long sys_##sname(void) + +#else #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) +#endif + #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) @@ -117,10 +157,26 @@ struct old_linux_dirent; #endif #endif +#ifdef CONFIG_FTRACE_SYSCALLS +#define SYSCALL_DEFINEx(x, sname, ...) \ + static const char *types_##sname[] = { \ + __SC_STR_TDECL##x(__VA_ARGS__) \ + }; \ + static const char *args_##sname[] = { \ + __SC_STR_ADECL##x(__VA_ARGS__) \ + }; \ + SYSCALL_METADATA(sname, x); \ + __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) +#else +#define SYSCALL_DEFINEx(x, sname, ...) \ + __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) +#endif + #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS #define SYSCALL_DEFINE(name) static inline long SYSC_##name -#define SYSCALL_DEFINEx(x, name, ...) \ + +#define __SYSCALL_DEFINEx(x, name, ...) 
\ asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ @@ -134,7 +190,7 @@ struct old_linux_dirent; #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ #define SYSCALL_DEFINE(name) asmlinkage long sys_##name -#define SYSCALL_DEFINEx(x, name, ...) \ +#define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3d49daae47dc..d80ca0d464d9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -194,6 +194,19 @@ struct kmemtrace_free_entry { const void *ptr; }; +struct syscall_trace_enter { + struct trace_entry ent; + int nr; + unsigned long args[]; +}; + +struct syscall_trace_exit { + struct trace_entry ent; + int nr; + unsigned long ret; +}; + + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -306,6 +319,10 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct syscall_trace_enter, \ + TRACE_SYSCALL_ENTER); \ + IF_ASSIGN(var, ent, struct syscall_trace_exit, \ + TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 66cf97449af3..c72e599230ff 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,5 @@ -#include #include - +#include #include #include "trace_output.h" @@ -8,6 +7,90 @@ static atomic_t refcount; +/* Our two options */ +enum { + TRACE_SYSCALLS_OPT_TYPES = 0x1, +}; + +static struct tracer_opt syscalls_opts[] = { + { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, + { } +}; + +static struct tracer_flags syscalls_flags = { + .val = 0, /* By default: no args types */ + .opts = syscalls_opts +}; + +enum print_line_t +print_syscall_enter(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_enter *trace; + struct syscall_metadata *entry; + int i, ret, syscall; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + goto end; + + ret = trace_seq_printf(s, "%s(", entry->name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + for (i = 0; i < entry->nb_args; i++) { + /* parameter types */ + if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { + ret = trace_seq_printf(s, "%s ", entry->types[i]); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* parameter values */ + ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? 
")" : ","); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + +end: + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; +} + +enum print_line_t +print_syscall_exit(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_exit *trace; + int syscall; + struct syscall_metadata *entry; + int ret; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) { + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; + } + + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace->ret); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + void start_ftrace_syscalls(void) { unsigned long flags; @@ -16,6 +99,7 @@ void start_ftrace_syscalls(void) if (atomic_inc_return(&refcount) != 1) goto out; + arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { @@ -48,20 +132,63 @@ out: void ftrace_syscall_enter(struct pt_regs *regs) { + struct syscall_trace_enter *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + int size; int syscall_nr; + int cpu; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d enter\n", syscall_nr); + cpu = raw_smp_processor_id(); + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, + 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } void ftrace_syscall_exit(struct pt_regs *regs) { + struct syscall_trace_exit *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; int syscall_nr; + int cpu; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d exit\n", syscall_nr); + cpu = raw_smp_processor_id(); + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, + sizeof(*entry), 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + entry->ret = syscall_get_return_value(current, regs); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } static int init_syscall_tracer(struct trace_array *tr) @@ -77,17 +204,20 @@ static void reset_syscall_tracer(struct trace_array *tr) } static struct trace_event syscall_enter_event = { - .type = TRACE_SYSCALL_ENTER, + .type = TRACE_SYSCALL_ENTER, + .trace = print_syscall_enter, }; static struct trace_event syscall_exit_event = { - .type = TRACE_SYSCALL_EXIT, + .type = TRACE_SYSCALL_EXIT, + .trace = print_syscall_exit, }; static struct tracer syscall_tracer __read_mostly = { - .name = "syscall", + .name = "syscall", .init = init_syscall_tracer, - .reset = reset_syscall_tracer + .reset = reset_syscall_tracer, + .flags = &syscalls_flags, }; __init int register_ftrace_syscalls(void) -- cgit v1.2.3-71-gd317 From ead2ceb0ec9f85cff19c43b5cdb2f8a054484431 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 11 Mar 2009 09:49:55 +0000 Subject: Network Drop Monitor: Adding kfree_skb_clean for non-drops and modifying end-of-line points for skbs Signed-off-by: Neil Horman include/linux/skbuff.h | 4 +++- net/core/datagram.c | 2 +- net/core/skbuff.c | 22 
++++++++++++++++++++++ net/ipv4/arp.c | 2 +- net/ipv4/udp.c | 2 +- net/packet/af_packet.c | 2 +- 6 files changed, 29 insertions(+), 5 deletions(-) Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 +++- net/core/datagram.c | 2 +- net/core/skbuff.c | 22 ++++++++++++++++++++++ net/ipv4/arp.c | 2 +- net/ipv4/udp.c | 2 +- net/packet/af_packet.c | 2 +- 6 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1f659e8c2b88..1fbab2ae613c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -421,6 +421,7 @@ extern void skb_dma_unmap(struct device *dev, struct sk_buff *skb, #endif extern void kfree_skb(struct sk_buff *skb); +extern void consume_skb(struct sk_buff *skb); extern void __kfree_skb(struct sk_buff *skb); extern struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int fclone, int node); @@ -459,7 +460,8 @@ extern int skb_to_sgvec(struct sk_buff *skb, extern int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); extern int skb_pad(struct sk_buff *skb, int pad); -#define dev_kfree_skb(a) kfree_skb(a) +#define dev_kfree_skb(a) consume_skb(a) +#define dev_consume_skb(a) kfree_skb_clean(a) extern void skb_over_panic(struct sk_buff *skb, int len, void *here); extern void skb_under_panic(struct sk_buff *skb, int len, diff --git a/net/core/datagram.c b/net/core/datagram.c index 5e2ac0c4b07c..d0de644b378d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -208,7 +208,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { - kfree_skb(skb); + consume_skb(skb); sk_mem_reclaim_partial(sk); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e5e2111a397d..6acbf9e79eb1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,6 +65,7 @@ #include #include +#include #include "kmap_skb.h" @@ -442,10 +443,31 @@ void kfree_skb(struct sk_buff *skb) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + trace_kfree_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(kfree_skb); +/** + * consume_skb - free an skbuff + * @skb: buffer to free + * + * Drop a ref to the buffer and free it if the usage count has hit zero + * Functions identically to kfree_skb, but kfree_skb assumes that the frame + * is being dropped after a failure and notes that + */ +void consume_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + __kfree_skb(skb); +} +EXPORT_SYMBOL(consume_skb); + /** * skb_recycle_check - check if skb can be reused for receive * @skb: buffer diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 3d67d1ffed77..9c220323f353 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -892,7 +892,7 @@ static int arp_process(struct sk_buff *skb) out: if (in_dev) in_dev_put(in_dev); - kfree_skb(skb); + consume_skb(skb); return 0; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4bd178a111d5..05b7abb99f69 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1184,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk = sknext; } while (sknext); } else - kfree_skb(skb); + consume_skb(skb); spin_unlock(&hslot->lock); return 0; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d8cc006fac45..74776de523ec 100644 --- a/net/packet/af_packet.c +++ 
b/net/packet/af_packet.c @@ -584,7 +584,7 @@ drop_n_restore: skb->len = skb_len; } drop: - kfree_skb(skb); + consume_skb(skb); return 0; } -- cgit v1.2.3-71-gd317 From 9a8afc8d3962f3ed26fd6b56db34133860ed1e72 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 11 Mar 2009 09:51:26 +0000 Subject: Network Drop Monitor: Adding drop monitor implementation & Netlink protocol Signed-off-by: Neil Horman include/linux/net_dropmon.h | 56 +++++++++ net/core/drop_monitor.c | 263 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) Signed-off-by: David S. Miller --- include/linux/net_dropmon.h | 56 ++++++++++ net/core/drop_monitor.c | 263 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100644 include/linux/net_dropmon.h create mode 100644 net/core/drop_monitor.c (limited to 'include/linux') diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h new file mode 100644 index 000000000000..0217fb81a630 --- /dev/null +++ b/include/linux/net_dropmon.h @@ -0,0 +1,56 @@ +#ifndef __NET_DROPMON_H +#define __NET_DROPMON_H + +#include + +struct net_dm_drop_point { + __u8 pc[8]; + __u32 count; +}; + +#define NET_DM_CFG_VERSION 0 +#define NET_DM_CFG_ALERT_COUNT 1 +#define NET_DM_CFG_ALERT_DELAY 2 +#define NET_DM_CFG_MAX 3 + +struct net_dm_config_entry { + __u32 type; + __u64 data __attribute__((aligned(8))); +}; + +struct net_dm_config_msg { + __u32 entries; + struct net_dm_config_entry options[0]; +}; + +struct net_dm_alert_msg { + __u32 entries; + struct net_dm_drop_point points[0]; +}; + +struct net_dm_user_msg { + union { + struct net_dm_config_msg user; + struct net_dm_alert_msg alert; + } u; +}; + + +/* These are the netlink message types for this protocol */ + +enum { + NET_DM_CMD_UNSPEC = 0, + NET_DM_CMD_ALERT, + NET_DM_CMD_CONFIG, + NET_DM_CMD_START, + NET_DM_CMD_STOP, + _NET_DM_CMD_MAX, +}; + +#define NET_DM_CMD_MAX (_NET_DM_CMD_MAX - 1) + +/* + * Our group identifiers + */ +#define NET_DM_GRP_ALERT 1 +#endif diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c new file mode 100644 index 000000000000..9fd0dc3cca99 --- /dev/null +++ b/net/core/drop_monitor.c @@ -0,0 +1,263 @@ +/* + * Monitoring code for network dropped packet alerts + * + * Copyright (C) 2009 Neil Horman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#define TRACE_ON 1 +#define TRACE_OFF 0 + +static void send_dm_alert(struct work_struct *unused); + + +/* + * Globals, our netlink socket pointer + * and the work handle that will send up + * netlink alerts + */ +struct sock *dm_sock; + +struct per_cpu_dm_data { + struct work_struct dm_alert_work; + struct sk_buff *skb; + atomic_t dm_hit_count; + struct timer_list send_timer; +}; + +static struct genl_family net_drop_monitor_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "NET_DM", + .version = 1, + .maxattr = NET_DM_CMD_MAX, +}; + +static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); + +static int dm_hit_limit = 64; +static int dm_delay = 1; + + +static void reset_per_cpu_data(struct per_cpu_dm_data *data) +{ + size_t al; + struct net_dm_alert_msg *msg; + + al = sizeof(struct net_dm_alert_msg); + al += dm_hit_limit * sizeof(struct net_dm_drop_point); + data->skb = genlmsg_new(al, GFP_KERNEL); + genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + msg = __nla_reserve_nohdr(data->skb, 
sizeof(struct net_dm_alert_msg)); + memset(msg, 0, al); + atomic_set(&data->dm_hit_count, dm_hit_limit); +} + +static void send_dm_alert(struct work_struct *unused) +{ + struct sk_buff *skb; + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + /* + * Grab the skb we're about to send + */ + skb = data->skb; + + /* + * Replace it with a new one + */ + reset_per_cpu_data(data); + + /* + * Ship it! + */ + genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); + +} + +/* + * This is the timer function to delay the sending of an alert + * in the event that more drops will arrive during the + * hysteresis period. Note that it operates under the timer interrupt + * so we don't need to disable preemption here + */ +static void sched_send_work(unsigned long unused) +{ + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + schedule_work(&data->dm_alert_work); +} + +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + struct net_dm_alert_msg *msg; + struct nlmsghdr *nlh; + int i; + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + + if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) { + /* + * we're already at zero, discard this hit + */ + goto out; + } + + nlh = (struct nlmsghdr *)data->skb->data; + msg = genlmsg_data(nlmsg_data(nlh)); + for (i = 0; i < msg->entries; i++) { + if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { + msg->points[i].count++; + goto out; + } + } + + /* + * We need to create a new entry + */ + __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point)); + memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); + msg->points[msg->entries].count = 1; + msg->entries++; + + if (!timer_pending(&data->send_timer)) { + data->send_timer.expires = jiffies + dm_delay * HZ; + add_timer_on(&data->send_timer, smp_processor_id()); + } + +out: + return; +} + +static int set_all_monitor_traces(int state) +{ + int rc = 0; + + switch (state) { + case TRACE_ON: + rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + break; + case TRACE_OFF: + rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + + tracepoint_synchronize_unregister(); + break; + default: + rc = 1; + break; + } + + if (rc) + return -EINPROGRESS; + return rc; +} + + +static int net_dm_cmd_config(struct sk_buff *skb, + struct genl_info *info) +{ + return -ENOTSUPP; +} + +static int net_dm_cmd_trace(struct sk_buff *skb, + struct genl_info *info) +{ + switch (info->genlhdr->cmd) { + case NET_DM_CMD_START: + return set_all_monitor_traces(TRACE_ON); + break; + case NET_DM_CMD_STOP: + return set_all_monitor_traces(TRACE_OFF); + break; + } + + return -ENOTSUPP; +} + + +static struct genl_ops dropmon_ops[] = { + { + .cmd = NET_DM_CMD_CONFIG, + .doit = net_dm_cmd_config, + }, + { + .cmd = NET_DM_CMD_START, + .doit = net_dm_cmd_trace, + }, + { + .cmd = NET_DM_CMD_STOP, + .doit = net_dm_cmd_trace, + }, +}; + +static int __init init_net_drop_monitor(void) +{ + int cpu; + int rc, i, ret; + struct per_cpu_dm_data *data; + printk(KERN_INFO "Initalizing network drop monitor service\n"); + + if (sizeof(void *) > 8) { + printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); + return -ENOSPC; + } + + if (genl_register_family(&net_drop_monitor_family) < 0) { + printk(KERN_ERR "Could not create drop monitor netlink family\n"); + return -EFAULT; + } + + rc = -EFAULT; + + for (i = 0; i < ARRAY_SIZE(dropmon_ops); i++) { + ret = genl_register_ops(&net_drop_monitor_family, + &dropmon_ops[i]); + if (ret) { + printk(KERN_CRIT "failed to 
register operation %d\n", + dropmon_ops[i].cmd); + goto out_unreg; + } + } + + rc = 0; + + for_each_present_cpu(cpu) { + data = &per_cpu(dm_cpu_data, cpu); + reset_per_cpu_data(data); + INIT_WORK(&data->dm_alert_work, send_dm_alert); + init_timer(&data->send_timer); + data->send_timer.data = cpu; + data->send_timer.function = sched_send_work; + } + goto out; + +out_unreg: + genl_unregister_family(&net_drop_monitor_family); +out: + return rc; +} + +late_initcall(init_net_drop_monitor); -- cgit v1.2.3-71-gd317 From 273ae44b9cb9443e0b5265cdc99f127ddb95c8db Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 11 Mar 2009 09:53:16 +0000 Subject: Network Drop Monitor: Adding Build changes to enable drop monitor Network Drop Monitor: Adding Build changes to enable drop monitor Signed-off-by: Neil Horman include/linux/Kbuild | 1 + net/Kconfig | 11 +++++++++++ net/core/Makefile | 1 + 3 files changed, 13 insertions(+) Signed-off-by: David S. Miller --- include/linux/Kbuild | 1 + net/Kconfig | 11 +++++++++++ net/core/Makefile | 1 + 3 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 106c3ba50844..e9581fd9fb66 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -115,6 +115,7 @@ header-y += mqueue.h header-y += mtio.h header-y += ncp_no.h header-y += neighbour.h +header-y += net_dropmon.h header-y += netfilter_arp.h header-y += netrom.h header-y += nfs2.h diff --git a/net/Kconfig b/net/Kconfig index 6b39ede3b1b1..c9fdcd7e71ea 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -222,6 +222,17 @@ config NET_TCPPROBE To compile this code as a module, choose M here: the module will be called tcp_probe. +config NET_DROP_MONITOR + boolean "Network packet drop alerting service" + depends on INET && EXPERIMENTAL && TRACEPOINTS + ---help--- + This feature provides an alerting service to userspace in the + event that packets are discarded in the network stack. Alerts + are broadcast via netlink socket to any listening user space + process. If you don't need network drop alerts, or if you are ok + just checking the various proc files and other utilities for + drop statistics, say N here. + endmenu endmenu diff --git a/net/core/Makefile b/net/core/Makefile index d47092bc525c..796f46eece5f 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -18,4 +18,5 @@ obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o +obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o -- cgit v1.2.3-71-gd317 From 211c738d86f3f423f1b218ab3a356c9538e38047 Mon Sep 17 00:00:00 2001 From: Yi Zou Date: Fri, 27 Feb 2009 14:06:37 -0800 Subject: [SCSI] net, fcoe: add ETH_P_FCOE for Fibre Channel over Ethernet (FCoE) This adds the eth type ETH_P_FCOE for Fibre Channel over Ethernet (FCoE); consequently, the ETH_P_FCOE definition is removed from fc_fcoe.h and the fcoe skb->protocol is now set to ETH_P_FCOE.
Signed-off-by: Yi Zou Acked-by: David Miller Signed-off-by: James Bottomley --- drivers/scsi/fcoe/libfcoe.c | 2 +- include/linux/if_ether.h | 1 + include/scsi/fc/fc_fcoe.h | 7 ------- 3 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/fcoe/libfcoe.c b/drivers/scsi/fcoe/libfcoe.c index 5548bf3bb58b..a99a42807b38 100644 --- a/drivers/scsi/fcoe/libfcoe.c +++ b/drivers/scsi/fcoe/libfcoe.c @@ -460,7 +460,7 @@ int fcoe_xmit(struct fc_lport *lp, struct fc_frame *fp) skb_reset_mac_header(skb); skb_reset_network_header(skb); skb->mac_len = elen; - skb->protocol = htons(ETH_P_802_3); + skb->protocol = htons(ETH_P_FCOE); skb->dev = fc->real_dev; /* fill up mac and fcoe headers */ diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index 7f3c735f422b..59d197cb4851 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -78,6 +78,7 @@ #define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */ #define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ #define ETH_P_TIPC 0x88CA /* TIPC */ +#define ETH_P_FCOE 0x8906 /* Fibre Channel over Ethernet */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ /* diff --git a/include/scsi/fc/fc_fcoe.h b/include/scsi/fc/fc_fcoe.h index f271d9cc0fc2..ccb3dbe90463 100644 --- a/include/scsi/fc/fc_fcoe.h +++ b/include/scsi/fc/fc_fcoe.h @@ -24,13 +24,6 @@ * FCoE - Fibre Channel over Ethernet. */ -/* - * The FCoE ethertype eventually goes in net/if_ether.h. - */ -#ifndef ETH_P_FCOE -#define ETH_P_FCOE 0x8906 /* FCOE ether type */ -#endif - /* * FC_FCOE_OUI hasn't been standardized yet. XXX TBD. */ -- cgit v1.2.3-71-gd317 From 43eb99c5b349b188f82725652f3d1018c619d682 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Fri, 27 Feb 2009 14:06:43 -0800 Subject: [SCSI] net: reclaim 8 upper bits of the netdev->features from GSO Reclaim 8 upper bits of netdev->features from GSO. Signed-off-by: Chris Leech Signed-off-by: Yi Zou Acked-by: David Miller Signed-off-by: James Bottomley --- drivers/net/xen-netfront.c | 2 +- include/linux/netdevice.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index cd6184ee08ee..2ce536fcd209 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1511,7 +1511,7 @@ static int xennet_set_tso(struct net_device *dev, u32 data) static void xennet_set_features(struct net_device *dev) { /* Turn off all GSO bits except ROBUST. */ - dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1; + dev->features &= ~NETIF_F_GSO_MASK; dev->features |= NETIF_F_GSO_ROBUST; xennet_set_sg(dev, 0); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ec54785d34f9..c8238d9ba376 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -652,7 +652,7 @@ struct net_device /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 -#define NETIF_F_GSO_MASK 0xffff0000 +#define NETIF_F_GSO_MASK 0x00ff0000 #define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT) #define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT) #define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT) -- cgit v1.2.3-71-gd317 From 01d5b2fca1fa58ed5039239fd531e9f658971ace Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Fri, 27 Feb 2009 14:06:49 -0800 Subject: [SCSI] net: define feature flags for FCoE offloads Define feature flags for FCoE offloads. 
Signed-off-by: Chris Leech Signed-off-by: Yi Zou Acked-by: David Miller Signed-off-by: James Bottomley --- include/linux/netdevice.h | 3 +++ include/linux/skbuff.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c8238d9ba376..5c405571cb60 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -650,6 +650,8 @@ struct net_device #define NETIF_F_GRO 16384 /* Generic receive offload */ #define NETIF_F_LRO 32768 /* large receive offload */ +#define NETIF_F_FCOE_CRC (1 << 24) /* FCoE CRC32 */ + /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 #define NETIF_F_GSO_MASK 0x00ff0000 @@ -658,6 +660,7 @@ struct net_device #define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT) #define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT) #define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT) +#define NETIF_F_FSO (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT) /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9dcf956ad18a..02adea2099a7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -188,6 +188,8 @@ enum { SKB_GSO_TCP_ECN = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, + + SKB_GSO_FCOE = 1 << 5, }; #if BITS_PER_LONG > 32 -- cgit v1.2.3-71-gd317 From 4d288d5767f853bfca25adc7b6030dc95518cb2e Mon Sep 17 00:00:00 2001 From: Yi Zou Date: Fri, 27 Feb 2009 14:06:59 -0800 Subject: [SCSI] net: add FCoE offload support through net_device This adds support to provide Fiber Channel over Ethernet (FCoE) offload through net_device's net_device_ops struct. The offload through net_device for FCoE is enabled in kernel as built-in or module driver. Signed-off-by: Yi Zou Acked-by: David Miller Signed-off-by: James Bottomley --- include/linux/netdevice.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5c405571cb60..7ed49f5335b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -582,6 +582,14 @@ struct net_device_ops { #define HAVE_NETDEV_POLL void (*ndo_poll_controller)(struct net_device *dev); #endif +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) + int (*ndo_fcoe_ddp_setup)(struct net_device *dev, + u16 xid, + struct scatterlist *sgl, + unsigned int sgc); + int (*ndo_fcoe_ddp_done)(struct net_device *dev, + u16 xid); +#endif }; /* @@ -843,6 +851,11 @@ struct net_device struct dcbnl_rtnl_ops *dcbnl_ops; #endif +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) + /* max exchange id for FCoE LRO by ddp */ + unsigned int fcoe_ddp_xid; +#endif + #ifdef CONFIG_COMPAT_NET_DEV_OPS struct { int (*init)(struct net_device *dev); -- cgit v1.2.3-71-gd317 From 5d82720a7f41f0c877e026c7d17e3bf20ccdbae0 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 13 Mar 2009 21:16:13 +0100 Subject: ide: save the returned value of dma_map_sg dma_map_sg could return a value different to 'nents' argument of dma_map_sg so the ide stack needs to save it for the later usage (e.g. for_each_sg). The ide stack also needs to save the original sg_nents value for pci_unmap_sg. 
Signed-off-by: FUJITA Tomonori [bart: backport to Linus' tree] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-dma.c | 12 +++++++++--- include/linux/ide.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 72ebab0bc755..059c90bb5ad2 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -128,6 +128,7 @@ int ide_build_sglist(ide_drive_t *drive, struct request *rq) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; + int i; ide_map_sg(drive, rq); @@ -136,8 +137,13 @@ int ide_build_sglist(ide_drive_t *drive, struct request *rq) else hwif->sg_dma_direction = DMA_TO_DEVICE; - return dma_map_sg(hwif->dev, sg, hwif->sg_nents, - hwif->sg_dma_direction); + i = dma_map_sg(hwif->dev, sg, hwif->sg_nents, hwif->sg_dma_direction); + if (i) { + hwif->orig_sg_nents = hwif->sg_nents; + hwif->sg_nents = i; + } + + return i; } EXPORT_SYMBOL_GPL(ide_build_sglist); @@ -156,7 +162,7 @@ void ide_destroy_dmatable(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; - dma_unmap_sg(hwif->dev, hwif->sg_table, hwif->sg_nents, + dma_unmap_sg(hwif->dev, hwif->sg_table, hwif->orig_sg_nents, hwif->sg_dma_direction); } EXPORT_SYMBOL_GPL(ide_destroy_dmatable); diff --git a/include/linux/ide.h b/include/linux/ide.h index e0cedfe9fad4..25087aead657 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -797,6 +797,7 @@ typedef struct hwif_s { struct scatterlist *sg_table; int sg_max_nents; /* Maximum number of entries in it */ int sg_nents; /* Current number of entries in it */ + int orig_sg_nents; int sg_dma_direction; /* dma transfer direction */ /* data phase of the active command (currently only valid for PIO/DMA) */ -- cgit v1.2.3-71-gd317 From 4ab3b73f85ca2e99d9dbdb55ac13e57327a7e915 Mon Sep 17 00:00:00 2001 From: Douglas Gilbert Date: Mon, 9 Mar 2009 10:51:38 -0400 Subject: [SCSI] bsg: add linux/types.h include to bsg.h Since bsg.h has recently been added to the list of kernel headers that should be exported to the user space, this attachment makes bsg.h more user space "friendly". Specifically autotools dislike headers that don't compile freestanding and bsg.h's use of __u32 types (and friends) are not standard C (C90 or C99). The inclusion of linux/types.h fixes that. 
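As an illustration of the freestanding-compile point, a minimal sketch of a hypothetical user-space translation unit that includes only bsg.h; without the linux/types.h include added by this patch, the __u32/__u64 member types used by struct sg_io_v4 would be undeclared and the build would fail:

    /* Hypothetical compile test, not part of the patch. */
    #include <linux/bsg.h>

    int main(void)
    {
            struct sg_io_v4 hdr = { .guard = 'Q' };   /* 'Q' marks a v4 header */
            return hdr.guard == 'Q' ? 0 : 1;
    }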
Signed-off-by: Douglas Gilbert Signed-off-by: James Bottomley --- include/linux/bsg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bsg.h b/include/linux/bsg.h index cf0303a60611..6c0a00dfa90c 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -1,6 +1,8 @@ #ifndef BSG_H #define BSG_H +#include + #define BSG_PROTOCOL_SCSI 0 #define BSG_SUB_PROTOCOL_SCSI_CMD 0 -- cgit v1.2.3-71-gd317 From dec3f95959bff957f5bcbf16c2a2823f7e33d1e7 Mon Sep 17 00:00:00 2001 From: Eric Moore Date: Mon, 9 Mar 2009 01:27:49 -0600 Subject: [SCSI] mpt2sas: add MPT2SAS_MINOR(221) to miscdevice.h Signed-off-by: Eric Moore Signed-off-by: James Bottomley --- include/linux/miscdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index a820f816a49e..beb6ec99cfef 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -26,6 +26,7 @@ #define TUN_MINOR 200 #define MWAVE_MINOR 219 /* ACP/Mwave Modem */ #define MPT_MINOR 220 +#define MPT2SAS_MINOR 221 #define HPET_MINOR 228 #define FUSE_MINOR 229 #define KVM_MINOR 232 -- cgit v1.2.3-71-gd317 From a390d1f379cf821248b735f43d2e1147ebb8241d Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Fri, 13 Mar 2009 15:41:19 -0700 Subject: phylib: convert state_queue work to delayed_work It closes a race in phy_stop_machine when reprogramming of phy_timer (from phy_state_machine) happens between del_timer_sync and cancel_work_sync. Without this change it could lead to crash if phy_device would be freed after phy_stop_machine (timer would fire and schedule freed work). Signed-off-by: Marcin Slusarz Acked-by: Jean Delvare Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 41 +++++++++++------------------------------ include/linux/phy.h | 3 +-- 2 files changed, 12 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e4ede6080c9d..58b73b08dde0 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -414,7 +414,6 @@ EXPORT_SYMBOL(phy_start_aneg); static void phy_change(struct work_struct *work); static void phy_state_machine(struct work_struct *work); -static void phy_timer(unsigned long data); /** * phy_start_machine - start PHY state machine tracking @@ -434,11 +433,8 @@ void phy_start_machine(struct phy_device *phydev, { phydev->adjust_state = handler; - INIT_WORK(&phydev->state_queue, phy_state_machine); - init_timer(&phydev->phy_timer); - phydev->phy_timer.function = &phy_timer; - phydev->phy_timer.data = (unsigned long) phydev; - mod_timer(&phydev->phy_timer, jiffies + HZ); + INIT_DELAYED_WORK(&phydev->state_queue, phy_state_machine); + schedule_delayed_work(&phydev->state_queue, jiffies + HZ); } /** @@ -451,8 +447,7 @@ void phy_start_machine(struct phy_device *phydev, */ void phy_stop_machine(struct phy_device *phydev) { - del_timer_sync(&phydev->phy_timer); - cancel_work_sync(&phydev->state_queue); + cancel_delayed_work_sync(&phydev->state_queue); mutex_lock(&phydev->lock); if (phydev->state > PHY_UP) @@ -680,11 +675,9 @@ static void phy_change(struct work_struct *work) if (err) goto irq_enable_err; - /* Stop timer and run the state queue now. The work function for - * state_queue will start the timer up again. 
- */ - del_timer(&phydev->phy_timer); - schedule_work(&phydev->state_queue); + /* reschedule state queue work to run as soon as possible */ + cancel_delayed_work_sync(&phydev->state_queue); + schedule_delayed_work(&phydev->state_queue, 0); return; @@ -761,14 +754,13 @@ EXPORT_SYMBOL(phy_start); /** * phy_state_machine - Handle the state machine * @work: work_struct that describes the work to be done - * - * Description: Scheduled by the state_queue workqueue each time - * phy_timer is triggered. */ static void phy_state_machine(struct work_struct *work) { + struct delayed_work *dwork = + container_of(work, struct delayed_work, work); struct phy_device *phydev = - container_of(work, struct phy_device, state_queue); + container_of(dwork, struct phy_device, state_queue); int needs_aneg = 0; int err = 0; @@ -946,17 +938,6 @@ static void phy_state_machine(struct work_struct *work) if (err < 0) phy_error(phydev); - mod_timer(&phydev->phy_timer, jiffies + PHY_STATE_TIME * HZ); -} - -/* PHY timer which schedules the state machine work */ -static void phy_timer(unsigned long data) -{ - struct phy_device *phydev = (struct phy_device *)data; - - /* - * PHY I/O operations can potentially sleep so we ensure that - * it's done from a process context - */ - schedule_work(&phydev->state_queue); + schedule_delayed_work(&phydev->state_queue, + jiffies + PHY_STATE_TIME * HZ); } diff --git a/include/linux/phy.h b/include/linux/phy.h index d7e54d98869f..32cf14a4b034 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -315,8 +315,7 @@ struct phy_device { /* Interrupt and Polling infrastructure */ struct work_struct phy_queue; - struct work_struct state_queue; - struct timer_list phy_timer; + struct delayed_work state_queue; atomic_t irq_disable; struct mutex lock; -- cgit v1.2.3-71-gd317 From 9c705260feea6ae329bc6b6d5f6d2ef0227eda0a Mon Sep 17 00:00:00 2001 From: Gabriele Paoloni Date: Fri, 13 Mar 2009 16:09:12 -0700 Subject: ppp: ppp_mp_explode() redesign I found the PPP subsystem to not work properly when connecting channels with different speeds to the same bundle. Problem Description: As the "ppp_mp_explode" function fragments the sk_buff buffer evenly among the PPP channels that are connected to a certain PPP unit to make up a bundle, if we are transmitting using an upper layer protocol that requires an Ack before sending the next packet (like TCP/IP for example), we will have a bandwidth bottleneck on the slowest channel of the bundle. Let's clarify by an example. Let's consider a scenario where we have two PPP links making up a bundle: a slow link (10KB/sec) and a fast link (1000KB/sec) working at the best (full bandwidth). On the top we have a TCP/IP stack sending a 1000 Bytes sk_buff buffer down to the PPP subsystem. The "ppp_mp_explode" function will divide the buffer in two fragments of 500B each (we are neglecting all the headers, crc, flags etc?.). Before the TCP/IP stack sends out the next buffer, it will have to wait for the ACK response from the remote peer, so it will have to wait for both fragments to have been sent over the two PPP links, received by the remote peer and reconstructed. The resulting behaviour is that, rather than having a bundle working @1010KB/sec (the sum of the channels bandwidths), we'll have a bundle working @20KB/sec (the double of the slowest channels bandwidth). 
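To make the arithmetic behind that figure explicit (illustrative numbers only): with an even 500B + 500B split, the sender must wait for the slow link, so each round takes 500B / 10KB/sec = 50 ms and the bundle moves 1000B per 50 ms, i.e. 20KB/sec. With a speed-proportional split (roughly 10B on the 10KB/sec link and 990B on the 1000KB/sec link) both fragments complete in about 1 ms, so the bundle approaches the 1010KB/sec aggregate.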
Problem Solution: The problem has been solved by redesigning the "ppp_mp_explode" function in such a way to make it split the sk_buff buffer according to the speeds of the underlying PPP channels (the speeds of the serial interfaces respectively attached to the PPP channels). Referring to the above example, the redesigned "ppp_mp_explode" function will now divide the 1000 Bytes buffer into two fragments whose sizes are set according to the speeds of the channels where they are going to be sent on (e.g . 10 Byets on 10KB/sec channel and 990 Bytes on 1000KB/sec channel). The reworked function grants the same performances of the original one in optimal working conditions (i.e. a bundle made up of PPP links all working at the same speed), while greatly improving performances on the bundles made up of channels working at different speeds. Signed-off-by: Gabriele Paoloni Signed-off-by: David S. Miller --- drivers/net/ppp_async.c | 3 + drivers/net/ppp_generic.c | 211 +++++++++++++++++++++++++------------------- drivers/net/ppp_synctty.c | 3 + include/linux/ppp_channel.h | 2 +- 4 files changed, 127 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c index 5de6fedd1d76..6de8399d6dd9 100644 --- a/drivers/net/ppp_async.c +++ b/drivers/net/ppp_async.c @@ -157,6 +157,7 @@ ppp_asynctty_open(struct tty_struct *tty) { struct asyncppp *ap; int err; + int speed; if (tty->ops->write == NULL) return -EOPNOTSUPP; @@ -187,6 +188,8 @@ ppp_asynctty_open(struct tty_struct *tty) ap->chan.private = ap; ap->chan.ops = &async_ops; ap->chan.mtu = PPP_MRU; + speed = tty_get_baud_rate(tty); + ap->chan.speed = speed; err = ppp_register_channel(&ap->chan); if (err) goto out_free; diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 42d455578453..8ee91421db12 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -167,6 +167,7 @@ struct channel { u8 avail; /* flag used in multilink stuff */ u8 had_frag; /* >= 1 fragments have been sent */ u32 lastseq; /* MP: last sequence # received */ + int speed; /* speed of the corresponding ppp channel*/ #endif /* CONFIG_PPP_MULTILINK */ }; @@ -1307,138 +1308,181 @@ ppp_push(struct ppp *ppp) */ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) { - int len, fragsize; - int i, bits, hdrlen, mtu; - int flen; - int navail, nfree; - int nbigger; + int len, totlen; + int i, bits, hdrlen, mtu; + int flen; + int navail, nfree, nzero; + int nbigger; + int totspeed; + int totfree; unsigned char *p, *q; struct list_head *list; struct channel *pch; struct sk_buff *frag; struct ppp_channel *chan; - nfree = 0; /* # channels which have no packet already queued */ + totspeed = 0; /*total bitrate of the bundle*/ + nfree = 0; /* # channels which have no packet already queued */ navail = 0; /* total # of usable channels (not deregistered) */ + nzero = 0; /* number of channels with zero speed associated*/ + totfree = 0; /*total # of channels available and + *having no queued packets before + *starting the fragmentation*/ + hdrlen = (ppp->flags & SC_MP_XSHORTSEQ)? 
MPHDRLEN_SSN: MPHDRLEN; - i = 0; - list_for_each_entry(pch, &ppp->channels, clist) { + i = 0; + list_for_each_entry(pch, &ppp->channels, clist) { navail += pch->avail = (pch->chan != NULL); - if (pch->avail) { + pch->speed = pch->chan->speed; + if (pch->avail) { if (skb_queue_empty(&pch->file.xq) || - !pch->had_frag) { - pch->avail = 2; - ++nfree; - } - if (!pch->had_frag && i < ppp->nxchan) - ppp->nxchan = i; + !pch->had_frag) { + if (pch->speed == 0) + nzero++; + else + totspeed += pch->speed; + + pch->avail = 2; + ++nfree; + ++totfree; + } + if (!pch->had_frag && i < ppp->nxchan) + ppp->nxchan = i; } ++i; } - /* - * Don't start sending this packet unless at least half of - * the channels are free. This gives much better TCP - * performance if we have a lot of channels. + * Don't start sending this packet unless at least half of + * the channels are free. This gives much better TCP + * performance if we have a lot of channels. */ - if (nfree == 0 || nfree < navail / 2) - return 0; /* can't take now, leave it in xmit_pending */ + if (nfree == 0 || nfree < navail / 2) + return 0; /* can't take now, leave it in xmit_pending */ /* Do protocol field compression (XXX this should be optional) */ - p = skb->data; - len = skb->len; + p = skb->data; + len = skb->len; if (*p == 0) { ++p; --len; } - /* - * Decide on fragment size. - * We create a fragment for each free channel regardless of - * how small they are (i.e. even 0 length) in order to minimize - * the time that it will take to detect when a channel drops - * a fragment. - */ - fragsize = len; - if (nfree > 1) - fragsize = DIV_ROUND_UP(fragsize, nfree); - /* nbigger channels get fragsize bytes, the rest get fragsize-1, - except if nbigger==0, then they all get fragsize. */ - nbigger = len % nfree; - - /* skip to the channel after the one we last used - and start at that one */ + totlen = len; + nbigger = len % nfree; + + /* skip to the channel after the one we last used + and start at that one */ list = &ppp->channels; - for (i = 0; i < ppp->nxchan; ++i) { + for (i = 0; i < ppp->nxchan; ++i) { list = list->next; - if (list == &ppp->channels) { - i = 0; + if (list == &ppp->channels) { + i = 0; break; } } - /* create a fragment for each channel */ + /* create a fragment for each channel */ bits = B; - while (nfree > 0 || len > 0) { + while (nfree > 0 && len > 0) { list = list->next; - if (list == &ppp->channels) { - i = 0; + if (list == &ppp->channels) { + i = 0; continue; } - pch = list_entry(list, struct channel, clist); + pch = list_entry(list, struct channel, clist); ++i; if (!pch->avail) continue; /* - * Skip this channel if it has a fragment pending already and - * we haven't given a fragment to all of the free channels. + * Skip this channel if it has a fragment pending already and + * we haven't given a fragment to all of the free channels. */ if (pch->avail == 1) { - if (nfree > 0) + if (nfree > 0) continue; } else { - --nfree; pch->avail = 1; } /* check the channel's mtu and whether it is still attached. */ spin_lock_bh(&pch->downl); if (pch->chan == NULL) { - /* can't use this channel, it's being deregistered */ + /* can't use this channel, it's being deregistered */ + if (pch->speed == 0) + nzero--; + else + totspeed -= pch->speed; + spin_unlock_bh(&pch->downl); pch->avail = 0; - if (--navail == 0) + totlen = len; + totfree--; + nfree--; + if (--navail == 0) break; continue; } /* - * Create a fragment for this channel of - * min(max(mtu+2-hdrlen, 4), fragsize, len) bytes. 
- * If mtu+2-hdrlen < 4, that is a ridiculously small - * MTU, so we use mtu = 2 + hdrlen. + *if the channel speed is not set divide + *the packet evenly among the free channels; + *otherwise divide it according to the speed + *of the channel we are going to transmit on + */ + if (pch->speed == 0) { + flen = totlen/nfree ; + if (nbigger > 0) { + flen++; + nbigger--; + } + } else { + flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) / + ((totspeed*totfree)/pch->speed)) - hdrlen; + if (nbigger > 0) { + flen += ((totfree - nzero)*pch->speed)/totspeed; + nbigger -= ((totfree - nzero)*pch->speed)/ + totspeed; + } + } + nfree--; + + /* + *check if we are on the last channel or + *we exceded the lenght of the data to + *fragment + */ + if ((nfree == 0) || (flen > len)) + flen = len; + /* + *it is not worth to tx on slow channels: + *in that case from the resulting flen according to the + *above formula will be equal or less than zero. + *Skip the channel in this case */ - if (fragsize > len) - fragsize = len; - flen = fragsize; - mtu = pch->chan->mtu + 2 - hdrlen; - if (mtu < 4) - mtu = 4; + if (flen <= 0) { + pch->avail = 2; + spin_unlock_bh(&pch->downl); + continue; + } + + mtu = pch->chan->mtu + 2 - hdrlen; + if (mtu < 4) + mtu = 4; if (flen > mtu) flen = mtu; - if (flen == len && nfree == 0) - bits |= E; - frag = alloc_skb(flen + hdrlen + (flen == 0), GFP_ATOMIC); + if (flen == len) + bits |= E; + frag = alloc_skb(flen + hdrlen + (flen == 0), GFP_ATOMIC); if (!frag) goto noskb; - q = skb_put(frag, flen + hdrlen); + q = skb_put(frag, flen + hdrlen); - /* make the MP header */ + /* make the MP header */ q[0] = PPP_MP >> 8; q[1] = PPP_MP; if (ppp->flags & SC_MP_XSHORTSEQ) { - q[2] = bits + ((ppp->nxseq >> 8) & 0xf); + q[2] = bits + ((ppp->nxseq >> 8) & 0xf); q[3] = ppp->nxseq; } else { q[2] = bits; @@ -1447,43 +1491,28 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) q[5] = ppp->nxseq; } - /* - * Copy the data in. - * Unfortunately there is a bug in older versions of - * the Linux PPP multilink reconstruction code where it - * drops 0-length fragments. Therefore we make sure the - * fragment has at least one byte of data. Any bytes - * we add in this situation will end up as padding on the - * end of the reconstructed packet. 
- */ - if (flen == 0) - *skb_put(frag, 1) = 0; - else - memcpy(q + hdrlen, p, flen); + memcpy(q + hdrlen, p, flen); /* try to send it down the channel */ chan = pch->chan; - if (!skb_queue_empty(&pch->file.xq) || - !chan->ops->start_xmit(chan, frag)) + if (!skb_queue_empty(&pch->file.xq) || + !chan->ops->start_xmit(chan, frag)) skb_queue_tail(&pch->file.xq, frag); - pch->had_frag = 1; + pch->had_frag = 1; p += flen; - len -= flen; + len -= flen; ++ppp->nxseq; bits = 0; spin_unlock_bh(&pch->downl); - - if (--nbigger == 0 && fragsize > 0) - --fragsize; } - ppp->nxchan = i; + ppp->nxchan = i; return 1; noskb: spin_unlock_bh(&pch->downl); if (ppp->debug & 1) - printk(KERN_ERR "PPP: no memory (fragment)\n"); + printk(KERN_ERR "PPP: no memory (fragment)\n"); ++ppp->dev->stats.tx_errors; ++ppp->nxseq; return 1; /* abandon the frame */ diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c index 3ea791d16b00..d2fa2db13586 100644 --- a/drivers/net/ppp_synctty.c +++ b/drivers/net/ppp_synctty.c @@ -206,6 +206,7 @@ ppp_sync_open(struct tty_struct *tty) { struct syncppp *ap; int err; + int speed; if (tty->ops->write == NULL) return -EOPNOTSUPP; @@ -234,6 +235,8 @@ ppp_sync_open(struct tty_struct *tty) ap->chan.ops = &sync_ops; ap->chan.mtu = PPP_MRU; ap->chan.hdrlen = 2; /* for A/C bytes */ + speed = tty_get_baud_rate(tty); + ap->chan.speed = speed; err = ppp_register_channel(&ap->chan); if (err) goto out_free; diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index 9d64bdf14770..0d3fa63e90ea 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -40,8 +40,8 @@ struct ppp_channel { int mtu; /* max transmit packet size */ int hdrlen; /* amount of headroom channel needs */ void *ppp; /* opaque to channel */ - /* the following are not used at present */ int speed; /* transfer rate (bytes/second) */ + /* the following is not used at present */ int latency; /* overhead time in milliseconds */ }; -- cgit v1.2.3-71-gd317 From 895791dac6946d535991edd11341046f8e85ea77 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Fri, 13 Mar 2009 16:35:44 -0700 Subject: VM, x86, PAT: add a new vm flag to track full pfnmap at mmap Impact: cleanup Add a new vm flag VM_PFN_AT_MMAP to identify a PFNMAP that is fully mapped with remap_pfn_range. Patch removes the overloading of VM_INSERTPAGE from the earlier patch. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Acked-by: Nick Piggin LKML-Reference: <20090313233543.GA19909@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- include/linux/mm.h | 16 +++------------- mm/memory.c | 4 ++-- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3daa05feed9f..b1ea37fc7a24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,12 +98,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ -#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it. 
Refer note in VM_PFNMAP_AT_MMAP below */ +#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ +#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -126,17 +127,6 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) -/* - * pfnmap vmas that are fully mapped at mmap time (not mapped on fault). - * Used by x86 PAT to identify such PFNMAP mappings and optimize their handling. - * Note VM_INSERTPAGE flag is overloaded here. i.e, - * VM_INSERTPAGE && !VM_PFNMAP implies - * The vma has had "vm_insert_page()" done on it - * VM_INSERTPAGE && VM_PFNMAP implies - * The vma is PFNMAP with full mapping at mmap time - */ -#define VM_PFNMAP_AT_MMAP (VM_INSERTPAGE | VM_PFNMAP) - /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. @@ -156,7 +146,7 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return ((vma->vm_flags & VM_PFNMAP_AT_MMAP) == VM_PFNMAP_AT_MMAP); + return (vma->vm_flags & VM_PFN_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) diff --git a/mm/memory.c b/mm/memory.c index d7df5babcba9..2032ad2fc34b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1667,7 +1667,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, */ if (addr == vma->vm_start && end == vma->vm_end) { vma->vm_pgoff = pfn; - vma->vm_flags |= VM_PFNMAP_AT_MMAP; + vma->vm_flags |= VM_PFN_AT_MMAP; } else if (is_cow_mapping(vma->vm_flags)) return -EINVAL; @@ -1680,7 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * needed from higher level routine calling unmap_vmas */ vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - vma->vm_flags &= ~VM_PFNMAP_AT_MMAP; + vma->vm_flags &= ~VM_PFN_AT_MMAP; return -EINVAL; } -- cgit v1.2.3-71-gd317 From 87092698c665e0a358caf9825ae13114343027e8 Mon Sep 17 00:00:00 2001 From: un'ichi Nomura Date: Mon, 9 Mar 2009 10:40:52 +0100 Subject: block: Add gfp_mask parameter to bio_integrity_clone() Stricter gfp_mask might be required for clone allocation. For example, request-based dm may clone bio in interrupt context so it has to use GFP_ATOMIC. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Acked-by: Martin K. 
Petersen Cc: Alasdair G Kergon Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 5 +++-- fs/bio.c | 2 +- include/linux/bio.h | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 549b0144da11..fe2b1aa2464e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -685,19 +685,20 @@ EXPORT_SYMBOL(bio_integrity_split); * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio + * @gfp_mask: Memory allocation mask * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - struct bio_set *bs) + gfp_t gfp_mask, struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); if (bip == NULL) return -EIO; diff --git a/fs/bio.c b/fs/bio.c index 124b95c4d582..cf747378b977 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -463,7 +463,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); if (ret < 0) return NULL; diff --git a/include/linux/bio.h b/include/linux/bio.h index 1b16108a5417..d8bd43bfdcf5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -531,7 +531,7 @@ extern void bio_integrity_endio(struct bio *, int); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); extern void bio_integrity_split(struct bio *, struct bio_pair *, int); -extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); +extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t, struct bio_set *); extern int bioset_integrity_create(struct bio_set *, int); extern void bioset_integrity_free(struct bio_set *); extern void bio_integrity_init_slab(void); @@ -542,7 +542,7 @@ extern void bio_integrity_init_slab(void); #define bioset_integrity_create(a, b) (0) #define bio_integrity_prep(a) (0) #define bio_integrity_enabled(a) (0) -#define bio_integrity_clone(a, b, c) (0) +#define bio_integrity_clone(a, b, c,d ) (0) #define bioset_integrity_free(a) do { } while (0) #define bio_integrity_free(a, b) do { } while (0) #define bio_integrity_endio(a, b) do { } while (0) -- cgit v1.2.3-71-gd317 From 8bdd663aba341c15cd2fa9dbd7061b8b387964dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Mar 2009 19:59:13 -0700 Subject: net: reorder fields of struct socket On x86_64, its rather unfortunate that "wait_queue_head_t wait" field of "struct socket" spans two cache lines (assuming a 64 bytes cache line in current cpus) offsetof(struct socket, wait)=0x30 sizeof(wait_queue_head_t)=0x18 This might explain why Kenny Chang noticed that his multicast workload was performing bad with 64 bit kernels, since more cache lines ping pongs were involved. This litle patch moves "wait" field next "fasync_list" so that both fields share a single cache line, to speedup sock_def_readable() Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/net.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 4515efae4c39..4fc2ffd527f9 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -129,11 +129,15 @@ struct socket { socket_state state; short type; unsigned long flags; - const struct proto_ops *ops; + /* + * Please keep fasync_list & wait fields in the same cache line + */ struct fasync_struct *fasync_list; + wait_queue_head_t wait; + struct file *file; struct sock *sk; - wait_queue_head_t wait; + const struct proto_ops *ops; }; struct vm_area_struct; -- cgit v1.2.3-71-gd317 From 0c54b85f2828128274f319a1eb3ce7f604fe2a53 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 14 Mar 2009 14:23:05 +0000 Subject: tcp: simplify tcp_current_mss MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's very little need for most of the callsites to get tp->xmit_goal_size updated. That will cost us divide as is, so slice the function in two. Also, the only users of the tp->xmit_goal_size are directly behind tcp_current_mss(), so there's no need to store that variable into tcp_sock at all! The drop of xmit_goal_size currently leaves 16-bit hole and some reorganization would again be necessary to change that (but I'm aiming to fill that hole with u16 xmit_goal_size_segs to cache the results of the remaining divide to get that tso on regression). Bring xmit_goal_size parts into tcp.c Signed-off-by: Ilpo Järvinen Cc: Evgeniy Polyakov Cc: Ingo Molnar Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 13 +++++++++++-- net/ipv4/tcp.c | 43 +++++++++++++++++++++++++++++++++++-------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 41 +++++++---------------------------------- 5 files changed, 54 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4b86ad71e054..ad2021ccc55a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -248,7 +248,6 @@ struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ - u16 xmit_size_goal; /* Goal for segmenting output packets */ /* * Header prediction flags diff --git a/include/net/tcp.h b/include/net/tcp.h index 255ca35bea05..e54c76d75495 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -481,7 +481,16 @@ static inline void tcp_clear_xmit_timers(struct sock *sk) } extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); -extern unsigned int tcp_current_mss(struct sock *sk, int large); +extern unsigned int tcp_current_mss(struct sock *sk); + +/* Bound MSS / TSO packet size with the half of the window */ +static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) +{ + if (tp->max_window && pktsize > (tp->max_window >> 1)) + return max(tp->max_window >> 1, 68U - tp->tcp_header_len); + else + return pktsize; +} /* tcp.c */ extern void tcp_get_info(struct sock *, struct tcp_info *); @@ -822,7 +831,7 @@ static inline void tcp_push_pending_frames(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - __tcp_push_pending_frames(sk, tcp_current_mss(sk, 1), tp->nonagle); + __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d3f9beee74c0..886596ff0aae 100644 --- a/net/ipv4/tcp.c +++ 
b/net/ipv4/tcp.c @@ -661,6 +661,37 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) return NULL; } +static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 xmit_size_goal; + + xmit_size_goal = mss_now; + + if (large_allowed && sk_can_gso(sk)) { + xmit_size_goal = ((sk->sk_gso_max_size - 1) - + inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); + + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); + xmit_size_goal -= (xmit_size_goal % mss_now); + } + + return xmit_size_goal; +} + +static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +{ + int mss_now; + + mss_now = tcp_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + + return mss_now; +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) { @@ -677,8 +708,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; err = -EPIPE; @@ -761,8 +791,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } out: @@ -844,8 +873,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -1007,8 +1035,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 311c30f73ee4..fae78e3eccc4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2864,7 +2864,7 @@ void tcp_simple_retransmit(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss = tcp_current_mss(sk); u32 prior_lost = tp->lost_out; tcp_for_write_queue(skb, sk) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 325658039139..c1f259d2d33b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -921,7 +921,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) * factor and mss. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1)); + tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); return 0; } @@ -982,15 +982,6 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.probe_size = 0; } -/* Bound MSS / TSO packet size with the half of the window */ -static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) -{ - if (tp->max_window && pktsize > (tp->max_window >> 1)) - return max(tp->max_window >> 1, 68U - tp->tcp_header_len); - else - return pktsize; -} - /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. 
It does NOT counts @@ -1037,22 +1028,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. */ -unsigned int tcp_current_mss(struct sock *sk, int large_allowed) +unsigned int tcp_current_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; - u16 xmit_size_goal; - int doing_tso = 0; unsigned header_len; struct tcp_out_options opts; struct tcp_md5sig_key *md5; mss_now = tp->mss_cache; - if (large_allowed && sk_can_gso(sk)) - doing_tso = 1; - if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) @@ -1070,19 +1056,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now -= delta; } - xmit_size_goal = mss_now; - - if (doing_tso) { - xmit_size_goal = ((sk->sk_gso_max_size - 1) - - inet_csk(sk)->icsk_af_ops->net_header_len - - inet_csk(sk)->icsk_ext_hdr_len - - tp->tcp_header_len); - - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - xmit_size_goal -= (xmit_size_goal % mss_now); - } - tp->xmit_size_goal = xmit_size_goal; - return mss_now; } @@ -1264,7 +1237,7 @@ int tcp_may_send_now(struct sock *sk) struct sk_buff *skb = tcp_send_head(sk); return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + tcp_snd_test(sk, skb, tcp_current_mss(sk), (tcp_skb_is_last(sk, skb) ? tp->nonagle : TCP_NAGLE_PUSH))); } @@ -1421,7 +1394,7 @@ static int tcp_mtu_probe(struct sock *sk) return -1; /* Very simple search strategy: just double the MSS. */ - mss_now = tcp_current_mss(sk, 0); + mss_now = tcp_current_mss(sk); probe_size = 2 * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { @@ -1903,7 +1876,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ - cur_mss = tcp_current_mss(sk, 0); + cur_mss = tcp_current_mss(sk); /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the @@ -2111,7 +2084,7 @@ void tcp_send_fin(struct sock *sk) * unsent frames. But be careful about outgoing SACKS * and IP options. */ - mss_now = tcp_current_mss(sk, 1); + mss_now = tcp_current_mss(sk); if (tcp_send_head(sk) != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; @@ -2523,7 +2496,7 @@ int tcp_write_wakeup(struct sock *sk) if ((skb = tcp_send_head(sk)) != NULL && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; - unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) -- cgit v1.2.3-71-gd317 From 2a3a041c4e2c1685e668b280c121a5a40a029a03 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 14 Mar 2009 22:45:16 +0000 Subject: tcp: cache result of earlier divides when mss-aligning things MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The results is very unlikely change every so often so we hardly need to divide again after doing that once for a connection. Yet, if divide still becomes necessary we detect that and do the right thing and again settle for non-divide state. Takes the u16 space which was previously taken by the plain xmit_size_goal. This should take care part of the tso vs non-tso difference we found earlier. 
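The idea, as a minimal sketch (the helper name is illustrative; the real logic lives in tcp_xmit_size_goal() in the hunk below):

        /* Cache the quotient; divide again only when the cached
         * product no longer lies within one mss of the new goal.
         */
        static unsigned int align_size_goal(struct tcp_sock *tp,
                                            unsigned int goal, unsigned int mss)
        {
                unsigned int old = tp->xmit_size_goal_segs * mss;

                if (old <= goal && old + mss > goal)
                        return old;                       /* common case: no divide */

                tp->xmit_size_goal_segs = goal / mss;     /* rare case: one divide, then cached */
                return tp->xmit_size_goal_segs * mss;
        }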
Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ad2021ccc55a..9d5078bd23a3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -248,6 +248,7 @@ struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ + u16 xmit_size_goal_segs; /* Goal for segmenting output packets */ /* * Header prediction flags diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 886596ff0aae..0db9f3b984f7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -665,7 +665,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 xmit_size_goal; + u32 xmit_size_goal, old_size_goal; xmit_size_goal = mss_now; @@ -676,7 +676,17 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, tp->tcp_header_len); xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - xmit_size_goal -= (xmit_size_goal % mss_now); + + /* We try hard to avoid divides here */ + old_size_goal = tp->xmit_size_goal_segs * mss_now; + + if (likely(old_size_goal <= xmit_size_goal && + old_size_goal + mss_now > xmit_size_goal)) { + xmit_size_goal = old_size_goal; + } else { + tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + xmit_size_goal = tp->xmit_size_goal_segs * mss_now; + } } return xmit_size_goal; -- cgit v1.2.3-71-gd317 From 684999149002dd046269666a390458e0acb38280 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 6 Feb 2009 13:52:43 -0700 Subject: Rename struct file->f_ep_lock This lock moves out of the CONFIG_EPOLL ifdef and becomes f_lock. For now, epoll remains the only user, but a future patch will use it to protect f_flags as well. Cc: Davide Libenzi Reviewed-by: Christoph Hellwig Signed-off-by: Jonathan Corbet --- fs/eventpoll.c | 12 +++++++----- fs/file_table.c | 1 + include/linux/eventpoll.h | 1 - include/linux/fs.h | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 011b9b8c90c6..c5c424f23fd5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -417,10 +417,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) ep_unregister_pollwait(ep, epi); /* Remove the current item from the list of epoll hooks */ - spin_lock(&file->f_ep_lock); + spin_lock(&file->f_lock); if (ep_is_linked(&epi->fllink)) list_del_init(&epi->fllink); - spin_unlock(&file->f_ep_lock); + spin_unlock(&file->f_lock); rb_erase(&epi->rbn, &ep->rbr); @@ -538,7 +538,7 @@ void eventpoll_release_file(struct file *file) struct epitem *epi; /* - * We don't want to get "file->f_ep_lock" because it is not + * We don't want to get "file->f_lock" because it is not * necessary. It is not necessary because we're in the "struct file" * cleanup path, and this means that noone is using this file anymore. * So, for example, epoll_ctl() cannot hit here sicne if we reach this @@ -547,6 +547,8 @@ void eventpoll_release_file(struct file *file) * will correctly serialize the operation. We do need to acquire * "ep->mtx" after "epmutex" because ep_remove() requires it when called * from anywhere but ep_free(). + * + * Besides, ep_remove() acquires the lock, so we can't hold it here. 
*/ mutex_lock(&epmutex); @@ -785,9 +787,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, goto error_unregister; /* Add the current item to the list of active epoll hook for this file */ - spin_lock(&tfile->f_ep_lock); + spin_lock(&tfile->f_lock); list_add_tail(&epi->fllink, &tfile->f_ep_links); - spin_unlock(&tfile->f_ep_lock); + spin_unlock(&tfile->f_lock); /* * Add the current item to the RB tree. All RB tree operations are diff --git a/fs/file_table.c b/fs/file_table.c index bbeeac6efa1a..aa1e18050282 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -127,6 +127,7 @@ struct file *get_empty_filp(void) atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); f->f_cred = get_cred(cred); + spin_lock_init(&f->f_lock); eventpoll_init_file(f); /* f->f_version: 0 */ return f; diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f1e1d3c47125..f6856a5a1d4b 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -61,7 +61,6 @@ struct file; static inline void eventpoll_init_file(struct file *file) { INIT_LIST_HEAD(&file->f_ep_links); - spin_lock_init(&file->f_ep_lock); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 92734c0012e6..2011600d12c7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -848,6 +848,7 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; + spinlock_t f_lock; /* f_ep_links */ atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; @@ -866,7 +867,6 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; - spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; #ifdef CONFIG_DEBUG_WRITECOUNT -- cgit v1.2.3-71-gd317 From db1dd4d376134eba0e08af523b61cc566a4ea1cd Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 6 Feb 2009 15:25:24 -0700 Subject: Use f_lock to protect f_flags Traditionally, changes to struct file->f_flags have been done under BKL protection, or with no protection at all. This patch causes all f_flags changes after file open/creation time to be done under protection of f_lock. This allows the removal of some BKL usage and fixes a number of longstanding (if microscopic) races. 
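The recurring pattern, shown once as a minimal sketch (each hunk below applies it to a specific call site):

        /* read-modify-write of f_flags after open time is now done
         * under f_lock rather than the BKL
         */
        spin_lock(&filp->f_lock);
        if (on)
                filp->f_flags |= O_NONBLOCK;
        else
                filp->f_flags &= ~O_NONBLOCK;
        spin_unlock(&filp->f_lock);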
Reviewed-by: Christoph Hellwig Cc: Al Viro Signed-off-by: Jonathan Corbet --- drivers/char/tty_io.c | 5 ++--- drivers/usb/gadget/file_storage.c | 7 ++++++- fs/fcntl.c | 2 ++ fs/ioctl.c | 7 ++++--- fs/nfsd/vfs.c | 5 ++++- include/linux/fs.h | 2 +- ipc/mqueue.c | 2 ++ sound/core/oss/pcm_oss.c | 2 ++ sound/oss/au1550_ac97.c | 2 ++ sound/oss/audio.c | 2 ++ sound/oss/sh_dac_audio.c | 2 ++ sound/oss/swarm_cs4297a.c | 2 ++ sound/oss/vwsnd.c | 2 ++ 13 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index bc84e125c6bc..224f271d8cbe 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2162,13 +2162,12 @@ static int fionbio(struct file *file, int __user *p) if (get_user(nonblock, p)) return -EFAULT; - /* file->f_flags is still BKL protected in the fs layer - vomit */ - lock_kernel(); + spin_lock(&file->f_lock); if (nonblock) file->f_flags |= O_NONBLOCK; else file->f_flags &= ~O_NONBLOCK; - unlock_kernel(); + spin_unlock(&file->f_lock); return 0; } diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c index 1ab9dac7e12d..33bb76cef33c 100644 --- a/drivers/usb/gadget/file_storage.c +++ b/drivers/usb/gadget/file_storage.c @@ -1711,7 +1711,9 @@ static int do_write(struct fsg_dev *fsg) curlun->sense_data = SS_WRITE_PROTECTED; return -EINVAL; } + spin_lock(&curlun->filp->f_lock); curlun->filp->f_flags &= ~O_SYNC; // Default is not to wait + spin_unlock(&curlun->filp->f_lock); /* Get the starting Logical Block Address and check that it's * not too big */ @@ -1728,8 +1730,11 @@ static int do_write(struct fsg_dev *fsg) curlun->sense_data = SS_INVALID_FIELD_IN_CDB; return -EINVAL; } - if (fsg->cmnd[1] & 0x08) // FUA + if (fsg->cmnd[1] & 0x08) { // FUA + spin_lock(&curlun->filp->f_lock); curlun->filp->f_flags |= O_SYNC; + spin_unlock(&curlun->filp->f_lock); + } } if (lba >= curlun->num_sectors) { curlun->sense_data = SS_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE; diff --git a/fs/fcntl.c b/fs/fcntl.c index bd215cc791da..04df8570a2d2 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -189,7 +189,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg) } } + spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); + spin_unlock(&filp->f_lock); out: unlock_kernel(); return error; diff --git a/fs/ioctl.c b/fs/ioctl.c index 240ec63984cb..421aab465dab 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -404,10 +404,12 @@ static int ioctl_fionbio(struct file *filp, int __user *argp) if (O_NONBLOCK != O_NDELAY) flag |= O_NDELAY; #endif + spin_lock(&filp->f_lock); if (on) filp->f_flags |= flag; else filp->f_flags &= ~flag; + spin_unlock(&filp->f_lock); return error; } @@ -432,10 +434,12 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp, if (error) return error; + spin_lock(&filp->f_lock); if (on) filp->f_flags |= FASYNC; else filp->f_flags &= ~FASYNC; + spin_unlock(&filp->f_lock); return error; } @@ -499,10 +503,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, break; case FIONBIO: - /* BKL needed to avoid races tweaking f_flags */ - lock_kernel(); error = ioctl_fionbio(filp, argp); - unlock_kernel(); break; case FIOASYNC: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6e50aaa56ca2..c165a6403df0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -998,8 +998,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (!EX_ISSYNC(exp)) stable = 0; - if (stable && !EX_WGATHER(exp)) + if (stable && 
!EX_WGATHER(exp)) { + spin_lock(&file->f_lock); file->f_flags |= O_SYNC; + spin_unlock(&file->f_lock); + } /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2011600d12c7..7428c6d35e65 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -848,7 +848,7 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; - spinlock_t f_lock; /* f_ep_links */ + spinlock_t f_lock; /* f_ep_links, f_flags */ atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 54b4077fed79..a8ddadbc7459 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1156,10 +1156,12 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes, omqstat.mq_flags = filp->f_flags & O_NONBLOCK; if (u_mqstat) { audit_mq_getsetattr(mqdes, &mqstat); + spin_lock(&filp->f_lock); if (mqstat.mq_flags & O_NONBLOCK) filp->f_flags |= O_NONBLOCK; else filp->f_flags &= ~O_NONBLOCK; + spin_unlock(&filp->f_lock); inode->i_atime = inode->i_ctime = CURRENT_TIME; } diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 0a1798eafb0b..d4460f18e76c 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1895,7 +1895,9 @@ static int snd_pcm_oss_set_fragment(struct snd_pcm_oss_file *pcm_oss_file, unsig static int snd_pcm_oss_nonblock(struct file * file) { + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; } diff --git a/sound/oss/au1550_ac97.c b/sound/oss/au1550_ac97.c index 81e1f443d094..4191acccbcdb 100644 --- a/sound/oss/au1550_ac97.c +++ b/sound/oss/au1550_ac97.c @@ -1627,7 +1627,9 @@ au1550_ioctl(struct inode *inode, struct file *file, unsigned int cmd, sizeof(abinfo)) ? -EFAULT : 0; case SNDCTL_DSP_NONBLOCK: + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_GETODELAY: diff --git a/sound/oss/audio.c b/sound/oss/audio.c index 89bd27a5e865..b69c05b7ea7b 100644 --- a/sound/oss/audio.c +++ b/sound/oss/audio.c @@ -433,7 +433,9 @@ int audio_ioctl(int dev, struct file *file, unsigned int cmd, void __user *arg) return dma_ioctl(dev, cmd, arg); case SNDCTL_DSP_NONBLOCK: + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_GETCAPS: diff --git a/sound/oss/sh_dac_audio.c b/sound/oss/sh_dac_audio.c index e5d423994918..78cfb66e4c59 100644 --- a/sound/oss/sh_dac_audio.c +++ b/sound/oss/sh_dac_audio.c @@ -135,7 +135,9 @@ static int dac_audio_ioctl(struct inode *inode, struct file *file, return put_user(AFMT_U8, (int *)arg); case SNDCTL_DSP_NONBLOCK: + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_GETCAPS: diff --git a/sound/oss/swarm_cs4297a.c b/sound/oss/swarm_cs4297a.c index 41562ecde5bb..1edab7b4ea83 100644 --- a/sound/oss/swarm_cs4297a.c +++ b/sound/oss/swarm_cs4297a.c @@ -2200,7 +2200,9 @@ static int cs4297a_ioctl(struct inode *inode, struct file *file, sizeof(abinfo)) ? 
-EFAULT : 0; case SNDCTL_DSP_NONBLOCK: + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_GETODELAY: diff --git a/sound/oss/vwsnd.c b/sound/oss/vwsnd.c index 78b8acc7c3b9..187f72750e8f 100644 --- a/sound/oss/vwsnd.c +++ b/sound/oss/vwsnd.c @@ -2673,7 +2673,9 @@ static int vwsnd_audio_do_ioctl(struct inode *inode, case SNDCTL_DSP_NONBLOCK: /* _SIO ('P',14) */ DBGX("SNDCTL_DSP_NONBLOCK\n"); + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_RESET: /* _SIO ('P', 0) */ -- cgit v1.2.3-71-gd317 From acc738fec03bdaa5b77340c32a82fbfedaaabef0 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 16 Mar 2009 15:35:29 +0100 Subject: netfilter: xtables: avoid pointer to self Commit 784544739a25c30637397ace5489eeb6e15d7d49 (netfilter: iptables: lock free counters) broke a number of modules whose rule data referenced itself. A reallocation would not reestablish the correct references, so it is best to use a separate struct that does not fall under RCU. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/xt_limit.h | 9 ++++---- include/linux/netfilter/xt_quota.h | 4 +++- include/linux/netfilter/xt_statistic.h | 7 +++--- net/netfilter/xt_limit.c | 40 ++++++++++++++++++++++++---------- net/netfilter/xt_quota.c | 31 ++++++++++++++++++++------ net/netfilter/xt_statistic.c | 28 +++++++++++++++++++----- 6 files changed, 88 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/xt_limit.h b/include/linux/netfilter/xt_limit.h index b3ce65375ecb..fda222c7953b 100644 --- a/include/linux/netfilter/xt_limit.h +++ b/include/linux/netfilter/xt_limit.h @@ -4,6 +4,8 @@ /* timings are in milliseconds. */ #define XT_LIMIT_SCALE 10000 +struct xt_limit_priv; + /* 1/10,000 sec period => max of 10,000/sec. Min rate is then 429490 seconds, or one every 59 hours. */ struct xt_rateinfo { @@ -11,11 +13,10 @@ struct xt_rateinfo { u_int32_t burst; /* Period multiplier for upper limit. */ /* Used internally by the kernel */ - unsigned long prev; - u_int32_t credit; + unsigned long prev; /* moved to xt_limit_priv */ + u_int32_t credit; /* moved to xt_limit_priv */ u_int32_t credit_cap, cost; - /* Ugly, ugly fucker. 
*/ - struct xt_rateinfo *master; + struct xt_limit_priv *master; }; #endif /*_XT_RATE_H*/ diff --git a/include/linux/netfilter/xt_quota.h b/include/linux/netfilter/xt_quota.h index 4c8368d781e5..8dc89dfc1361 100644 --- a/include/linux/netfilter/xt_quota.h +++ b/include/linux/netfilter/xt_quota.h @@ -6,13 +6,15 @@ enum xt_quota_flags { }; #define XT_QUOTA_MASK 0x1 +struct xt_quota_priv; + struct xt_quota_info { u_int32_t flags; u_int32_t pad; /* Used internally by the kernel */ aligned_u64 quota; - struct xt_quota_info *master; + struct xt_quota_priv *master; }; #endif /* _XT_QUOTA_H */ diff --git a/include/linux/netfilter/xt_statistic.h b/include/linux/netfilter/xt_statistic.h index 3d38bc975048..8f521ab49ef7 100644 --- a/include/linux/netfilter/xt_statistic.h +++ b/include/linux/netfilter/xt_statistic.h @@ -13,6 +13,8 @@ enum xt_statistic_flags { }; #define XT_STATISTIC_MASK 0x1 +struct xt_statistic_priv; + struct xt_statistic_info { u_int16_t mode; u_int16_t flags; @@ -23,11 +25,10 @@ struct xt_statistic_info { struct { u_int32_t every; u_int32_t packet; - /* Used internally by the kernel */ - u_int32_t count; + u_int32_t count; /* unused */ } nth; } u; - struct xt_statistic_info *master __attribute__((aligned(8))); + struct xt_statistic_priv *master __attribute__((aligned(8))); }; #endif /* _XT_STATISTIC_H */ diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index c908d69a5595..2e8089ecd0af 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -14,6 +14,11 @@ #include #include +struct xt_limit_priv { + unsigned long prev; + uint32_t credit; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Herve Eychenne "); MODULE_DESCRIPTION("Xtables: rate-limit match"); @@ -60,18 +65,18 @@ static DEFINE_SPINLOCK(limit_lock); static bool limit_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - struct xt_rateinfo *r = - ((const struct xt_rateinfo *)par->matchinfo)->master; + const struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv = r->master; unsigned long now = jiffies; spin_lock_bh(&limit_lock); - r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; - if (r->credit > r->credit_cap) - r->credit = r->credit_cap; + priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; + if (priv->credit > r->credit_cap) + priv->credit = r->credit_cap; - if (r->credit >= r->cost) { + if (priv->credit >= r->cost) { /* We're not limited. */ - r->credit -= r->cost; + priv->credit -= r->cost; spin_unlock_bh(&limit_lock); return true; } @@ -95,6 +100,7 @@ user2credits(u_int32_t user) static bool limit_mt_check(const struct xt_mtchk_param *par) { struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv; /* Check for overflow. */ if (r->burst == 0 @@ -104,19 +110,30 @@ static bool limit_mt_check(const struct xt_mtchk_param *par) return false; } - /* For SMP, we only want to use one set of counters. */ - r->master = r; + priv = kmalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + /* For SMP, we only want to use one set of state. */ + r->master = priv; if (r->cost == 0) { /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * 128. */ - r->prev = jiffies; - r->credit = user2credits(r->avg * r->burst); /* Credits full. */ + priv->prev = jiffies; + priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. 
*/ r->cost = user2credits(r->avg); } return true; } +static void limit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_rateinfo *info = par->matchinfo; + + kfree(info->master); +} + #ifdef CONFIG_COMPAT struct compat_xt_rateinfo { u_int32_t avg; @@ -167,6 +184,7 @@ static struct xt_match limit_mt_reg __read_mostly = { .family = NFPROTO_UNSPEC, .match = limit_mt, .checkentry = limit_mt_check, + .destroy = limit_mt_destroy, .matchsize = sizeof(struct xt_rateinfo), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_xt_rateinfo), diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index c84fce5e0f3e..01dd07b764ec 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -9,6 +9,10 @@ #include #include +struct xt_quota_priv { + uint64_t quota; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sam Johnston "); MODULE_DESCRIPTION("Xtables: countdown quota match"); @@ -20,18 +24,20 @@ static DEFINE_SPINLOCK(quota_lock); static bool quota_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - struct xt_quota_info *q = - ((const struct xt_quota_info *)par->matchinfo)->master; + struct xt_quota_info *q = (void *)par->matchinfo; + struct xt_quota_priv *priv = q->master; bool ret = q->flags & XT_QUOTA_INVERT; spin_lock_bh("a_lock); - if (q->quota >= skb->len) { - q->quota -= skb->len; + if (priv->quota >= skb->len) { + priv->quota -= skb->len; ret = !ret; } else { /* we do not allow even small packets from now on */ - q->quota = 0; + priv->quota = 0; } + /* Copy quota back to matchinfo so that iptables can display it */ + q->quota = priv->quota; spin_unlock_bh("a_lock); return ret; @@ -43,17 +49,28 @@ static bool quota_mt_check(const struct xt_mtchk_param *par) if (q->flags & ~XT_QUOTA_MASK) return false; - /* For SMP, we only want to use one set of counters. 
*/ - q->master = q; + + q->master = kmalloc(sizeof(*q->master), GFP_KERNEL); + if (q->master == NULL) + return -ENOMEM; + return true; } +static void quota_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_quota_info *q = par->matchinfo; + + kfree(q->master); +} + static struct xt_match quota_mt_reg __read_mostly = { .name = "quota", .revision = 0, .family = NFPROTO_UNSPEC, .match = quota_mt, .checkentry = quota_mt_check, + .destroy = quota_mt_destroy, .matchsize = sizeof(struct xt_quota_info), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 0d75141139d5..d8c0f8f1a78e 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -16,6 +16,10 @@ #include #include +struct xt_statistic_priv { + uint32_t count; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); @@ -27,7 +31,7 @@ static DEFINE_SPINLOCK(nth_lock); static bool statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - struct xt_statistic_info *info = (void *)par->matchinfo; + const struct xt_statistic_info *info = par->matchinfo; bool ret = info->flags & XT_STATISTIC_INVERT; switch (info->mode) { @@ -36,10 +40,9 @@ statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par) ret = !ret; break; case XT_STATISTIC_MODE_NTH: - info = info->master; spin_lock_bh(&nth_lock); - if (info->u.nth.count++ == info->u.nth.every) { - info->u.nth.count = 0; + if (info->master->count++ == info->u.nth.every) { + info->master->count = 0; ret = !ret; } spin_unlock_bh(&nth_lock); @@ -56,16 +59,31 @@ static bool statistic_mt_check(const struct xt_mtchk_param *par) if (info->mode > XT_STATISTIC_MODE_MAX || info->flags & ~XT_STATISTIC_MASK) return false; - info->master = info; + + info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); + if (info->master == NULL) { + printk(KERN_ERR KBUILD_MODNAME ": Out of memory\n"); + return false; + } + info->master->count = info->u.nth.count; + return true; } +static void statistic_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_statistic_info *info = par->matchinfo; + + kfree(info->master); +} + static struct xt_match xt_statistic_mt_reg __read_mostly = { .name = "statistic", .revision = 0, .family = NFPROTO_UNSPEC, .match = statistic_mt, .checkentry = statistic_mt_check, + .destroy = statistic_mt_destroy, .matchsize = sizeof(struct xt_statistic_info), .me = THIS_MODULE, }; -- cgit v1.2.3-71-gd317 From 0269ea4937343536ec7e85649932bc8c9686ea78 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 16 Mar 2009 17:10:36 +0100 Subject: netfilter: xtables: add cluster match This patch adds the iptables cluster match. This match can be used to deploy gateway and back-end load-sharing clusters. The cluster can be composed of 32 nodes maximum (although I have only tested this with two nodes, so I cannot tell what is the real scalability limit of this solution in terms of cluster nodes). Assuming that all the nodes see all packets (see below for an example on how to do that if your switch does not allow this), the cluster match decides if this node has to handle a packet given: (jhash(source IP) % total_nodes) & node_mask For related connections, the master conntrack is used. 
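Expressed as a minimal sketch (the helper name is illustrative; the in-kernel match below scales the hash with a 64-bit multiply instead of a modulo, and uses the master conntrack's source for related connections):

        #include <linux/jhash.h>

        /* does this cluster node handle a packet from saddr? */
        static bool cluster_accepts(u32 saddr, u32 hash_seed,
                                    u32 total_nodes, u32 node_mask)
        {
                u32 node = jhash_1word(saddr, hash_seed) % total_nodes;

                return !!((1U << node) & node_mask);
        }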
The following is an example of its use to deploy a gateway cluster composed of two nodes (where this is the node 1): iptables -I PREROUTING -t mangle -i eth1 -m cluster \ --cluster-total-nodes 2 --cluster-local-node 1 \ --cluster-proc-name eth1 -j MARK --set-mark 0xffff iptables -A PREROUTING -t mangle -i eth1 \ -m mark ! --mark 0xffff -j DROP iptables -A PREROUTING -t mangle -i eth2 -m cluster \ --cluster-total-nodes 2 --cluster-local-node 1 \ --cluster-proc-name eth2 -j MARK --set-mark 0xffff iptables -A PREROUTING -t mangle -i eth2 \ -m mark ! --mark 0xffff -j DROP And the following commands to make all nodes see the same packets: ip maddr add 01:00:5e:00:01:01 dev eth1 ip maddr add 01:00:5e:00:01:02 dev eth2 arptables -I OUTPUT -o eth1 --h-length 6 \ -j mangle --mangle-mac-s 01:00:5e:00:01:01 arptables -I INPUT -i eth1 --h-length 6 \ --destination-mac 01:00:5e:00:01:01 \ -j mangle --mangle-mac-d 00:zz:yy:xx:5a:27 arptables -I OUTPUT -o eth2 --h-length 6 \ -j mangle --mangle-mac-s 01:00:5e:00:01:02 arptables -I INPUT -i eth2 --h-length 6 \ --destination-mac 01:00:5e:00:01:02 \ -j mangle --mangle-mac-d 00:zz:yy:xx:5a:27 In the case of TCP connections, pickup facility has to be disabled to avoid marking TCP ACK packets coming in the reply direction as valid. echo 0 > /proc/sys/net/netfilter/nf_conntrack_tcp_loose BTW, some final notes: * This match mangles the skbuff pkt_type in case that it detects PACKET_MULTICAST for a non-multicast address. This may be done in a PKTTYPE target for this sole purpose. * This match supersedes the CLUSTERIP target. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/Kbuild | 1 + include/linux/netfilter/xt_cluster.h | 15 ++++ net/netfilter/Kconfig | 16 ++++ net/netfilter/Makefile | 1 + net/netfilter/xt_cluster.c | 164 +++++++++++++++++++++++++++++++++++ 5 files changed, 197 insertions(+) create mode 100644 include/linux/netfilter/xt_cluster.h create mode 100644 net/netfilter/xt_cluster.c (limited to 'include/linux') diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 947b47d7f6c0..af9d2fb97212 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -21,6 +21,7 @@ header-y += xt_connbytes.h header-y += xt_connlimit.h header-y += xt_connmark.h header-y += xt_conntrack.h +header-y += xt_cluster.h header-y += xt_dccp.h header-y += xt_dscp.h header-y += xt_esp.h diff --git a/include/linux/netfilter/xt_cluster.h b/include/linux/netfilter/xt_cluster.h new file mode 100644 index 000000000000..5e0a0d07b526 --- /dev/null +++ b/include/linux/netfilter/xt_cluster.h @@ -0,0 +1,15 @@ +#ifndef _XT_CLUSTER_MATCH_H +#define _XT_CLUSTER_MATCH_H + +enum xt_cluster_flags { + XT_CLUSTER_F_INV = (1 << 0) +}; + +struct xt_cluster_match_info { + u_int32_t total_nodes; + u_int32_t node_mask; + u_int32_t hash_seed; + u_int32_t flags; +}; + +#endif /* _XT_CLUSTER_MATCH_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index cdbaaff6d0d6..2562d05dbaf5 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -527,6 +527,22 @@ config NETFILTER_XT_TARGET_TCPOPTSTRIP This option adds a "TCPOPTSTRIP" target, which allows you to strip TCP options from TCP packets. +config NETFILTER_XT_MATCH_CLUSTER + tristate '"cluster" match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to build work-load-sharing clusters of + network servers/stateful firewalls without having a dedicated + load-balancing router/server/switch. 
Basically, this match returns + true when the packet must be handled by this cluster node. Thus, + all nodes see all packets and this match decides which node handles + what packets. The work-load sharing algorithm is based on source + address hashing. + + If you say Y or M here, try `iptables -m cluster --help` for + more information. + config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 7a9b8397573a..6282060fbda9 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o # matches +obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c new file mode 100644 index 000000000000..ad5bd890e4e8 --- /dev/null +++ b/net/netfilter/xt_cluster.c @@ -0,0 +1,164 @@ +/* + * (C) 2008-2009 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include + +#include +#include +#include + +static inline u_int32_t nf_ct_orig_ipv4_src(const struct nf_conn *ct) +{ + return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; +} + +static inline const void *nf_ct_orig_ipv6_src(const struct nf_conn *ct) +{ + return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6; +} + +static inline u_int32_t +xt_cluster_hash_ipv4(u_int32_t ip, const struct xt_cluster_match_info *info) +{ + return jhash_1word(ip, info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash_ipv6(const void *ip, const struct xt_cluster_match_info *info) +{ + return jhash2(ip, NF_CT_TUPLE_L3SIZE / sizeof(__u32), info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash(const struct nf_conn *ct, + const struct xt_cluster_match_info *info) +{ + u_int32_t hash = 0; + + switch(nf_ct_l3num(ct)) { + case AF_INET: + hash = xt_cluster_hash_ipv4(nf_ct_orig_ipv4_src(ct), info); + break; + case AF_INET6: + hash = xt_cluster_hash_ipv6(nf_ct_orig_ipv6_src(ct), info); + break; + default: + WARN_ON(1); + break; + } + return (((u64)hash * info->total_nodes) >> 32); +} + +static inline bool +xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) +{ + bool is_multicast = false; + + switch(family) { + case NFPROTO_IPV4: + is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr); + break; + case NFPROTO_IPV6: + is_multicast = ipv6_addr_type(&ipv6_hdr(skb)->daddr) & + IPV6_ADDR_MULTICAST; + break; + default: + WARN_ON(1); + break; + } + return is_multicast; +} + +static bool +xt_cluster_mt(const struct sk_buff *skb, const struct xt_match_param *par) +{ + struct sk_buff *pskb = (struct sk_buff *)skb; + const struct xt_cluster_match_info *info = par->matchinfo; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned long hash; + + /* This match assumes that all nodes see the same packets. This can be + * achieved if the switch that connects the cluster nodes support some + * sort of 'port mirroring'. However, if your switch does not support + * this, your cluster nodes can reply ARP request using a multicast MAC + * address. 
Thus, your switch will flood the same packets to the + * cluster nodes with the same multicast MAC address. Using a multicast + * link address is a RFC 1812 (section 3.3.2) violation, but this works + * fine in practise. + * + * Unfortunately, if you use the multicast MAC address, the link layer + * sets skbuff's pkt_type to PACKET_MULTICAST, which is not accepted + * by TCP and others for packets coming to this node. For that reason, + * this match mangles skbuff's pkt_type if it detects a packet + * addressed to a unicast address but using PACKET_MULTICAST. Yes, I + * know, matches should not alter packets, but we are doing this here + * because we would need to add a PKTTYPE target for this sole purpose. + */ + if (!xt_cluster_is_multicast_addr(skb, par->family) && + skb->pkt_type == PACKET_MULTICAST) { + pskb->pkt_type = PACKET_HOST; + } + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return false; + + if (ct == &nf_conntrack_untracked) + return false; + + if (ct->master) + hash = xt_cluster_hash(ct->master, info); + else + hash = xt_cluster_hash(ct, info); + + return !!((1 << hash) & info->node_mask) ^ + !!(info->flags & XT_CLUSTER_F_INV); +} + +static bool xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_cluster_match_info *info = par->matchinfo; + + if (info->node_mask >= (1 << info->total_nodes)) { + printk(KERN_ERR "xt_cluster: this node mask cannot be " + "higher than the total number of nodes\n"); + return false; + } + return true; +} + +static struct xt_match xt_cluster_match __read_mostly = { + .name = "cluster", + .family = NFPROTO_UNSPEC, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .me = THIS_MODULE, +}; + +static int __init xt_cluster_mt_init(void) +{ + return xt_register_match(&xt_cluster_match); +} + +static void __exit xt_cluster_mt_fini(void) +{ + xt_unregister_match(&xt_cluster_match); +} + +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: hash-based cluster match"); +MODULE_ALIAS("ipt_cluster"); +MODULE_ALIAS("ip6t_cluster"); +module_init(xt_cluster_mt_init); +module_exit(xt_cluster_mt_fini); -- cgit v1.2.3-71-gd317 From d1c76af9e2434fac3add561e26c61b06503de986 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 16 Mar 2009 10:50:02 -0700 Subject: GRO: Move netpoll checks to correct location As my netpoll fix for net doesn't really work for net-next, we need this update to move the checks into the right place. As it stands we may pass freed skbs to netpoll_receive_skb. This patch also introduces a netpoll_rx_on function to avoid GRO completely if we're invoked through netpoll. This might seem paranoid but as netpoll may have an external receive hook it's better to be safe than sorry. I don't think we need this for 2.6.29 though since there's nothing immediately broken by it. This patch also moves the GRO_* return values to netdevice.h since VLAN needs them too (I tried to avoid this originally but alas this seems to be the easiest way out). This fixes a bug in VLAN where it continued to use the old return value 2 instead of the correct GRO_DROP. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 8 ++++++++ include/linux/netpoll.h | 11 +++++++++++ net/8021q/vlan_core.c | 11 ++++------- net/core/dev.c | 17 +++-------------- 4 files changed, 26 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 493b065f76d7..be3ebd7e8ce5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -330,6 +330,14 @@ enum NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ }; +enum { + GRO_MERGED, + GRO_MERGED_FREE, + GRO_HELD, + GRO_NORMAL, + GRO_DROP, +}; + extern void __napi_schedule(struct napi_struct *n); static inline int napi_disable_pending(struct napi_struct *n) diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e38d3c9dccda..de99025f2c5d 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,6 +63,13 @@ static inline int netpoll_rx(struct sk_buff *skb) return ret; } +static inline int netpoll_rx_on(struct sk_buff *skb) +{ + struct netpoll_info *npinfo = skb->dev->npinfo; + + return npinfo && (npinfo->rx_np || npinfo->rx_flags); +} + static inline int netpoll_receive_skb(struct sk_buff *skb) { if (!list_empty(&skb->dev->napi_list)) @@ -99,6 +106,10 @@ static inline int netpoll_rx(struct sk_buff *skb) { return 0; } +static inline int netpoll_rx_on(struct sk_buff *skb) +{ + return 0; +} static inline int netpoll_receive_skb(struct sk_buff *skb) { return 0; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 2d6e405fc498..6227248597c4 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -79,6 +79,9 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, { struct sk_buff *p; + if (netpoll_rx_on(skb)) + return GRO_NORMAL; + if (skb_bond_should_drop(skb)) goto drop; @@ -98,7 +101,7 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, return dev_gro_receive(napi, skb); drop: - return 2; + return GRO_DROP; } int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, @@ -106,9 +109,6 @@ int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, { skb_gro_reset_offset(skb); - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); } EXPORT_SYMBOL(vlan_gro_receive); @@ -121,9 +121,6 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, if (!skb) return NET_RX_DROP; - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - return napi_frags_finish(napi, skb, vlan_gro_common(napi, grp, vlan_tci, skb)); } diff --git a/net/core/dev.c b/net/core/dev.c index 033d7ca28e6e..7bd3c29c5a78 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,14 +135,6 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) -enum { - GRO_MERGED, - GRO_MERGED_FREE, - GRO_HELD, - GRO_NORMAL, - GRO_DROP, -}; - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. 
@@ -2474,6 +2466,9 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + if (netpoll_rx_on(skb)) + return GRO_NORMAL; + for (p = napi->gro_list; p; p = p->next) { NAPI_GRO_CB(p)->same_flow = !compare_ether_header( skb_mac_header(p), skb_gro_mac_header(skb)); @@ -2487,9 +2482,6 @@ int napi_skb_finish(int ret, struct sk_buff *skb) { int err = NET_RX_SUCCESS; - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - switch (ret) { case GRO_NORMAL: return netif_receive_skb(skb); @@ -2587,9 +2579,6 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { int err = NET_RX_SUCCESS; - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - switch (ret) { case GRO_NORMAL: case GRO_HELD: -- cgit v1.2.3-71-gd317 From 7db90f4a25bd4184f3d36dfa4f512f53b0448da7 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 9 Mar 2009 22:07:41 -0400 Subject: cfg80211: move enum reg_set_by to nl80211.h We do this so we can later inform userspace who set the regulatory domain and provide details of the request. Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- drivers/net/wireless/ath9k/main.c | 2 +- drivers/net/wireless/ath9k/regd.c | 31 ++++++++------- drivers/net/wireless/ath9k/regd.h | 3 +- include/linux/nl80211.h | 19 +++++++++ include/net/cfg80211.h | 24 ++---------- net/wireless/core.c | 2 +- net/wireless/core.h | 3 +- net/wireless/reg.c | 82 +++++++++++++++++++++------------------ 8 files changed, 90 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath9k/main.c b/drivers/net/wireless/ath9k/main.c index 1d6b05c0d800..e9b3f365f099 100644 --- a/drivers/net/wireless/ath9k/main.c +++ b/drivers/net/wireless/ath9k/main.c @@ -1670,7 +1670,7 @@ int ath_attach(u16 devid, struct ath_softc *sc) } wiphy_apply_custom_regulatory(hw->wiphy, regd); ath9k_reg_apply_radar_flags(hw->wiphy); - ath9k_reg_apply_world_flags(hw->wiphy, REGDOM_SET_BY_DRIVER); + ath9k_reg_apply_world_flags(hw->wiphy, NL80211_REGDOM_SET_BY_DRIVER); INIT_WORK(&sc->chan_work, ath9k_wiphy_chan_work); INIT_DELAYED_WORK(&sc->wiphy_work, ath9k_wiphy_work); diff --git a/drivers/net/wireless/ath9k/regd.c b/drivers/net/wireless/ath9k/regd.c index ff0afc02f3ce..b8f9b6d6bec4 100644 --- a/drivers/net/wireless/ath9k/regd.c +++ b/drivers/net/wireless/ath9k/regd.c @@ -168,8 +168,9 @@ static bool ath9k_is_radar_freq(u16 center_freq) * received a beacon on a channel we can enable active scan and * adhoc (or beaconing). 
*/ -static void ath9k_reg_apply_beaconing_flags(struct wiphy *wiphy, - enum reg_set_by setby) +static void ath9k_reg_apply_beaconing_flags( + struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) { enum ieee80211_band band; struct ieee80211_supported_band *sband; @@ -194,7 +195,7 @@ static void ath9k_reg_apply_beaconing_flags(struct wiphy *wiphy, (ch->flags & IEEE80211_CHAN_RADAR)) continue; - if (setby == REGDOM_SET_BY_COUNTRY_IE) { + if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) { r = freq_reg_info(wiphy, ch->center_freq, &bandwidth, ®_rule); if (r) @@ -226,8 +227,9 @@ static void ath9k_reg_apply_beaconing_flags(struct wiphy *wiphy, } /* Allows active scan scan on Ch 12 and 13 */ -static void ath9k_reg_apply_active_scan_flags(struct wiphy *wiphy, - enum reg_set_by setby) +static void ath9k_reg_apply_active_scan_flags( + struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) { struct ieee80211_supported_band *sband; struct ieee80211_channel *ch; @@ -241,7 +243,7 @@ static void ath9k_reg_apply_active_scan_flags(struct wiphy *wiphy, * If no country IE has been received always enable active scan * on these channels. This is only done for specific regulatory SKUs */ - if (setby != REGDOM_SET_BY_COUNTRY_IE) { + if (initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { ch = &sband->channels[11]; /* CH 12 */ if (ch->flags & IEEE80211_CHAN_PASSIVE_SCAN) ch->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN; @@ -308,7 +310,8 @@ void ath9k_reg_apply_radar_flags(struct wiphy *wiphy) } } -void ath9k_reg_apply_world_flags(struct wiphy *wiphy, enum reg_set_by setby) +void ath9k_reg_apply_world_flags(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) { struct ieee80211_hw *hw = wiphy_to_ieee80211_hw(wiphy); struct ath_wiphy *aphy = hw->priv; @@ -320,11 +323,11 @@ void ath9k_reg_apply_world_flags(struct wiphy *wiphy, enum reg_set_by setby) case 0x63: case 0x66: case 0x67: - ath9k_reg_apply_beaconing_flags(wiphy, setby); + ath9k_reg_apply_beaconing_flags(wiphy, initiator); break; case 0x68: - ath9k_reg_apply_beaconing_flags(wiphy, setby); - ath9k_reg_apply_active_scan_flags(wiphy, setby); + ath9k_reg_apply_beaconing_flags(wiphy, initiator); + ath9k_reg_apply_active_scan_flags(wiphy, initiator); break; } return; @@ -340,11 +343,11 @@ int ath9k_reg_notifier(struct wiphy *wiphy, struct regulatory_request *request) ath9k_reg_apply_radar_flags(wiphy); switch (request->initiator) { - case REGDOM_SET_BY_DRIVER: - case REGDOM_SET_BY_CORE: - case REGDOM_SET_BY_USER: + case NL80211_REGDOM_SET_BY_DRIVER: + case NL80211_REGDOM_SET_BY_CORE: + case NL80211_REGDOM_SET_BY_USER: break; - case REGDOM_SET_BY_COUNTRY_IE: + case NL80211_REGDOM_SET_BY_COUNTRY_IE: if (ath9k_is_world_regd(sc->sc_ah)) ath9k_reg_apply_world_flags(wiphy, request->initiator); break; diff --git a/drivers/net/wireless/ath9k/regd.h b/drivers/net/wireless/ath9k/regd.h index d48160d0c0e9..8f885f3bc8df 100644 --- a/drivers/net/wireless/ath9k/regd.h +++ b/drivers/net/wireless/ath9k/regd.h @@ -236,7 +236,8 @@ enum CountryCode { bool ath9k_is_world_regd(struct ath_hw *ah); const struct ieee80211_regdomain *ath9k_world_regdomain(struct ath_hw *ah); const struct ieee80211_regdomain *ath9k_default_world_regdomain(void); -void ath9k_reg_apply_world_flags(struct wiphy *wiphy, enum reg_set_by setby); +void ath9k_reg_apply_world_flags(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator); void ath9k_reg_apply_radar_flags(struct wiphy *wiphy); int ath9k_regd_init(struct ath_hw *ah); bool ath9k_regd_is_eeprom_valid(struct ath_hw *ah); diff --git 
a/include/linux/nl80211.h b/include/linux/nl80211.h index f6e56370ea65..c0fd432b57dc 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -672,6 +672,25 @@ enum nl80211_bitrate_attr { NL80211_BITRATE_ATTR_MAX = __NL80211_BITRATE_ATTR_AFTER_LAST - 1 }; +/** + * enum nl80211_initiator - Indicates the initiator of a reg domain request + * @NL80211_REGDOM_SET_BY_CORE: Core queried CRDA for a dynamic world + * regulatory domain. + * @NL80211_REGDOM_SET_BY_USER: User asked the wireless core to set the + * regulatory domain. + * @NL80211_REGDOM_SET_BY_DRIVER: a wireless drivers has hinted to the + * wireless core it thinks its knows the regulatory domain we should be in. + * @NL80211_REGDOM_SET_BY_COUNTRY_IE: the wireless core has received an + * 802.11 country information element with regulatory information it + * thinks we should consider. + */ +enum nl80211_reg_initiator { + NL80211_REGDOM_SET_BY_CORE, + NL80211_REGDOM_SET_BY_USER, + NL80211_REGDOM_SET_BY_DRIVER, + NL80211_REGDOM_SET_BY_COUNTRY_IE, +}; + /** * enum nl80211_reg_rule_attr - regulatory rule attributes * @NL80211_ATTR_REG_RULE_FLAGS: a set of flags which specify additional diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f195ea460811..50f3fd9ff524 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -348,28 +348,10 @@ struct bss_parameters { u8 basic_rates_len; }; -/** - * enum reg_set_by - Indicates who is trying to set the regulatory domain - * @REGDOM_SET_BY_CORE: Core queried CRDA for a dynamic world regulatory domain. - * @REGDOM_SET_BY_USER: User asked the wireless core to set the - * regulatory domain. - * @REGDOM_SET_BY_DRIVER: a wireless drivers has hinted to the wireless core - * it thinks its knows the regulatory domain we should be in. - * @REGDOM_SET_BY_COUNTRY_IE: the wireless core has received an 802.11 country - * information element with regulatory information it thinks we - * should consider. - */ -enum reg_set_by { - REGDOM_SET_BY_CORE, - REGDOM_SET_BY_USER, - REGDOM_SET_BY_DRIVER, - REGDOM_SET_BY_COUNTRY_IE, -}; - /** * enum environment_cap - Environment parsed from country IE * @ENVIRON_ANY: indicates country IE applies to both indoor and - * outdoor operation. + * outdoor operation. * @ENVIRON_INDOOR: indicates country IE applies only to indoor operation * @ENVIRON_OUTDOOR: indicates country IE applies only to outdoor operation */ @@ -388,7 +370,7 @@ enum environment_cap { * and potentially inform users of which devices specifically * cased the conflicts. * @initiator: indicates who sent this request, could be any of - * of those set in reg_set_by, %REGDOM_SET_BY_* + * of those set in nl80211_reg_initiator (%NL80211_REGDOM_SET_BY_*) * @alpha2: the ISO / IEC 3166 alpha2 country code of the requested * regulatory domain. 
We have a few special codes: * 00 - World regulatory domain @@ -405,7 +387,7 @@ enum environment_cap { */ struct regulatory_request { int wiphy_idx; - enum reg_set_by initiator; + enum nl80211_reg_initiator initiator; char alpha2[2]; bool intersect; u32 country_ie_checksum; diff --git a/net/wireless/core.c b/net/wireless/core.c index dd7f222919fe..c939f5ee065e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -350,7 +350,7 @@ int wiphy_register(struct wiphy *wiphy) mutex_lock(&cfg80211_mutex); /* set up regulatory info */ - wiphy_update_regulatory(wiphy, REGDOM_SET_BY_CORE); + wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE); res = device_add(&drv->wiphy.dev); if (res) diff --git a/net/wireless/core.h b/net/wireless/core.h index f6c53f5807f4..6acd483a61f8 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -136,7 +136,8 @@ extern int cfg80211_dev_rename(struct cfg80211_registered_device *drv, char *newname); void ieee80211_set_bitrate_flags(struct wiphy *wiphy); -void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby); +void wiphy_update_regulatory(struct wiphy *wiphy, + enum nl80211_reg_initiator setby); void cfg80211_bss_expire(struct cfg80211_registered_device *dev); void cfg80211_bss_age(struct cfg80211_registered_device *dev, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 47ff44751b70..68fde6d33dc3 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -857,8 +857,8 @@ static int freq_reg_info_regd(struct wiphy *wiphy, * Follow the driver's regulatory domain, if present, unless a country * IE has been processed or a user wants to help complaince further */ - if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE && - last_request->initiator != REGDOM_SET_BY_USER && + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + last_request->initiator != NL80211_REGDOM_SET_BY_USER && wiphy->regd) regd = wiphy->regd; @@ -943,7 +943,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, * http://tinyurl.com/11d-clarification */ if (r == -ERANGE && - last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { + last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { #ifdef CONFIG_CFG80211_REG_DEBUG printk(KERN_DEBUG "cfg80211: Leaving channel %d MHz " "intact on %s - no rule found in band on " @@ -956,7 +957,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, * for the band so we respect its band definitions */ #ifdef CONFIG_CFG80211_REG_DEBUG - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) printk(KERN_DEBUG "cfg80211: Disabling " "channel %d MHz on %s due to " "Country IE\n", @@ -970,7 +972,7 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, power_rule = ®_rule->power_rule; - if (last_request->initiator == REGDOM_SET_BY_DRIVER && + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->strict_regulatory) { /* @@ -1011,11 +1013,12 @@ static void handle_band(struct wiphy *wiphy, enum ieee80211_band band) handle_channel(wiphy, band, i); } -static bool ignore_reg_update(struct wiphy *wiphy, enum reg_set_by setby) +static bool ignore_reg_update(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) { if (!last_request) return true; - if (setby == REGDOM_SET_BY_CORE && + if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->custom_regulatory) return true; /* @@ -1028,12 +1031,12 @@ static 
bool ignore_reg_update(struct wiphy *wiphy, enum reg_set_by setby) return false; } -static void update_all_wiphy_regulatory(enum reg_set_by setby) +static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) { struct cfg80211_registered_device *drv; list_for_each_entry(drv, &cfg80211_drv_list, list) - wiphy_update_regulatory(&drv->wiphy, setby); + wiphy_update_regulatory(&drv->wiphy, initiator); } static void handle_reg_beacon(struct wiphy *wiphy, @@ -1124,7 +1127,7 @@ static bool reg_is_world_roaming(struct wiphy *wiphy) if (is_world_regdom(cfg80211_regdomain->alpha2) || (wiphy->regd && is_world_regdom(wiphy->regd->alpha2))) return true; - if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE && + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && wiphy->custom_regulatory) return true; return false; @@ -1138,11 +1141,12 @@ static void reg_process_beacons(struct wiphy *wiphy) wiphy_update_beacon_reg(wiphy); } -void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby) +void wiphy_update_regulatory(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) { enum ieee80211_band band; - if (ignore_reg_update(wiphy, setby)) + if (ignore_reg_update(wiphy, initiator)) goto out; for (band = 0; band < IEEE80211_NUM_BANDS; band++) { if (wiphy->bands[band]) @@ -1255,15 +1259,16 @@ static int ignore_request(struct wiphy *wiphy, return 0; switch (pending_request->initiator) { - case REGDOM_SET_BY_CORE: + case NL80211_REGDOM_SET_BY_CORE: return -EINVAL; - case REGDOM_SET_BY_COUNTRY_IE: + case NL80211_REGDOM_SET_BY_COUNTRY_IE: last_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); if (unlikely(!is_an_alpha2(pending_request->alpha2))) return -EINVAL; - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { if (last_wiphy != wiphy) { /* * Two cards with two APs claiming different @@ -1284,8 +1289,8 @@ static int ignore_request(struct wiphy *wiphy, return -EALREADY; } return REG_INTERSECT; - case REGDOM_SET_BY_DRIVER: - if (last_request->initiator == REGDOM_SET_BY_CORE) { + case NL80211_REGDOM_SET_BY_DRIVER: + if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) { if (is_old_static_regdom(cfg80211_regdomain)) return 0; if (regdom_changes(pending_request->alpha2)) @@ -1298,28 +1303,28 @@ static int ignore_request(struct wiphy *wiphy, * back in or if you add a new device for which the previously * loaded card also agrees on the regulatory domain. 
*/ - if (last_request->initiator == REGDOM_SET_BY_DRIVER && + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && !regdom_changes(pending_request->alpha2)) return -EALREADY; return REG_INTERSECT; - case REGDOM_SET_BY_USER: - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) + case NL80211_REGDOM_SET_BY_USER: + if (last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) return REG_INTERSECT; /* * If the user knows better the user should set the regdom * to their country before the IE is picked up */ - if (last_request->initiator == REGDOM_SET_BY_USER && + if (last_request->initiator == NL80211_REGDOM_SET_BY_USER && last_request->intersect) return -EOPNOTSUPP; /* * Process user requests only after previous user/driver/core * requests have been processed */ - if (last_request->initiator == REGDOM_SET_BY_CORE || - last_request->initiator == REGDOM_SET_BY_DRIVER || - last_request->initiator == REGDOM_SET_BY_USER) { + if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE || + last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER || + last_request->initiator == NL80211_REGDOM_SET_BY_USER) { if (regdom_changes(last_request->alpha2)) return -EAGAIN; } @@ -1359,7 +1364,8 @@ static int __regulatory_hint(struct wiphy *wiphy, r = ignore_request(wiphy, pending_request); if (r == REG_INTERSECT) { - if (pending_request->initiator == REGDOM_SET_BY_DRIVER) { + if (pending_request->initiator == + NL80211_REGDOM_SET_BY_DRIVER) { r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); if (r) { kfree(pending_request); @@ -1374,7 +1380,8 @@ static int __regulatory_hint(struct wiphy *wiphy, * wiphy */ if (r == -EALREADY && - pending_request->initiator == REGDOM_SET_BY_DRIVER) { + pending_request->initiator == + NL80211_REGDOM_SET_BY_DRIVER) { r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); if (r) { kfree(pending_request); @@ -1425,7 +1432,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) if (wiphy_idx_valid(reg_request->wiphy_idx)) wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); - if (reg_request->initiator == REGDOM_SET_BY_DRIVER && + if (reg_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && !wiphy) { kfree(reg_request); goto out; @@ -1439,7 +1446,7 @@ out: mutex_unlock(&cfg80211_mutex); } -/* Processes regulatory hints, this is all the REGDOM_SET_BY_* */ +/* Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* */ static void reg_process_pending_hints(void) { struct regulatory_request *reg_request; @@ -1523,7 +1530,7 @@ static int regulatory_hint_core(const char *alpha2) request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; - request->initiator = REGDOM_SET_BY_CORE; + request->initiator = NL80211_REGDOM_SET_BY_CORE; queue_regulatory_request(request); @@ -1544,7 +1551,7 @@ int regulatory_hint_user(const char *alpha2) request->wiphy_idx = WIPHY_IDX_STALE; request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; - request->initiator = REGDOM_SET_BY_USER, + request->initiator = NL80211_REGDOM_SET_BY_USER, queue_regulatory_request(request); @@ -1570,7 +1577,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; - request->initiator = REGDOM_SET_BY_DRIVER; + request->initiator = NL80211_REGDOM_SET_BY_DRIVER; queue_regulatory_request(request); @@ -1719,7 +1726,7 @@ void regulatory_hint_11d(struct wiphy *wiphy, request->wiphy_idx = get_wiphy_idx(wiphy); request->alpha2[0] = rd->alpha2[0]; request->alpha2[1] = rd->alpha2[1]; - request->initiator = 
REGDOM_SET_BY_COUNTRY_IE; + request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE; request->country_ie_checksum = checksum; request->country_ie_env = env; @@ -1827,7 +1834,8 @@ static void print_regdomain(const struct ieee80211_regdomain *rd) if (is_intersected_alpha2(rd->alpha2)) { - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { struct cfg80211_registered_device *drv; drv = cfg80211_drv_by_wiphy_idx( last_request->wiphy_idx); @@ -1919,7 +1927,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) * rd is non static (it means CRDA was present and was used last) * and the pending request came in from a country IE */ - if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) { + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { /* * If someone else asked us to change the rd lets only bother * checking if the alpha2 changes if CRDA was already called @@ -1951,7 +1959,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) if (!last_request->intersect) { int r; - if (last_request->initiator != REGDOM_SET_BY_DRIVER) { + if (last_request->initiator != NL80211_REGDOM_SET_BY_DRIVER) { reset_regdomains(); cfg80211_regdomain = rd; return 0; @@ -1975,7 +1983,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) /* Intersection requires a bit more work */ - if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) { + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { intersected_rd = regdom_intersect(rd, cfg80211_regdomain); if (!intersected_rd) @@ -1986,7 +1994,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) * However if a driver requested this specific regulatory * domain we keep it for its private use */ - if (last_request->initiator == REGDOM_SET_BY_DRIVER) + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER) request_wiphy->regd = rd; else kfree(rd); -- cgit v1.2.3-71-gd317 From 73d54c9e74c4d8ee8a41bc516f481f0f754eca32 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 9 Mar 2009 22:07:42 -0400 Subject: cfg80211: add regulatory netlink multicast group This allows us to send to userspace "regulatory" events. For now we just send an event when we change regulatory domains. We also notify userspace when devices are using their own custom world roaming regulatory domains. Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. 
Linville --- include/linux/nl80211.h | 48 ++++++++++++++++++++++++++++++++++++++ net/wireless/core.c | 11 +++++++++ net/wireless/nl80211.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.h | 5 ++++ net/wireless/reg.c | 13 ++++++++++- 5 files changed, 138 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index c0fd432b57dc..f33aa08dd9b3 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -150,6 +150,17 @@ * @NL80211_CMD_SCAN_ABORTED: scan was aborted, for unspecified reasons, * partial scan results may be available * + * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain + * has been changed and provides details of the request information + * that caused the change such as who initiated the regulatory request + * (%NL80211_ATTR_REG_INITIATOR), the wiphy_idx + * (%NL80211_ATTR_REG_ALPHA2) on which the request was made from if + * the initiator was %NL80211_REGDOM_SET_BY_COUNTRY_IE or + * %NL80211_REGDOM_SET_BY_DRIVER, the type of regulatory domain + * set (%NL80211_ATTR_REG_TYPE), if the type of regulatory domain is + * %NL80211_REG_TYPE_COUNTRY the alpha2 to which we have moved on + * to (%NL80211_ATTR_REG_ALPHA2). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -204,6 +215,8 @@ enum nl80211_commands { NL80211_CMD_NEW_SCAN_RESULTS, NL80211_CMD_SCAN_ABORTED, + NL80211_CMD_REG_CHANGE, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -218,6 +231,8 @@ enum nl80211_commands { #define NL80211_CMD_SET_BSS NL80211_CMD_SET_BSS #define NL80211_CMD_SET_MGMT_EXTRA_IE NL80211_CMD_SET_MGMT_EXTRA_IE +#define NL80211_CMD_REG_CHANGE NL80211_CMD_REG_CHANGE + /** * enum nl80211_attrs - nl80211 netlink attributes * @@ -329,6 +344,11 @@ enum nl80211_commands { * messages carried the same generation number) * @NL80211_ATTR_BSS: scan result BSS * + * @NL80211_ATTR_REG_INITIATOR: indicates who requested the regulatory domain + * currently in effect. This could be any of the %NL80211_REGDOM_SET_BY_* + * @NL80211_ATTR_REG_TYPE: indicates the type of the regulatory domain currently + * set. This can be one of the nl80211_reg_type (%NL80211_REGDOM_TYPE_*) + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -403,6 +423,9 @@ enum nl80211_attrs { NL80211_ATTR_SCAN_GENERATION, NL80211_ATTR_BSS, + NL80211_ATTR_REG_INITIATOR, + NL80211_ATTR_REG_TYPE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -420,6 +443,8 @@ enum nl80211_attrs { #define NL80211_ATTR_WIPHY_CHANNEL_TYPE NL80211_ATTR_WIPHY_CHANNEL_TYPE #define NL80211_ATTR_MGMT_SUBTYPE NL80211_ATTR_MGMT_SUBTYPE #define NL80211_ATTR_IE NL80211_ATTR_IE +#define NL80211_ATTR_REG_INITIATOR NL80211_ATTR_REG_INITIATOR +#define NL80211_ATTR_REG_TYPE NL80211_ATTR_REG_TYPE #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_REG_RULES 32 @@ -691,6 +716,29 @@ enum nl80211_reg_initiator { NL80211_REGDOM_SET_BY_COUNTRY_IE, }; +/** + * enum nl80211_reg_type - specifies the type of regulatory domain + * @NL80211_REGDOM_TYPE_COUNTRY: the regulatory domain set is one that pertains + * to a specific country. When this is set you can count on the + * ISO / IEC 3166 alpha2 country code being valid. + * @NL80211_REGDOM_TYPE_WORLD: the regulatory set domain is the world regulatory + * domain. 
+ * @NL80211_REGDOM_TYPE_CUSTOM_WORLD: the regulatory domain set is a custom + * driver specific world regulatory domain. These do not apply system-wide + * and are only applicable to the individual devices which have requested + * them to be applied. + * @NL80211_REGDOM_TYPE_INTERSECTION: the regulatory domain set is the product + * of an intersection between two regulatory domains -- the previously + * set regulatory domain on the system and the last accepted regulatory + * domain request to be processed. + */ +enum nl80211_reg_type { + NL80211_REGDOM_TYPE_COUNTRY, + NL80211_REGDOM_TYPE_WORLD, + NL80211_REGDOM_TYPE_CUSTOM_WORLD, + NL80211_REGDOM_TYPE_INTERSECTION, +}; + /** * enum nl80211_reg_rule_attr - regulatory rule attributes * @NL80211_ATTR_REG_RULE_FLAGS: a set of flags which specify additional diff --git a/net/wireless/core.c b/net/wireless/core.c index c939f5ee065e..17fe39049740 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -365,6 +365,17 @@ int wiphy_register(struct wiphy *wiphy) if (IS_ERR(drv->wiphy.debugfsdir)) drv->wiphy.debugfsdir = NULL; + if (wiphy->custom_regulatory) { + struct regulatory_request request; + + request.wiphy_idx = get_wiphy_idx(wiphy); + request.initiator = NL80211_REGDOM_SET_BY_DRIVER; + request.alpha2[0] = '9'; + request.alpha2[1] = '9'; + + nl80211_send_reg_change_event(&request); + } + res = 0; out_unlock: mutex_unlock(&cfg80211_mutex); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 531bb67cf502..8ac3d26014a8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2739,6 +2739,9 @@ static struct genl_multicast_group nl80211_config_mcgrp = { static struct genl_multicast_group nl80211_scan_mcgrp = { .name = "scan", }; +static struct genl_multicast_group nl80211_regulatory_mcgrp = { + .name = "regulatory", +}; /* notification functions */ @@ -2818,6 +2821,61 @@ void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL); } +/* + * This can happen on global regulatory changes or device specific settings + * based on custom world regulatory domains. 
+ */ +void nl80211_send_reg_change_event(struct regulatory_request *request) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_CHANGE); + if (!hdr) { + nlmsg_free(msg); + return; + } + + /* Userspace can always count this one always being set */ + NLA_PUT_U8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator); + + if (request->alpha2[0] == '0' && request->alpha2[1] == '0') + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_WORLD); + else if (request->alpha2[0] == '9' && request->alpha2[1] == '9') + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_CUSTOM_WORLD); + else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') || + request->intersect) + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_INTERSECTION); + else { + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_COUNTRY); + NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, request->alpha2); + } + + if (wiphy_idx_valid(request->wiphy_idx)) + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_regulatory_mcgrp.id, GFP_KERNEL); + + return; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + /* initialisation/exit functions */ int nl80211_init(void) @@ -2842,6 +2900,10 @@ int nl80211_init(void) if (err) goto err_out; + err = genl_register_mc_group(&nl80211_fam, &nl80211_regulatory_mcgrp); + if (err) + goto err_out; + return 0; err_out: genl_unregister_family(&nl80211_fam); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 69787b621365..e65a3c38c52f 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -11,6 +11,7 @@ extern void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, struct net_device *netdev); extern void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, struct net_device *netdev); +extern void nl80211_send_reg_change_event(struct regulatory_request *request); #else static inline int nl80211_init(void) { @@ -31,6 +32,10 @@ static inline void nl80211_send_scan_aborted( struct cfg80211_registered_device *rdev, struct net_device *netdev) {} +static inline void +nl80211_send_reg_change_event(struct regulatory_request *request) +{ +} #endif /* CONFIG_NL80211 */ #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 68fde6d33dc3..eb8b8ed16155 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -41,6 +41,7 @@ #include #include "core.h" #include "reg.h" +#include "nl80211.h" /* Receipt of information from last regulatory request */ static struct regulatory_request *last_request; @@ -1403,8 +1404,16 @@ new_request: pending_request = NULL; /* When r == REG_INTERSECT we do need to call CRDA */ - if (r < 0) + if (r < 0) { + /* + * Since CRDA will not be called in this case as we already + * have applied the requested regulatory domain before we just + * inform userspace we have processed the request + */ + if (r == -EALREADY) + nl80211_send_reg_change_event(last_request); return r; + } /* * Note: When CONFIG_WIRELESS_OLD_REGULATORY is enabled @@ -2084,6 +2093,8 @@ int set_regdom(const struct ieee80211_regdomain *rd) print_regdomain(cfg80211_regdomain); + nl80211_send_reg_change_event(last_request); + return r; } -- cgit v1.2.3-71-gd317 From ac26c18bd35d982d1ba06020a992b1085fefc3e2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: 
Thu, 12 Feb 2009 16:19:13 +0100 Subject: dma-debug: add function to dump dma mappings This adds a function to dump the DMA mappings that the debugging code is aware of -- either for a single device, or for _all_ devices. This can be useful for debugging -- sticking a call to it in the DMA page fault handler, for example, to see if the faulting address _should_ be mapped or not, and hence work out whether it's IOMMU bugs we're seeing, or driver bugs. Signed-off-by: David Woodhouse --- include/linux/dma-debug.h | 6 ++++++ lib/dma-debug.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 4985c6c5237e..46a11c10da04 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -76,6 +76,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction); +extern void debug_dma_dump_mappings(struct device *dev); + #else /* CONFIG_DMA_API_DEBUG */ static inline void dma_debug_init(u32 num_entries) @@ -156,6 +158,10 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, { } +static inline void debug_dma_dump_mappings(struct device *dev) +{ +} + #endif /* CONFIG_DMA_API_DEBUG */ #endif /* __DMA_DEBUG_H */ diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 9d11e89c2ee2..91ed1dfdbaac 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -193,6 +193,36 @@ static void hash_bucket_del(struct dma_debug_entry *entry) list_del(&entry->list); } +/* + * Dump mapping entries for debugging purposes + */ +void debug_dma_dump_mappings(struct device *dev) +{ + int idx; + + for (idx = 0; idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&bucket->lock, flags); + + list_for_each_entry(entry, &bucket->list, list) { + if (!dev || dev == entry->dev) { + dev_info(entry->dev, + "%s idx %d P=%Lx D=%Lx L=%Lx %s\n", + type2name[entry->type], idx, + (unsigned long long)entry->paddr, + entry->dev_addr, entry->size, + dir2name[entry->direction]); + } + } + + spin_unlock_irqrestore(&bucket->lock, flags); + } +} +EXPORT_SYMBOL(debug_dma_dump_mappings); + /* * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. -- cgit v1.2.3-71-gd317 From 41531c8f5f05aba5ec645d9770557eedbf75b422 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 16 Mar 2009 17:32:14 +0100 Subject: dma-debug: add a check dma memory leaks Impact: allow architectures to monitor busses for dma mem leakage This patch adds checking code to detect if a device has pending DMA operations when it is about to be unbound from its device driver. 
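As an illustrative sketch, not part of the patch: this is roughly how an
architecture could wire the new hook up for PCI devices. The function name
arch_dma_debug_init() and the entry count are made up, and treating PCI as the
bus of interest is an assumption.

#include <linux/dma-debug.h>
#include <linux/init.h>
#include <linux/pci.h>

static int __init arch_dma_debug_init(void)
{
        /* Preallocate tracking entries; 4096 is an arbitrary example. */
        dma_debug_init(4096);

        /* Warn when a PCI device is unbound with mappings still live. */
        dma_debug_add_bus(&pci_bus_type);

        return 0;
}
fs_initcall(arch_dma_debug_init);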
Signed-off-by: Joerg Roedel --- include/linux/dma-debug.h | 7 ++++++ lib/dma-debug.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 46a11c10da04..e851d23e91eb 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -24,9 +24,12 @@ struct device; struct scatterlist; +struct bus_type; #ifdef CONFIG_DMA_API_DEBUG +extern void dma_debug_add_bus(struct bus_type *bus); + extern void dma_debug_init(u32 num_entries); extern void debug_dma_map_page(struct device *dev, struct page *page, @@ -80,6 +83,10 @@ extern void debug_dma_dump_mappings(struct device *dev); #else /* CONFIG_DMA_API_DEBUG */ +void dma_debug_add_bus(struct bus_type *bus) +{ +} + static inline void dma_debug_init(u32 num_entries) { } diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 6022eb4a0cd0..9a350b414a50 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -400,6 +400,61 @@ out_err: return -ENOMEM; } +static int device_dma_allocations(struct device *dev) +{ + struct dma_debug_entry *entry; + unsigned long flags; + int count = 0, i; + + for (i = 0; i < HASH_SIZE; ++i) { + spin_lock_irqsave(&dma_entry_hash[i].lock, flags); + list_for_each_entry(entry, &dma_entry_hash[i].list, list) { + if (entry->dev == dev) + count += 1; + } + spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); + } + + return count; +} + +static int dma_debug_device_change(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + int count; + + + switch (action) { + case BUS_NOTIFY_UNBIND_DRIVER: + count = device_dma_allocations(dev); + if (count == 0) + break; + err_printk(dev, NULL, "DMA-API: device driver has pending " + "DMA allocations while released from device " + "[count=%d]\n", count); + break; + default: + break; + } + + return 0; +} + +void dma_debug_add_bus(struct bus_type *bus) +{ + struct notifier_block *nb; + + nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); + if (nb == NULL) { + printk(KERN_ERR "dma_debug_add_bus: out of memory\n"); + return; + } + + nb->notifier_call = dma_debug_device_change; + + bus_register_notifier(bus, nb); +} /* * Let the architectures decide how many entries should be preallocated. -- cgit v1.2.3-71-gd317 From 76a67ec6fb79ff3570dcb5342142c16098299911 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 16 Mar 2009 18:34:20 -0400 Subject: nfsd: nfsd should drop CAP_MKNOD for non-root Since creating a device node is normally an operation requiring special privilege, Igor Zhbanov points out that it is surprising (to say the least) that a client can, for example, create a device node on a filesystem exported with root_squash. So, make sure CAP_MKNOD is among the capabilities dropped when an nfsd thread handles a request from a non-root user. Reported-by: Igor Zhbanov Cc: stable@kernel.org Signed-off-by: J. 
Bruce Fields --- include/linux/capability.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 1b9872556131..4864a43b2b45 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -393,8 +393,10 @@ struct cpu_vfs_cap_data { # define CAP_FULL_SET ((kernel_cap_t){{ ~0, ~0 }}) # define CAP_INIT_EFF_SET ((kernel_cap_t){{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }}) # define CAP_FS_SET ((kernel_cap_t){{ CAP_FS_MASK_B0, CAP_FS_MASK_B1 } }) -# define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), \ - CAP_FS_MASK_B1 } }) +# define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ + | CAP_TO_MASK(CAP_SYS_RESOURCE) \ + | CAP_TO_MASK(CAP_MKNOD), \ + CAP_FS_MASK_B1 } }) #endif /* _KERNEL_CAPABILITY_U32S != 2 */ -- cgit v1.2.3-71-gd317 From 9d783ba042771284fb4ee5013c3d94220755ae7f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:04:55 -0700 Subject: x86, x2apic: enable fault handling for intr-remapping Impact: interface augmentation (not yet used) Enable fault handling flow for intr-remapping aswell. Fault handling code now shared by both dma-remapping and intr-remapping. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msidef.h | 1 + arch/x86/kernel/apic/io_apic.c | 9 +++- arch/x86/kernel/apic/probe_64.c | 9 ++++ drivers/pci/dmar.c | 102 +++++++++++++++++++++++++++++++++------- drivers/pci/intel-iommu.c | 3 +- drivers/pci/intr_remapping.c | 2 +- include/linux/dmar.h | 5 +- include/linux/intel-iommu.h | 4 +- 8 files changed, 107 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h index 6706b3006f13..4cc48af23fef 100644 --- a/arch/x86/include/asm/msidef.h +++ b/arch/x86/include/asm/msidef.h @@ -47,6 +47,7 @@ #define MSI_ADDR_DEST_ID_MASK 0x00ffff0 #define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ MSI_ADDR_DEST_ID_MASK) +#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00) #define MSI_ADDR_IR_EXT_INT (1 << 4) #define MSI_ADDR_IR_SHV (1 << 3) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 00e6071cefc4..b18a7734d689 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3294,7 +3294,12 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } else #endif { - msg->address_hi = MSI_ADDR_BASE_HI; + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = MSI_ADDR_BASE_LO | ((apic->irq_dest_mode == 0) ? @@ -3528,7 +3533,7 @@ void arch_teardown_msi_irq(unsigned int irq) destroy_irq(irq); } -#ifdef CONFIG_DMAR +#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 8d7748efe6a8..8297c2b8ed20 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -68,6 +68,15 @@ void __init default_setup_apic_routing(void) apic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } + +#ifdef CONFIG_X86_X2APIC + /* + * Now that apic routing model is selected, configure the + * fault handling for intr remapping. 
+ */ + if (intr_remapping_enabled) + enable_drhd_fault_handling(); +#endif } /* Same for both flat and physical. */ diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 75d34bf2db50..bb4ed985f9c7 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -511,6 +511,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) return -ENOMEM; iommu->seq_id = iommu_allocated++; + sprintf (iommu->name, "dmar%d", iommu->seq_id); iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE); if (!iommu->reg) { @@ -817,7 +818,13 @@ int dmar_enable_qi(struct intel_iommu *iommu) /* iommu interrupt handling. Most stuff are MSI-like. */ -static const char *fault_reason_strings[] = +enum faulttype { + DMA_REMAP, + INTR_REMAP, + UNKNOWN, +}; + +static const char *dma_remap_fault_reasons[] = { "Software", "Present bit in root entry is clear", @@ -833,14 +840,33 @@ static const char *fault_reason_strings[] = "non-zero reserved fields in CTP", "non-zero reserved fields in PTE", }; + +static const char *intr_remap_fault_reasons[] = +{ + "Detected reserved fields in the decoded interrupt-remapped request", + "Interrupt index exceeded the interrupt-remapping table size", + "Present field in the IRTE entry is clear", + "Error accessing interrupt-remapping table pointed by IRTA_REG", + "Detected reserved fields in the IRTE entry", + "Blocked a compatibility format interrupt request", + "Blocked an interrupt request due to source-id verification failure", +}; + #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1) -const char *dmar_get_fault_reason(u8 fault_reason) +const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type) { - if (fault_reason > MAX_FAULT_REASON_IDX) + if (fault_reason >= 0x20 && (fault_reason <= 0x20 + + ARRAY_SIZE(intr_remap_fault_reasons))) { + *fault_type = INTR_REMAP; + return intr_remap_fault_reasons[fault_reason - 0x20]; + } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) { + *fault_type = DMA_REMAP; + return dma_remap_fault_reasons[fault_reason]; + } else { + *fault_type = UNKNOWN; return "Unknown"; - else - return fault_reason_strings[fault_reason]; + } } void dmar_msi_unmask(unsigned int irq) @@ -897,16 +923,25 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, u8 fault_reason, u16 source_id, unsigned long long addr) { const char *reason; + int fault_type; - reason = dmar_get_fault_reason(fault_reason); + reason = dmar_get_fault_reason(fault_reason, &fault_type); - printk(KERN_ERR - "DMAR:[%s] Request device [%02x:%02x.%d] " - "fault addr %llx \n" - "DMAR:[fault reason %02d] %s\n", - (type ? "DMA Read" : "DMA Write"), - (source_id >> 8), PCI_SLOT(source_id & 0xFF), - PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); + if (fault_type == INTR_REMAP) + printk(KERN_ERR "INTR-REMAP: Request device [[%02x:%02x.%d] " + "fault index %llx\n" + "INTR-REMAP:[fault reason %02d] %s\n", + (source_id >> 8), PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr >> 48, + fault_reason, reason); + else + printk(KERN_ERR + "DMAR:[%s] Request device [%02x:%02x.%d] " + "fault addr %llx \n" + "DMAR:[fault reason %02d] %s\n", + (type ? 
"DMA Read" : "DMA Write"), + (source_id >> 8), PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); return 0; } @@ -920,10 +955,13 @@ static irqreturn_t dmar_fault(int irq, void *dev_id) spin_lock_irqsave(&iommu->register_lock, flag); fault_status = readl(iommu->reg + DMAR_FSTS_REG); + if (fault_status) + printk(KERN_ERR "DRHD: handling fault status reg %x\n", + fault_status); /* TBD: ignore advanced fault log currently */ if (!(fault_status & DMA_FSTS_PPF)) - goto clear_overflow; + goto clear_rest; fault_index = dma_fsts_fault_record_index(fault_status); reg = cap_fault_reg_offset(iommu->cap); @@ -964,11 +1002,10 @@ static irqreturn_t dmar_fault(int irq, void *dev_id) fault_index = 0; spin_lock_irqsave(&iommu->register_lock, flag); } -clear_overflow: - /* clear primary fault overflow */ +clear_rest: + /* clear all the other faults */ fault_status = readl(iommu->reg + DMAR_FSTS_REG); - if (fault_status & DMA_FSTS_PFO) - writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG); + writel(fault_status, iommu->reg + DMAR_FSTS_REG); spin_unlock_irqrestore(&iommu->register_lock, flag); return IRQ_HANDLED; @@ -978,6 +1015,12 @@ int dmar_set_interrupt(struct intel_iommu *iommu) { int irq, ret; + /* + * Check if the fault interrupt is already initialized. + */ + if (iommu->irq) + return 0; + irq = create_irq(); if (!irq) { printk(KERN_ERR "IOMMU: no free vectors\n"); @@ -1003,3 +1046,26 @@ int dmar_set_interrupt(struct intel_iommu *iommu) printk(KERN_ERR "IOMMU: can't request irq\n"); return ret; } + +int __init enable_drhd_fault_handling(void) +{ + struct dmar_drhd_unit *drhd; + + /* + * Enable fault control interrupt. + */ + for_each_drhd_unit(drhd) { + int ret; + struct intel_iommu *iommu = drhd->iommu; + ret = dmar_set_interrupt(iommu); + + if (ret) { + printk(KERN_ERR "DRHD %Lx: failed to enable fault, " + " interrupt, ret %d\n", + (unsigned long long)drhd->reg_base_addr, ret); + return -1; + } + } + + return 0; +} diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 4a4ab651b709..25fc1df486bb 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1799,7 +1799,7 @@ static int __init init_dmars(void) struct dmar_rmrr_unit *rmrr; struct pci_dev *pdev; struct intel_iommu *iommu; - int i, ret, unit = 0; + int i, ret; /* * for each drhd @@ -1921,7 +1921,6 @@ static int __init init_dmars(void) if (drhd->ignored) continue; iommu = drhd->iommu; - sprintf (iommu->name, "dmar%d", unit++); iommu_flush_write_buffer(iommu); diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 5ffa65fffb6a..c38e3f437a81 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -308,7 +308,7 @@ int modify_irte(int irq, struct irte *irte_modified) index = irq_iommu->irte_index + irq_iommu->sub_handle; irte = &iommu->ir_table->base[index]; - set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1)); + set_64bit((unsigned long *)irte, irte_modified->low); __iommu_flush_cache(iommu, irte, sizeof(*irte)); rc = qi_flush_iec(iommu, index, 0); diff --git a/include/linux/dmar.h b/include/linux/dmar.h index f28440784cf0..c7768330c11d 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -49,6 +49,7 @@ extern int dmar_dev_scope_init(void); /* Intel IOMMU detection */ extern void detect_intel_iommu(void); +extern int enable_drhd_fault_handling(void); extern int parse_ioapics_under_ir(void); @@ -116,9 +117,6 @@ extern struct intel_iommu *map_ioapic_to_ir(int apic); #define intr_remapping_enabled (0) #endif 
-#ifdef CONFIG_DMAR -extern const char *dmar_get_fault_reason(u8 fault_reason); - /* Can't use the common MSI interrupt functions * since DMAR is not a pci device */ @@ -129,6 +127,7 @@ extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); extern int arch_setup_dmar_msi(unsigned int irq); +#ifdef CONFIG_DMAR extern int iommu_detected, no_iommu; extern struct list_head dmar_rmrr_units; struct dmar_rmrr_unit { diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d2e3cbfba14f..a9563840644b 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -292,6 +292,8 @@ struct intel_iommu { spinlock_t register_lock; /* protect register handling */ int seq_id; /* sequence id of the iommu */ int agaw; /* agaw of this iommu */ + unsigned int irq; + unsigned char name[13]; /* Device Name */ #ifdef CONFIG_DMAR unsigned long *domain_ids; /* bitmap of domains */ @@ -299,8 +301,6 @@ struct intel_iommu { spinlock_t lock; /* protect context, domain ids */ struct root_entry *root_entry; /* virtual address */ - unsigned int irq; - unsigned char name[7]; /* Device Name */ struct iommu_flush flush; #endif struct q_inval *qi; /* Queued invalidation info */ -- cgit v1.2.3-71-gd317 From eba67e5da6e971993b2899d2cdf459ce77d3dbc5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:04:56 -0700 Subject: x86, dmar: routines for disabling queued invalidation and intr remapping Impact: new interfaces (not yet used) Routines for disabling queued invalidation and interrupt remapping. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- drivers/pci/dmar.c | 36 ++++++++++++++++++++++++++++++++++++ drivers/pci/intr_remapping.c | 27 +++++++++++++++++++++++++++ include/linux/intel-iommu.h | 1 + 3 files changed, 64 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index bb4ed985f9c7..932e5e3930fc 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -753,6 +753,42 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, return qi_submit_sync(&desc, iommu); } +/* + * Disable Queued Invalidation interface. + */ +void dmar_disable_qi(struct intel_iommu *iommu) +{ + unsigned long flags; + u32 sts; + cycles_t start_time = get_cycles(); + + if (!ecap_qis(iommu->ecap)) + return; + + spin_lock_irqsave(&iommu->register_lock, flags); + + sts = dmar_readq(iommu->reg + DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_QIES)) + goto end; + + /* + * Give a chance to HW to complete the pending invalidation requests. + */ + while ((readl(iommu->reg + DMAR_IQT_REG) != + readl(iommu->reg + DMAR_IQH_REG)) && + (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time))) + cpu_relax(); + + iommu->gcmd &= ~DMA_GCMD_QIE; + + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, + !(sts & DMA_GSTS_QIES), sts); +end: + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + /* * Enable Queued Invalidation interface. This is a must to support * interrupt-remapping. Also used by DMA-remapping, which replaces diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index c38e3f437a81..0d202d73a1ac 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -467,6 +467,33 @@ static int setup_intr_remapping(struct intel_iommu *iommu, int mode) return 0; } +/* + * Disable Interrupt Remapping. 
+ */ +static void disable_intr_remapping(struct intel_iommu *iommu) +{ + unsigned long flags; + u32 sts; + + if (!ecap_ir_support(iommu->ecap)) + return; + + spin_lock_irqsave(&iommu->register_lock, flags); + + sts = dmar_readq(iommu->reg + DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_IRES)) + goto end; + + iommu->gcmd &= ~DMA_GCMD_IRE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, !(sts & DMA_GSTS_IRES), sts); + +end: + spin_unlock_irqrestore(&iommu->register_lock, flags); +} + int __init enable_intr_remapping(int eim) { struct dmar_drhd_unit *drhd; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a9563840644b..78c1262e8704 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -321,6 +321,7 @@ extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); extern int alloc_iommu(struct dmar_drhd_unit *drhd); extern void free_iommu(struct intel_iommu *iommu); extern int dmar_enable_qi(struct intel_iommu *iommu); +extern void dmar_disable_qi(struct intel_iommu *iommu); extern void qi_global_iec(struct intel_iommu *iommu); extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, -- cgit v1.2.3-71-gd317 From 1531a6a6b81a4e6f9eec9a5608758a6ea14b96e0 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:04:57 -0700 Subject: x86, dmar: start with sane state while enabling dma and interrupt-remapping Impact: cleanup/sanitization Start from a sane state while enabling dma and interrupt-remapping, by clearing the previous recorded faults and disabling previously enabled queued invalidation and interrupt-remapping. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- drivers/pci/dmar.c | 5 +---- drivers/pci/intel-iommu.c | 29 +++++++++++++++++++++++++++++ drivers/pci/intr_remapping.c | 17 +++++++++++++++++ include/linux/dmar.h | 2 ++ 4 files changed, 49 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 932e5e3930fc..f1805002e436 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -982,7 +982,7 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, } #define PRIMARY_FAULT_REG_LEN (16) -static irqreturn_t dmar_fault(int irq, void *dev_id) +irqreturn_t dmar_fault(int irq, void *dev_id) { struct intel_iommu *iommu = dev_id; int reg, fault_index; @@ -1074,9 +1074,6 @@ int dmar_set_interrupt(struct intel_iommu *iommu) return 0; } - /* Force fault register is cleared */ - dmar_fault(irq, iommu); - ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu); if (ret) printk(KERN_ERR "IOMMU: can't request irq\n"); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 25fc1df486bb..ef167b8b047d 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1855,11 +1855,40 @@ static int __init init_dmars(void) } } + /* + * Start from the sane iommu hardware state. + */ for_each_drhd_unit(drhd) { if (drhd->ignored) continue; iommu = drhd->iommu; + + /* + * If the queued invalidation is already initialized by us + * (for example, while enabling interrupt-remapping) then + * we got the things already rolling from a sane state. + */ + if (iommu->qi) + continue; + + /* + * Clear any previous faults. + */ + dmar_fault(-1, iommu); + /* + * Disable queued invalidation if supported and already enabled + * before OS handover. 
+ */ + dmar_disable_qi(iommu); + } + + for_each_drhd_unit(drhd) { + if (drhd->ignored) + continue; + + iommu = drhd->iommu; + if (dmar_enable_qi(iommu)) { /* * Queued Invalidate not enabled, use Register Based diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 0d202d73a1ac..a84686b2478b 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -499,6 +499,23 @@ int __init enable_intr_remapping(int eim) struct dmar_drhd_unit *drhd; int setup = 0; + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; + + /* + * Clear previous faults. + */ + dmar_fault(-1, iommu); + + /* + * Disable intr remapping and queued invalidation, if already + * enabled prior to OS handover. + */ + disable_intr_remapping(iommu); + + dmar_disable_qi(iommu); + } + /* * check for the Interrupt-remapping support */ diff --git a/include/linux/dmar.h b/include/linux/dmar.h index c7768330c11d..8a035aec14a9 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -24,6 +24,7 @@ #include #include #include +#include #if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP) struct intel_iommu; @@ -125,6 +126,7 @@ extern void dmar_msi_mask(unsigned int irq); extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); +extern irqreturn_t dmar_fault(int irq, void *dev_id); extern int arch_setup_dmar_msi(unsigned int irq); #ifdef CONFIG_DMAR -- cgit v1.2.3-71-gd317 From 29b61be65a33c95564fa82e7e8d60d97adb68ea8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:02 -0700 Subject: x86, x2apic: cleanup ifdef CONFIG_INTR_REMAP in io_apic code Impact: cleanup Clean up #ifdefs and replace them with helper functions. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 44 +++++++++------------------------------- arch/x86/kernel/apic/probe_64.c | 2 -- include/linux/dmar.h | 45 ++++++++++++++++++++++++++++++++++------- 3 files changed, 48 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e074eac5bd35..cf27795c641c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -554,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq apic = entry->apic; pin = entry->pin; -#ifdef CONFIG_INTR_REMAP /* * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. 
*/ if (!irq_remapped(irq)) io_apic_write(apic, 0x11 + pin*2, dest); -#else - io_apic_write(apic, 0x11 + pin*2, dest); -#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; @@ -1419,9 +1415,8 @@ void __setup_vector_irq(int cpu) } static struct irq_chip ioapic_chip; -#ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip; -#endif +static struct irq_chip msi_ir_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -1460,7 +1455,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t else desc->status &= ~IRQ_LEVEL; -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { desc->status |= IRQ_MOVE_PCNTXT; if (trigger) @@ -1472,7 +1466,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t handle_edge_irq, "edge"); return; } -#endif + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, @@ -1493,7 +1487,6 @@ int setup_ioapic_entry(int apic_id, int irq, */ memset(entry,0,sizeof(*entry)); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) { struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); struct irte irte; @@ -1535,9 +1528,7 @@ int setup_ioapic_entry(int apic_id, int irq, * irq handler will do the explicit EOI to the io-apic. */ ir_entry->vector = pin; - } else -#endif - { + } else { entry->delivery_mode = apic->irq_delivery_mode; entry->dest_mode = apic->irq_dest_mode; entry->dest = destination; @@ -1662,10 +1653,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, { struct IO_APIC_route_entry entry; -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) return; -#endif memset(&entry, 0, sizeof(entry)); @@ -2395,6 +2384,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq, set_ir_ioapic_affinity_irq_desc(desc, mask); } +#else +static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) +{ +} #endif asmlinkage void smp_irq_move_cleanup_interrupt(void) @@ -2883,10 +2877,8 @@ static inline void __init check_timer(void) * 8259A. 
*/ if (pin1 == -1) { -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); -#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2922,10 +2914,8 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); -#endif local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) @@ -3219,9 +3209,7 @@ void destroy_irq(unsigned int irq) if (desc) desc->chip_data = cfg; -#ifdef CONFIG_INTR_REMAP free_irte(irq); -#endif spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); @@ -3247,7 +3235,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { struct irte irte; int ir_index; @@ -3273,9 +3260,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms MSI_ADDR_IR_SHV | MSI_ADDR_IR_INDEX1(ir_index) | MSI_ADDR_IR_INDEX2(ir_index); - } else -#endif - { + } else { if (x2apic_enabled()) msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); @@ -3392,6 +3377,7 @@ static struct irq_chip msi_ir_chip = { #endif .retrigger = ioapic_retrigger_irq, }; +#endif /* * Map the PCI dev to the corresponding remapping hardware unit @@ -3419,7 +3405,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) } return index; } -#endif static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { @@ -3433,7 +3418,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { struct irq_desc *desc = irq_to_desc(irq); /* @@ -3442,7 +3426,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) desc->status |= IRQ_MOVE_PCNTXT; set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); } else -#endif set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); @@ -3456,11 +3439,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret, sub_handle; struct msi_desc *msidesc; unsigned int irq_want; - -#ifdef CONFIG_INTR_REMAP struct intel_iommu *iommu = 0; int index = 0; -#endif irq_want = nr_irqs_gsi; sub_handle = 0; @@ -3469,7 +3449,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (irq == 0) return -1; irq_want = irq + 1; -#ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) goto no_ir; @@ -3497,7 +3476,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) set_irte_irq(irq, iommu, index, sub_handle); } no_ir: -#endif ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; @@ -4032,11 +4010,9 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) set_ir_ioapic_affinity_irq_desc(desc, mask); else -#endif set_ioapic_affinity_irq_desc(desc, mask); } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 8297c2b8ed20..1783652bb0e5 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -69,14 +69,12 @@ void __init default_setup_apic_routing(void) printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } -#ifdef 
CONFIG_X86_X2APIC /* * Now that apic routing model is selected, configure the * fault handling for intr remapping. */ if (intr_remapping_enabled) enable_drhd_fault_handling(); -#endif } /* Same for both flat and physical. */ diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 8a035aec14a9..2f3427468956 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -26,9 +26,8 @@ #include #include -#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP) struct intel_iommu; - +#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP) struct dmar_drhd_unit { struct list_head list; /* list of drhd units */ struct acpi_dmar_header *hdr; /* ACPI header */ @@ -52,7 +51,6 @@ extern int dmar_dev_scope_init(void); extern void detect_intel_iommu(void); extern int enable_drhd_fault_handling(void); - extern int parse_ioapics_under_ir(void); extern int alloc_iommu(struct dmar_drhd_unit *); #else @@ -65,12 +63,12 @@ static inline int dmar_table_init(void) { return -ENODEV; } +static inline int enable_drhd_fault_handling(void) +{ + return -1; +} #endif /* !CONFIG_DMAR && !CONFIG_INTR_REMAP */ -#ifdef CONFIG_INTR_REMAP -extern int intr_remapping_enabled; -extern int enable_intr_remapping(int); - struct irte { union { struct { @@ -99,6 +97,10 @@ struct irte { __u64 high; }; }; +#ifdef CONFIG_INTR_REMAP +extern int intr_remapping_enabled; +extern int enable_intr_remapping(int); + extern int get_irte(int irq, struct irte *entry); extern int modify_irte(int irq, struct irte *irte_modified); extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count); @@ -113,6 +115,35 @@ extern int irq_remapped(int irq); extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev); extern struct intel_iommu *map_ioapic_to_ir(int apic); #else +static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) +{ + return -1; +} +static inline int modify_irte(int irq, struct irte *irte_modified) +{ + return -1; +} +static inline int free_irte(int irq) +{ + return -1; +} +static inline int map_irq_to_irte_handle(int irq, u16 *sub_handle) +{ + return -1; +} +static inline int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, + u16 sub_handle) +{ + return -1; +} +static inline struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) +{ + return NULL; +} +static inline struct intel_iommu *map_ioapic_to_ir(int apic) +{ + return NULL; +} #define irq_remapped(irq) (0) #define enable_intr_remapping(mode) (-1) #define intr_remapping_enabled (0) -- cgit v1.2.3-71-gd317 From 37886f6a9f62d22530ffee8d3f9215c8345b6969 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 17:22:06 -0400 Subject: ring-buffer: add api to allow a tracer to change clock source This patch adds a new function called ring_buffer_set_clock that allows a tracer to assign its own clock source to the buffer. 
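A minimal, hypothetical sketch of how a tracer might use the new hook; my_trace_clock, my_buffer and my_tracer_setup are illustrative names, sched_clock() merely stands in for whatever time source a tracer prefers, and the buffer's default clock remains trace_clock_local as set in ring_buffer_alloc():

#include <linux/ring_buffer.h>
#include <linux/sched.h>	/* sched_clock() */
#include <linux/errno.h>

/* Illustrative clock callback: any function returning a u64 timestamp. */
static u64 my_trace_clock(void)
{
	return sched_clock();
}

static struct ring_buffer *my_buffer;

static int my_tracer_setup(void)
{
	my_buffer = ring_buffer_alloc(65536, 0);
	if (!my_buffer)
		return -ENOMEM;

	/* Override the default trace_clock_local for this buffer only. */
	ring_buffer_set_clock(my_buffer, my_trace_clock);
	return 0;
}
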
Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 7 +++-- kernel/trace/ring_buffer.c | 65 ++++++++++++++++++++++++++------------------- kernel/trace/trace.c | 21 ++++++++++----- 3 files changed, 57 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b1a0068a5557..9e6052bd1a1c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -118,8 +118,11 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); -u64 ring_buffer_time_stamp(int cpu); -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)); size_t ring_buffer_page_len(void *page); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 58128ad2fde0..bbf51922a8ca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -180,29 +180,6 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #include "trace.h" -/* Up this if you want to test the TIME_EXTENTS and normalization */ -#define DEBUG_SHIFT 0 - -u64 ring_buffer_time_stamp(int cpu) -{ - u64 time; - - preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = trace_clock_local() << DEBUG_SHIFT; - preempt_enable_no_resched_notrace(); - - return time; -} -EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); - -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) -{ - /* Just stupid testing the normalize function and deltas */ - *ts >>= DEBUG_SHIFT; -} -EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 28 @@ -374,6 +351,7 @@ struct ring_buffer { #ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif + u64 (*clock)(void); }; struct ring_buffer_iter { @@ -394,6 +372,30 @@ struct ring_buffer_iter { _____ret; \ }) +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) +{ + u64 time; + + preempt_disable_notrace(); + /* shift to debug/test normalization and TIME_EXTENTS */ + time = buffer->clock() << DEBUG_SHIFT; + preempt_enable_no_resched_notrace(); + + return time; +} +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -569,6 +571,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; + buffer->clock = trace_clock_local; /* need at least two pages */ if (buffer->pages == 1) @@ -645,6 +648,12 @@ ring_buffer_free(struct ring_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_free); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)) +{ + buffer->clock = clock; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void @@ -1191,7 +1200,7 
@@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = next_page; /* reread the time stamp */ - *ts = ring_buffer_time_stamp(cpu_buffer->cpu); + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); cpu_buffer->tail_page->page->time_stamp = *ts; } @@ -1334,7 +1343,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->cpu); + ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. @@ -2051,7 +2060,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; @@ -2112,7 +2122,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = iter->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f89690230e6..3be2f788e10d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -155,13 +155,6 @@ ns2usecs(cycle_t nsec) return nsec; } -cycle_t ftrace_now(int cpu) -{ - u64 ts = ring_buffer_time_stamp(cpu); - ring_buffer_normalize_time_stamp(cpu, &ts); - return ts; -} - /* * The global_trace is the descriptor that holds the tracing * buffers for the live tracing. For each CPU, it contains @@ -178,6 +171,20 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +cycle_t ftrace_now(int cpu) +{ + u64 ts; + + /* Early boot up does not have a buffer yet */ + if (!global_trace.buffer) + return trace_clock_local(); + + ts = ring_buffer_time_stamp(global_trace.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + + return ts; +} + /* * The max_tr is used to snapshot the global_trace when a maximum * latency is reached. Some tracers will use this to store a maximum -- cgit v1.2.3-71-gd317 From 97e7e4f391cac2b00417b581b432533d245d4fd0 Mon Sep 17 00:00:00 2001 From: Witold Baryluk Date: Tue, 17 Mar 2009 21:15:44 +0100 Subject: tracing: optimization of branch tracer Impact: better performance for if branch tracer Use an array to count the hit and misses of a conditional instead of using another conditional. This cuts down on saturation of branch predictions and increases performance of modern pipelined architectures. 
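The transformation, reduced to a stand-alone illustration (plain user-space C, not the kernel macro itself): the truth value is normalized with !! and used to index a two-element counter array, so updating the statistics no longer needs its own conditional branch.

#include <stdio.h>

struct branch_stats {
	unsigned long miss_hit[2];	/* [0] = miss, [1] = hit */
};

static struct branch_stats stats;

static int traced_cond(int cond)
{
	int r = !!cond;			/* normalize to 0 or 1 */

	stats.miss_hit[r]++;		/* branchless counter update */
	return r;
}

int main(void)
{
	int i;

	for (i = 0; i < 100; i++)
		traced_cond(i % 3 == 0);
	printf("miss=%lu hit=%lu\n", stats.miss_hit[0], stats.miss_hit[1]);
	return 0;
}
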
Signed-off-by: Witold Baryluk Signed-off-by: Steven Rostedt --- include/linux/compiler.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d95da1020f1c..6faa7e549de4 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -68,6 +68,7 @@ struct ftrace_branch_data { unsigned long miss; unsigned long hit; }; + unsigned long miss_hit[2]; }; }; @@ -125,10 +126,7 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); .line = __LINE__, \ }; \ ______r = !!(cond); \ - if (______r) \ - ______f.hit++; \ - else \ - ______f.miss++; \ + ______f.miss_hit[______r]++; \ ______r; \ })) #endif /* CONFIG_PROFILE_ALL_BRANCHES */ -- cgit v1.2.3-71-gd317 From 84be58d4601c86306cd939ebf58a9b90989883a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Mar 2009 11:50:29 +0100 Subject: dma-debug: fix dma_debug_add_bus() definition for !CONFIG_DMA_API_DEBUG Impact: build fix Fix: arch/x86/kvm/x86.o: In function `dma_debug_add_bus': (.text+0x0): multiple definition of `dma_debug_add_bus' dma_debug_add_bus() should be a static inline function. Cc: Joerg Roedel LKML-Reference: <20090317120112.GP6159@amd.com> Signed-off-by: Ingo Molnar --- include/linux/dma-debug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index e851d23e91eb..28d53cb7b5a2 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -83,7 +83,7 @@ extern void debug_dma_dump_mappings(struct device *dev); #else /* CONFIG_DMA_API_DEBUG */ -void dma_debug_add_bus(struct bus_type *bus) +static inline void dma_debug_add_bus(struct bus_type *bus) { } -- cgit v1.2.3-71-gd317 From 27bf91d6a0d5a9c7224e8687754249bba67dd4cf Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Wed, 18 Mar 2009 19:45:11 -0700 Subject: mlx4_core: Add link type autosensing When a port's link is down (except to driver restart) and the port is configured for auto sensing, we try to sense port link type (Ethernet or InfiniBand) in order to determine how to initialize the port. If the port type needs to be changed, all mlx4 for the device interfaces are unregistered and then registered again with the new port types. Sensing is done with intervals of 3 seconds. 
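The 3-second poll is built on a self-rearming delayed work item; below is a generic sketch of that pattern with illustrative names only (the real code, added in sense.c further down, queues onto the driver's own mlx4_wq workqueue rather than the shared one):

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/timer.h>	/* round_jiffies_relative() */

static struct delayed_work sense_poll_example;

static void sense_poll_fn(struct work_struct *work)
{
	/* ... sense the link type and reconfigure the port if it changed ... */

	/* Re-arm ourselves so the check repeats roughly every 3 seconds. */
	schedule_delayed_work(&sense_poll_example,
			      round_jiffies_relative(3 * HZ));
}

static void start_sense_example(void)
{
	INIT_DELAYED_WORK(&sense_poll_example, sense_poll_fn);
	schedule_delayed_work(&sense_poll_example,
			      round_jiffies_relative(3 * HZ));
}
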
Signed-off-by: Yevgeny Petrilin Signed-off-by: Roland Dreier --- drivers/net/mlx4/Makefile | 2 +- drivers/net/mlx4/catas.c | 16 +---- drivers/net/mlx4/eq.c | 16 +++-- drivers/net/mlx4/main.c | 104 +++++++++++++++++++++-------- drivers/net/mlx4/mlx4.h | 27 +++++++- drivers/net/mlx4/sense.c | 156 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mlx4/cmd.h | 1 + include/linux/mlx4/device.h | 6 +- 8 files changed, 277 insertions(+), 51 deletions(-) create mode 100644 drivers/net/mlx4/sense.c (limited to 'include/linux') diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile index a7a97bf998f8..21040a0d81fe 100644 --- a/drivers/net/mlx4/Makefile +++ b/drivers/net/mlx4/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_MLX4_CORE) += mlx4_core.o mlx4_core-y := alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \ - mr.o pd.o port.o profile.o qp.o reset.o srq.o + mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o obj-$(CONFIG_MLX4_EN) += mlx4_en.o diff --git a/drivers/net/mlx4/catas.c b/drivers/net/mlx4/catas.c index f094ee00c416..aa9674b7f19c 100644 --- a/drivers/net/mlx4/catas.c +++ b/drivers/net/mlx4/catas.c @@ -42,7 +42,6 @@ enum { static DEFINE_SPINLOCK(catas_lock); static LIST_HEAD(catas_list); -static struct workqueue_struct *catas_wq; static struct work_struct catas_work; static int internal_err_reset = 1; @@ -77,7 +76,7 @@ static void poll_catas(unsigned long dev_ptr) list_add(&priv->catas_err.list, &catas_list); spin_unlock(&catas_lock); - queue_work(catas_wq, &catas_work); + queue_work(mlx4_wq, &catas_work); } } else mod_timer(&priv->catas_err.timer, @@ -146,18 +145,7 @@ void mlx4_stop_catas_poll(struct mlx4_dev *dev) spin_unlock_irq(&catas_lock); } -int __init mlx4_catas_init(void) +void __init mlx4_catas_init(void) { INIT_WORK(&catas_work, catas_reset); - - catas_wq = create_singlethread_workqueue("mlx4_err"); - if (!catas_wq) - return -ENOMEM; - - return 0; -} - -void mlx4_catas_cleanup(void) -{ - destroy_workqueue(catas_wq); } diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index 2c19bff7cbab..8830dcb92ec8 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -163,6 +163,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) int cqn; int eqes_found = 0; int set_ci = 0; + int port; while ((eqe = next_eqe_sw(eq))) { /* @@ -203,11 +204,16 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) break; case MLX4_EVENT_TYPE_PORT_CHANGE: - mlx4_dispatch_event(dev, - eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_ACTIVE ? 
- MLX4_DEV_EVENT_PORT_UP : - MLX4_DEV_EVENT_PORT_DOWN, - be32_to_cpu(eqe->event.port_change.port) >> 28); + port = be32_to_cpu(eqe->event.port_change.port) >> 28; + if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) { + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN, + port); + mlx4_priv(dev)->sense.do_sense_port[port] = 1; + } else { + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, + port); + mlx4_priv(dev)->sense.do_sense_port[port] = 0; + } break; case MLX4_EVENT_TYPE_CQ_ERROR: diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 8480f0346844..a66f5b2fd288 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -51,6 +51,8 @@ MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); +struct workqueue_struct *mlx4_wq; + #ifdef CONFIG_MLX4_DEBUG int mlx4_debug_level = 0; @@ -98,24 +100,23 @@ module_param_named(use_prio, use_prio, bool, 0444); MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports " "(0/1, default 0)"); -static int mlx4_check_port_params(struct mlx4_dev *dev, - enum mlx4_port_type *port_type) +int mlx4_check_port_params(struct mlx4_dev *dev, + enum mlx4_port_type *port_type) { int i; for (i = 0; i < dev->caps.num_ports - 1; i++) { - if (port_type[i] != port_type[i+1] && - !(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { - mlx4_err(dev, "Only same port types supported " - "on this HCA, aborting.\n"); - return -EINVAL; + if (port_type[i] != port_type[i + 1]) { + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { + mlx4_err(dev, "Only same port types supported " + "on this HCA, aborting.\n"); + return -EINVAL; + } + if (port_type[i] == MLX4_PORT_TYPE_ETH && + port_type[i + 1] == MLX4_PORT_TYPE_IB) + return -EINVAL; } } - if ((port_type[0] == MLX4_PORT_TYPE_ETH) && - (port_type[1] == MLX4_PORT_TYPE_IB)) { - mlx4_err(dev, "eth-ib configuration is not supported.\n"); - return -EINVAL; - } for (i = 0; i < dev->caps.num_ports; i++) { if (!(port_type[i] & dev->caps.supported_type[i+1])) { @@ -225,6 +226,9 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; else dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; + dev->caps.possible_type[i] = dev->caps.port_type[i]; + mlx4_priv(dev)->sense.sense_allowed[i] = + dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO; if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) { dev->caps.log_num_macs = dev_cap->log_max_macs[i]; @@ -263,14 +267,16 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) * Change the port configuration of the device. * Every user of this function must hold the port mutex. */ -static int mlx4_change_port_types(struct mlx4_dev *dev, - enum mlx4_port_type *port_types) +int mlx4_change_port_types(struct mlx4_dev *dev, + enum mlx4_port_type *port_types) { int err = 0; int change = 0; int port; for (port = 0; port < dev->caps.num_ports; port++) { + /* Change the port type only if the new type is different + * from the current, and not set to Auto */ if (port_types[port] != dev->caps.port_type[port + 1]) { change = 1; dev->caps.port_type[port + 1] = port_types[port]; @@ -302,10 +308,17 @@ static ssize_t show_port_type(struct device *dev, struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, port_attr); struct mlx4_dev *mdev = info->dev; + char type[8]; + + sprintf(type, "%s", + (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB) ? 
+ "ib" : "eth"); + if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO) + sprintf(buf, "auto (%s)\n", type); + else + sprintf(buf, "%s\n", type); - return sprintf(buf, "%s\n", - mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB ? - "ib" : "eth"); + return strlen(buf); } static ssize_t set_port_type(struct device *dev, @@ -317,6 +330,7 @@ static ssize_t set_port_type(struct device *dev, struct mlx4_dev *mdev = info->dev; struct mlx4_priv *priv = mlx4_priv(mdev); enum mlx4_port_type types[MLX4_MAX_PORTS]; + enum mlx4_port_type new_types[MLX4_MAX_PORTS]; int i; int err = 0; @@ -324,26 +338,56 @@ static ssize_t set_port_type(struct device *dev, info->tmp_type = MLX4_PORT_TYPE_IB; else if (!strcmp(buf, "eth\n")) info->tmp_type = MLX4_PORT_TYPE_ETH; + else if (!strcmp(buf, "auto\n")) + info->tmp_type = MLX4_PORT_TYPE_AUTO; else { mlx4_err(mdev, "%s is not supported port type\n", buf); return -EINVAL; } + mlx4_stop_sense(mdev); mutex_lock(&priv->port_mutex); - for (i = 0; i < mdev->caps.num_ports; i++) + /* Possible type is always the one that was delivered */ + mdev->caps.possible_type[info->port] = info->tmp_type; + + for (i = 0; i < mdev->caps.num_ports; i++) { types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type : - mdev->caps.port_type[i+1]; + mdev->caps.possible_type[i+1]; + if (types[i] == MLX4_PORT_TYPE_AUTO) + types[i] = mdev->caps.port_type[i+1]; + } - err = mlx4_check_port_params(mdev, types); + if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { + for (i = 1; i <= mdev->caps.num_ports; i++) { + if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { + mdev->caps.possible_type[i] = mdev->caps.port_type[i]; + err = -EINVAL; + } + } + } + if (err) { + mlx4_err(mdev, "Auto sensing is not supported on this HCA. " + "Set only 'eth' or 'ib' for both ports " + "(should be the same)\n"); + goto out; + } + + mlx4_do_sense_ports(mdev, new_types, types); + + err = mlx4_check_port_params(mdev, new_types); if (err) goto out; - for (i = 1; i <= mdev->caps.num_ports; i++) - priv->port[i].tmp_type = 0; + /* We are about to apply the changes after the configuration + * was verified, no need to remember the temporary types + * any more */ + for (i = 0; i < mdev->caps.num_ports; i++) + priv->port[i + 1].tmp_type = 0; - err = mlx4_change_port_types(mdev, types); + err = mlx4_change_port_types(mdev, new_types); out: + mlx4_start_sense(mdev); mutex_unlock(&priv->port_mutex); return err ? err : count; } @@ -1117,6 +1161,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) goto err_port; + mlx4_sense_init(dev); + mlx4_start_sense(dev); + pci_set_drvdata(pdev, dev); return 0; @@ -1182,6 +1229,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) int p; if (dev) { + mlx4_stop_sense(dev); mlx4_unregister_device(dev); for (p = 1; p <= dev->caps.num_ports; p++) { @@ -1266,9 +1314,11 @@ static int __init mlx4_init(void) if (mlx4_verify_params()) return -EINVAL; - ret = mlx4_catas_init(); - if (ret) - return ret; + mlx4_catas_init(); + + mlx4_wq = create_singlethread_workqueue("mlx4"); + if (!mlx4_wq) + return -ENOMEM; ret = pci_register_driver(&mlx4_driver); return ret < 0 ? 
ret : 0; @@ -1277,7 +1327,7 @@ static int __init mlx4_init(void) static void __exit mlx4_cleanup(void) { pci_unregister_driver(&mlx4_driver); - mlx4_catas_cleanup(); + destroy_workqueue(mlx4_wq); } module_init(mlx4_init); diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index e0213bad61c7..5bd79c2b184f 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -276,6 +277,13 @@ struct mlx4_port_info { struct mlx4_vlan_table vlan_table; }; +struct mlx4_sense { + struct mlx4_dev *dev; + u8 do_sense_port[MLX4_MAX_PORTS + 1]; + u8 sense_allowed[MLX4_MAX_PORTS + 1]; + struct delayed_work sense_poll; +}; + struct mlx4_priv { struct mlx4_dev dev; @@ -305,6 +313,7 @@ struct mlx4_priv { struct mlx4_uar driver_uar; void __iomem *kar; struct mlx4_port_info port[MLX4_MAX_PORTS + 1]; + struct mlx4_sense sense; struct mutex port_mutex; }; @@ -313,6 +322,10 @@ static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) return container_of(dev, struct mlx4_priv, dev); } +#define MLX4_SENSE_RANGE (HZ * 3) + +extern struct workqueue_struct *mlx4_wq; + u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap); void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj); u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align); @@ -346,8 +359,7 @@ void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); void mlx4_start_catas_poll(struct mlx4_dev *dev); void mlx4_stop_catas_poll(struct mlx4_dev *dev); -int mlx4_catas_init(void); -void mlx4_catas_cleanup(void); +void mlx4_catas_init(void); int mlx4_restart_one(struct pci_dev *pdev); int mlx4_register_device(struct mlx4_dev *dev); void mlx4_unregister_device(struct mlx4_dev *dev); @@ -379,6 +391,17 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type); void mlx4_handle_catas_err(struct mlx4_dev *dev); +void mlx4_do_sense_ports(struct mlx4_dev *dev, + enum mlx4_port_type *stype, + enum mlx4_port_type *defaults); +void mlx4_start_sense(struct mlx4_dev *dev); +void mlx4_stop_sense(struct mlx4_dev *dev); +void mlx4_sense_init(struct mlx4_dev *dev); +int mlx4_check_port_params(struct mlx4_dev *dev, + enum mlx4_port_type *port_type); +int mlx4_change_port_types(struct mlx4_dev *dev, + enum mlx4_port_type *port_types); + void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table); void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table); diff --git a/drivers/net/mlx4/sense.c b/drivers/net/mlx4/sense.c new file mode 100644 index 000000000000..6d5089ecb5af --- /dev/null +++ b/drivers/net/mlx4/sense.c @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include + +#include "mlx4.h" + +static int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, + enum mlx4_port_type *type) +{ + u64 out_param; + int err = 0; + + err = mlx4_cmd_imm(dev, 0, &out_param, port, 0, + MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B); + if (err) { + mlx4_err(dev, "Sense command failed for port: %d\n", port); + return err; + } + + if (out_param > 2) { + mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param); + return EINVAL; + } + + *type = out_param; + return 0; +} + +void mlx4_do_sense_ports(struct mlx4_dev *dev, + enum mlx4_port_type *stype, + enum mlx4_port_type *defaults) +{ + struct mlx4_sense *sense = &mlx4_priv(dev)->sense; + int err; + int i; + + for (i = 1; i <= dev->caps.num_ports; i++) { + stype[i - 1] = 0; + if (sense->do_sense_port[i] && sense->sense_allowed[i] && + dev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { + err = mlx4_SENSE_PORT(dev, i, &stype[i - 1]); + if (err) + stype[i - 1] = defaults[i - 1]; + } else + stype[i - 1] = defaults[i - 1]; + } + + /* + * Adjust port configuration: + * If port 1 sensed nothing and port 2 is IB, set both as IB + * If port 2 sensed nothing and port 1 is Eth, set both as Eth + */ + if (stype[0] == MLX4_PORT_TYPE_ETH) { + for (i = 1; i < dev->caps.num_ports; i++) + stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_ETH; + } + if (stype[dev->caps.num_ports - 1] == MLX4_PORT_TYPE_IB) { + for (i = 0; i < dev->caps.num_ports - 1; i++) + stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_IB; + } + + /* + * If sensed nothing, remain in current configuration. + */ + for (i = 0; i < dev->caps.num_ports; i++) + stype[i] = stype[i] ? 
stype[i] : defaults[i]; + +} + +static void mlx4_sense_port(struct work_struct *work) +{ + struct delayed_work *delay = container_of(work, struct delayed_work, work); + struct mlx4_sense *sense = container_of(delay, struct mlx4_sense, + sense_poll); + struct mlx4_dev *dev = sense->dev; + struct mlx4_priv *priv = mlx4_priv(dev); + enum mlx4_port_type stype[MLX4_MAX_PORTS]; + + mutex_lock(&priv->port_mutex); + mlx4_do_sense_ports(dev, stype, &dev->caps.port_type[1]); + + if (mlx4_check_port_params(dev, stype)) + goto sense_again; + + if (mlx4_change_port_types(dev, stype)) + mlx4_err(dev, "Failed to change port_types\n"); + +sense_again: + mutex_unlock(&priv->port_mutex); + queue_delayed_work(mlx4_wq , &sense->sense_poll, + round_jiffies_relative(MLX4_SENSE_RANGE)); +} + +void mlx4_start_sense(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_sense *sense = &priv->sense; + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) + return; + + queue_delayed_work(mlx4_wq , &sense->sense_poll, + round_jiffies_relative(MLX4_SENSE_RANGE)); +} + +void mlx4_stop_sense(struct mlx4_dev *dev) +{ + cancel_delayed_work_sync(&mlx4_priv(dev)->sense.sense_poll); +} + +void mlx4_sense_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_sense *sense = &priv->sense; + int port; + + sense->dev = dev; + for (port = 1; port <= dev->caps.num_ports; port++) + sense->do_sense_port[port] = 1; + + INIT_DELAYED_WORK_DEFERRABLE(&sense->sense_poll, mlx4_sense_port); +} diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index cf9c679ab38b..0f82293a82ed 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -55,6 +55,7 @@ enum { MLX4_CMD_CLOSE_PORT = 0xa, MLX4_CMD_QUERY_HCA = 0xb, MLX4_CMD_QUERY_PORT = 0x43, + MLX4_CMD_SENSE_PORT = 0x4d, MLX4_CMD_SET_PORT = 0xc, MLX4_CMD_ACCESS_DDR = 0x2e, MLX4_CMD_MAP_ICM = 0xffa, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 8f659cc29960..3aff8a6a389e 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -155,8 +155,9 @@ enum mlx4_qp_region { }; enum mlx4_port_type { - MLX4_PORT_TYPE_IB = 1 << 0, - MLX4_PORT_TYPE_ETH = 1 << 1, + MLX4_PORT_TYPE_IB = 1, + MLX4_PORT_TYPE_ETH = 2, + MLX4_PORT_TYPE_AUTO = 3 }; enum mlx4_special_vlan_idx { @@ -237,6 +238,7 @@ struct mlx4_caps { enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; u8 supported_type[MLX4_MAX_PORTS + 1]; u32 port_mask; + enum mlx4_port_type possible_type[MLX4_MAX_PORTS + 1]; }; struct mlx4_buf_list { -- cgit v1.2.3-71-gd317 From 7d1e8255cf959fba7ee2317550dfde39f0b936ae Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:03 -0400 Subject: SUNRPC: Add the equivalent of the linger and linger2 timeouts to RPC sockets This fixes a regression against FreeBSD servers as reported by Tomas Kasparek. Apparently when using RPC over a TCP socket, the FreeBSD servers don't ever react to the client closing the socket, and so commit e06799f958bf7f9f8fae15f0c6f519953fb0257c (SUNRPC: Use shutdown() instead of close() when disconnecting a TCP socket) causes the setup to hang forever whenever the client attempts to close and then reconnect. We break the deadlock by adding a 'linger2' style timeout to the socket, after which, the client will abort the connection using a TCP 'RST'. The default timeout is set to 15 seconds. A subsequent patch will put it under user control by means of a systctl. 
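For context only: the user-space analogue of the abort performed here is an abortive close, which makes the kernel send a TCP RST instead of waiting out the FIN handshake. The sketch below shows that analogue and is not the transport code itself, which instead arms a 15-second (XS_TCP_LINGER_TO) timer and resets the connection from the connect worker:

#include <sys/socket.h>
#include <unistd.h>

/* Abortive close: with l_onoff set and l_linger zero, close() drops the
 * connection with a TCP RST rather than lingering through the normal
 * FIN/FIN-ACK shutdown sequence. */
static int abortive_close(int fd)
{
	struct linger lng = { .l_onoff = 1, .l_linger = 0 };

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng));
	return close(fd);
}
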
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprtsock.c | 98 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 82 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 2b0d960603b9..1758d9f5b5c3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -260,6 +260,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); #define XPRT_BOUND (4) #define XPRT_BINDING (5) #define XPRT_CLOSING (6) +#define XPRT_CONNECTION_ABORT (7) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2e070679ab4a..b51f58b95c39 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -49,6 +49,8 @@ unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; +#define XS_TCP_LINGER_TO (15U * HZ) + /* * We can register our own files under /proc/sys/sunrpc by * calling register_sysctl_table() again. The files in that @@ -806,6 +808,7 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); @@ -1133,6 +1136,47 @@ out: read_unlock(&sk->sk_callback_lock); } +/* + * Do the equivalent of linger/linger2 handling for dealing with + * broken servers that don't close the socket in a timely + * fashion + */ +static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt, + unsigned long timeout) +{ + struct sock_xprt *transport; + + if (xprt_test_and_set_connecting(xprt)) + return; + set_bit(XPRT_CONNECTION_ABORT, &xprt->state); + transport = container_of(xprt, struct sock_xprt, xprt); + queue_delayed_work(rpciod_workqueue, &transport->connect_worker, + timeout); +} + +static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport; + + transport = container_of(xprt, struct sock_xprt, xprt); + + if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) || + !cancel_delayed_work(&transport->connect_worker)) + return; + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + xprt_clear_connecting(xprt); +} + +static void xs_sock_mark_closed(struct rpc_xprt *xprt) +{ + smp_mb__before_clear_bit(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); + /* Mark transport as closed and wake up all pending tasks */ + xprt_disconnect_done(xprt); +} + /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1178,6 +1222,7 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_clear_bit(); + xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ @@ -1194,17 +1239,14 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); + xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); break; case TCP_CLOSE: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - clear_bit(XPRT_CLOSING, 
&xprt->state); - smp_mb__after_clear_bit(); - /* Mark transport as closed and wake up all pending tasks */ - xprt_disconnect_done(xprt); + xs_tcp_cancel_linger_timeout(xprt); + xs_sock_mark_closed(xprt); } out: read_unlock(&sk->sk_callback_lock); @@ -1562,8 +1604,8 @@ static void xs_udp_connect_worker4(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** @@ -1604,8 +1646,8 @@ static void xs_udp_connect_worker6(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /* @@ -1626,7 +1668,9 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo memset(&any, 0, sizeof(any)); any.sa_family = AF_UNSPEC; result = kernel_connect(transport->sock, &any, sizeof(any), 0); - if (result) + if (!result) + xs_sock_mark_closed(xprt); + else dprintk("RPC: AF_UNSPEC connect return code %d\n", result); } @@ -1702,6 +1746,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work) goto out; if (!sock) { + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* start from scratch */ if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { dprintk("RPC: can't create TCP transport socket (%d).\n", -err); @@ -1713,10 +1758,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work) sock_release(sock); goto out; } - } else + } else { + int abort_and_exit; + + abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, + &xprt->state); /* "close" the socket, preserving the local port */ xs_tcp_reuse_connection(xprt, transport); + if (abort_and_exit) + goto out_eagain; + } + dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1732,17 +1785,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: - goto out_clear; + xprt_clear_connecting(xprt); + return; } /* get rid of existing socket, and retry */ xs_tcp_shutdown(xprt); printk("%s: connect returned unhandled error %d\n", __func__, status); +out_eagain: status = -EAGAIN; out: - xprt_wake_pending_tasks(xprt, status); -out_clear: xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** @@ -1763,6 +1817,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work) goto out; if (!sock) { + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* start from scratch */ if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { dprintk("RPC: can't create TCP transport socket (%d).\n", -err); @@ -1774,10 +1829,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work) sock_release(sock); goto out; } - } else + } else { + int abort_and_exit; + + abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, + &xprt->state); /* "close" the socket, preserving the local port */ xs_tcp_reuse_connection(xprt, transport); + if (abort_and_exit) + goto out_eagain; + } + dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1792,17 +1855,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: - goto out_clear; + xprt_clear_connecting(xprt); + return; } /* get rid of existing socket, and retry */ xs_tcp_shutdown(xprt); printk("%s: connect returned unhandled error %d\n", __func__, status); 
+out_eagain: status = -EAGAIN; out: - xprt_wake_pending_tasks(xprt, status); -out_clear: xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** -- cgit v1.2.3-71-gd317 From 7fe5c398fc2186ed586db11106a6692d871d0d58 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Mar 2009 15:35:50 -0400 Subject: NFS: Optimise NFS close() Close-to-open cache consistency rules really only require us to flush out writes on calls to close(), and require us to revalidate attributes on the very last close of the file. Currently we appear to be doing a lot of extra attribute revalidation and cache flushes. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 11 ++--------- fs/nfs/inode.c | 41 +++++++++++++++++++++++++++++------------ fs/nfs/internal.h | 3 +++ fs/nfs/nfs3proc.c | 1 + fs/nfs/nfs4proc.c | 10 ++++++++++ fs/nfs/proc.c | 1 + include/linux/nfs_xdr.h | 1 + 7 files changed, 47 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1eab9c9ad242..d451073c4947 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -137,9 +137,6 @@ nfs_file_release(struct inode *inode, struct file *filp) dentry->d_parent->d_name.name, dentry->d_name.name); - /* Ensure that dirty pages are flushed out with the right creds */ - if (filp->f_mode & FMODE_WRITE) - nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); } @@ -231,7 +228,6 @@ nfs_file_flush(struct file *file, fl_owner_t id) struct nfs_open_context *ctx = nfs_file_open_context(file); struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; - int status; dprintk("NFS: flush(%s/%s)\n", dentry->d_parent->d_name.name, @@ -241,11 +237,8 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; nfs_inc_stats(inode, NFSIOS_VFSFLUSH); - /* Ensure that data+attribute caches are up to date after close() */ - status = nfs_do_fsync(ctx, inode); - if (!status) - nfs_revalidate_inode(NFS_SERVER(inode), inode); - return status; + /* Flush writes to the server and return any errors */ + return nfs_do_fsync(ctx, inode); } static ssize_t diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c40adc5dd609..a834d1d850b7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -541,6 +541,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) return err; } +/** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context + * @is_sync: is this a synchronous close + * + * always ensure that the attributes are up to date if we're mounted + * with close-to-open semantics + */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode; + struct nfs_server *server; + + if (!(ctx->mode & FMODE_WRITE)) + return; + if (!is_sync) + return; + inode = ctx->path.dentry->d_inode; + if (!list_empty(&NFS_I(inode)->open_files)) + return; + server = NFS_SERVER(inode); + if (server->flags & NFS_MOUNT_NOCTO) + return; + nfs_revalidate_inode(server, inode); +} + static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) { struct nfs_open_context *ctx; @@ -567,24 +593,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) return ctx; } -static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) +static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { - struct inode *inode; + struct inode *inode = ctx->path.dentry->d_inode; - if (ctx == 
NULL) - return; - - inode = ctx->path.dentry->d_inode; if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - if (ctx->state != NULL) { - if (wait) - nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); - else - nfs4_close_state(&ctx->path, ctx->state, ctx->mode); - } + NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) put_rpccred(ctx->cred); path_put(&ctx->path); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a55e69aa52e5..2041f68ff1cc 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -152,6 +152,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; #endif +/* proc.c */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + /* dir.c */ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c55be7a7679e..b82fe6847f14 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, + .close_context = nfs_close_context, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 95f171e7e05a..97bacccff579 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1572,6 +1572,15 @@ out_drop: return 0; } +void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) +{ + if (ctx->state == NULL) + return; + if (is_sync) + nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); + else + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); +} static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -3776,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .commit_done = nfs4_commit_done, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, + .close_context = nfs4_close_context, }; /* diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 193465210d7c..7be72d90d49d 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .commit_setup = nfs_proc_commit_setup, .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, + .close_context = nfs_close_context, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 0691b9c188d9..9708e78a4d49 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -868,6 +868,7 @@ struct nfs_rpc_ops { int (*lock)(struct file *, int, struct file_lock *); int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); + void (*close_context)(struct nfs_open_context *ctx, int); }; /* -- cgit v1.2.3-71-gd317 From 1bf83e558cb29d163f4bc6decbc3800ecf4db195 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:38:34 +0100 Subject: PCI: PCIe portdrv: Use driver data to simplify code PCI Express port driver extension, as defined by struct pcie_port_device_ext in portdrv.h, is allocated and initialized, but never used (it also is never freed). Extend it to hold the PCI Express port type as well as the port interrupt mode, change its name and use it to simplify the code in portdrv_core.c . Additionally, remove the redundant interrupt_mode member of struct pcie_device defined in include/linux/pcieport_if.h . Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv.h | 5 ++- drivers/pci/pcie/portdrv_core.c | 95 ++++++++++++++++------------------------- include/linux/pcieport_if.h | 1 - 3 files changed, 39 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 2529f3f2ea5a..b0dcbc73415e 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -28,8 +28,9 @@ #define get_descriptor_id(type, service) (((type - 4) << 4) | service) -struct pcie_port_device_ext { - int interrupt_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ +struct pcie_port_data { + int port_type; /* Type of the port */ + int port_irq_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ }; extern struct bus_type pcie_port_bus_type; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 8b3f8c18032f..273e97619bce 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -15,10 +15,9 @@ #include #include +#include "../pci.h" #include "portdrv.h" -extern int pcie_mch_quirk; /* MSI-quirk Indicator */ - /** * release_pcie_device - free PCI Express port service device structure * @dev: Port service device to release @@ -31,28 +30,6 @@ static void release_pcie_device(struct device *dev) kfree(to_pcie_device(dev)); } -static int is_msi_quirked(struct pci_dev *dev) -{ - int port_type, quirk = 0; - u16 reg16; - - pci_read_config_word(dev, - pci_find_capability(dev, PCI_CAP_ID_EXP) + - PCIE_CAPABILITIES_REG, ®16); - port_type = (reg16 >> 4) & PORT_TYPE_MASK; - switch(port_type) { - case PCIE_RC_PORT: - if (pcie_mch_quirk == 1) - quirk = 1; - break; - case PCIE_SW_UPSTREAM_PORT: - case PCIE_SW_DOWNSTREAM_PORT: - default: - break; - } - return quirk; -} - /** * assign_interrupt_mode - choose interrupt mode for PCI Express port services * (INTx, MSI-X, MSI) and set up vectors @@ -64,6 +41,7 @@ static int is_msi_quirked(struct pci_dev *dev) */ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) { + struct pcie_port_data *port_data = pci_get_drvdata(dev); int i, pos, nvec, status = -EINVAL; int interrupt_mode = PCIE_PORT_INTx_MODE; @@ -75,7 +53,7 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) } /* Check MSI quirk */ - if (is_msi_quirked(dev)) + if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk) return interrupt_mode; /* Select MSI-X over MSI if supported */ @@ -132,13 +110,11 @@ static int get_port_device_capability(struct pci_dev *dev) pos + PCIE_SLOT_CAPABILITIES_REG, ®32); if (reg32 & SLOT_HP_CAPABLE_MASK) services |= PCIE_PORT_SERVICE_HP; - } - /* PME Capable - root port capability */ - if (((reg16 >> 4) & PORT_TYPE_MASK) == PCIE_RC_PORT) - services |= PCIE_PORT_SERVICE_PME; - + } + /* AER capable */ if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR)) services |= PCIE_PORT_SERVICE_AER; + /* VC support */ if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_VC)) services |= PCIE_PORT_SERVICE_VC; @@ -152,15 +128,15 @@ static int get_port_device_capability(struct pci_dev *dev) * @port_type: Type of the port * @service_type: Type of service to associate with the service device * @irq: Interrupt vector to associate with the service device - * @irq_mode: Interrupt mode of the service (INTx, MSI-X, MSI) */ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, - int port_type, int service_type, int irq, int irq_mode) + int service_type, int irq) { + struct pcie_port_data *port_data = pci_get_drvdata(parent); struct device 
*device; + int port_type = port_data->port_type; dev->port = parent; - dev->interrupt_mode = irq_mode; dev->irq = irq; dev->id.vendor = parent->vendor; dev->id.device = parent->device; @@ -185,10 +161,9 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, * @port_type: Type of the port * @service_type: Type of service to associate with the service device * @irq: Interrupt vector to associate with the service device - * @irq_mode: Interrupt mode of the service (INTx, MSI-X, MSI) */ static struct pcie_device* alloc_pcie_device(struct pci_dev *parent, - int port_type, int service_type, int irq, int irq_mode) + int service_type, int irq) { struct pcie_device *device; @@ -196,7 +171,7 @@ static struct pcie_device* alloc_pcie_device(struct pci_dev *parent, if (!device) return NULL; - pcie_device_init(parent, device, port_type, service_type, irq,irq_mode); + pcie_device_init(parent, device, service_type, irq); return device; } @@ -230,39 +205,36 @@ int pcie_port_device_probe(struct pci_dev *dev) */ int pcie_port_device_register(struct pci_dev *dev) { - struct pcie_port_device_ext *p_ext; - int status, type, capabilities, irq_mode, i; + struct pcie_port_data *port_data; + int status, capabilities, irq_mode, i; int vectors[PCIE_PORT_DEVICE_MAXSERVICES]; u16 reg16; - /* Allocate port device extension */ - if (!(p_ext = kmalloc(sizeof(struct pcie_port_device_ext), GFP_KERNEL))) + port_data = kzalloc(sizeof(*port_data), GFP_KERNEL); + if (!port_data) return -ENOMEM; - - pci_set_drvdata(dev, p_ext); + pci_set_drvdata(dev, port_data); /* Get port type */ pci_read_config_word(dev, pci_find_capability(dev, PCI_CAP_ID_EXP) + PCIE_CAPABILITIES_REG, ®16); - type = (reg16 >> 4) & PORT_TYPE_MASK; + port_data->port_type = (reg16 >> 4) & PORT_TYPE_MASK; - /* Now get port services */ capabilities = get_port_device_capability(dev); + /* Root ports are capable of generating PME too */ + if (port_data->port_type == PCIE_RC_PORT) + capabilities |= PCIE_PORT_SERVICE_PME; + irq_mode = assign_interrupt_mode(dev, vectors, capabilities); - p_ext->interrupt_mode = irq_mode; + port_data->port_irq_mode = irq_mode; /* Allocate child services if any */ for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { struct pcie_device *child; if (capabilities & (1 << i)) { - child = alloc_pcie_device( - dev, /* parent */ - type, /* port type */ - i, /* service type */ - vectors[i], /* irq */ - irq_mode /* interrupt mode */); + child = alloc_pcie_device(dev, i, vectors[i]); if (child) { status = device_register(&child->device); if (status) { @@ -349,25 +321,30 @@ static int remove_iter(struct device *dev, void *data) */ void pcie_port_device_remove(struct pci_dev *dev) { - struct device *device; - unsigned long device_addr; - int interrupt_mode = PCIE_PORT_INTx_MODE; + struct pcie_port_data *port_data = pci_get_drvdata(dev); int status; do { + unsigned long device_addr; + status = device_for_each_child(&dev->dev, &device_addr, remove_iter); if (status) { - device = (struct device*)device_addr; - interrupt_mode = (to_pcie_device(device))->interrupt_mode; + struct device *device = (struct device*)device_addr; put_device(device); device_unregister(device); } } while (status); - /* Switch to INTx by default if MSI enabled */ - if (interrupt_mode == PCIE_PORT_MSIX_MODE) + + switch (port_data->port_irq_mode) { + case PCIE_PORT_MSIX_MODE: pci_disable_msix(dev); - else if (interrupt_mode == PCIE_PORT_MSI_MODE) + break; + case PCIE_PORT_MSI_MODE: pci_disable_msi(dev); + break; + } + + kfree(port_data); } /** diff --git 
a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 6cd91e3f9820..194409af1037 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -36,7 +36,6 @@ struct pcie_port_service_id { struct pcie_device { int irq; /* Service IRQ/MSI/MSI-X Vector */ - int interrupt_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ struct pcie_port_service_id id; /* Service ID */ struct pci_dev *port; /* Root/Upstream/Downstream Port */ void *priv_data; /* Service Private Data */ -- cgit v1.2.3-71-gd317 From 90e9cd50f7feeddc911325c8a8c1b7e1fccc6599 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:39:39 +0100 Subject: PCI: PCIe portdrv: Avoid using service devices with wrong interrupts The PCI Express port driver should not attempt to register service devices that require the ability to generate interrupts if generating interrupts is not possible. Namely, if the port has no interrupt pin configured and we cannot set up MSI or MSI-X for it, there is no way it can generate interrupts and in such a case the port services that rely on interrupts (PME, PCIe HP, AER) should not be enabled for it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv_core.c | 41 ++++++++++++++++++++++++++++------------- include/linux/pcieport_if.h | 1 + 2 files changed, 29 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 273e97619bce..265eba033a4a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -43,7 +43,7 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) { struct pcie_port_data *port_data = pci_get_drvdata(dev); int i, pos, nvec, status = -EINVAL; - int interrupt_mode = PCIE_PORT_INTx_MODE; + int interrupt_mode = PCIE_PORT_NO_IRQ; /* Set INTx as default */ for (i = 0, nvec = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { @@ -51,7 +51,9 @@ static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) nvec++; vectors[i] = dev->irq; } - + if (dev->pin) + interrupt_mode = PCIE_PORT_INTx_MODE; + /* Check MSI quirk */ if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk) return interrupt_mode; @@ -141,7 +143,7 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, dev->id.vendor = parent->vendor; dev->id.device = parent->device; dev->id.port_type = port_type; - dev->id.service_type = (1 << service_type); + dev->id.service_type = service_type; /* Initialize generic device interface */ device = &dev->device; @@ -232,19 +234,32 @@ int pcie_port_device_register(struct pci_dev *dev) /* Allocate child services if any */ for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { struct pcie_device *child; + int service = 1 << i; - if (capabilities & (1 << i)) { - child = alloc_pcie_device(dev, i, vectors[i]); - if (child) { - status = device_register(&child->device); - if (status) { - kfree(child); - continue; - } - get_device(&child->device); - } + if (!(capabilities & service)) + continue; + + /* + * Don't use service devices that require interrupts if there is + * no way to generate them. 
+ */ + if (irq_mode == PCIE_PORT_NO_IRQ + && service != PCIE_PORT_SERVICE_VC) + continue; + + child = alloc_pcie_device(dev, service, vectors[i]); + if (!child) + continue; + + status = device_register(&child->device); + if (status) { + kfree(child); + continue; } + + get_device(&child->device); } + return 0; } diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 194409af1037..8e1ae1fd92f6 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -22,6 +22,7 @@ #define PCIE_PORT_SERVICE_VC 8 /* Virtual Channel */ /* Root/Upstream/Downstream Port's Interrupt Mode */ +#define PCIE_PORT_NO_IRQ (-1) #define PCIE_PORT_INTx_MODE 0 #define PCIE_PORT_MSI_MODE 1 #define PCIE_PORT_MSIX_MODE 2 -- cgit v1.2.3-71-gd317 From 0516c8bcd25293f438573101c439ce25a18916ad Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:44:19 +0100 Subject: PCI: PCIe portdrv: Simplily probe callback of service drivers The second argument of the ->probe() callback in struct pcie_port_service_driver is unnecessary and never used. Remove it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_acpi.c | 3 +-- drivers/pci/hotplug/pciehp_core.c | 2 +- drivers/pci/pcie/aer/aerdrv.c | 6 ++---- drivers/pci/pcie/portdrv_core.c | 2 +- include/linux/pcieport_if.h | 3 +-- 5 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c index 438d795f9fe3..ad8835758a17 100644 --- a/drivers/pci/hotplug/pciehp_acpi.c +++ b/drivers/pci/hotplug/pciehp_acpi.c @@ -82,8 +82,7 @@ static int __initdata acpi_slot_detected; static struct list_head __initdata dummy_slots = LIST_HEAD_INIT(dummy_slots); /* Dummy driver for dumplicate name detection */ -static int __init dummy_probe(struct pcie_device *dev, - const struct pcie_port_service_id *id) +static int __init dummy_probe(struct pcie_device *dev) { int pos; u32 slot_cap; diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 681e3912b821..3429b21dbb53 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -401,7 +401,7 @@ static int get_cur_bus_speed(struct hotplug_slot *hotplug_slot, enum pci_bus_spe return 0; } -static int pciehp_probe(struct pcie_device *dev, const struct pcie_port_service_id *id) +static int pciehp_probe(struct pcie_device *dev) { int rc; struct controller *ctrl; diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index e390707661dd..57c41204c549 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -38,8 +38,7 @@ MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); -static int __devinit aer_probe (struct pcie_device *dev, - const struct pcie_port_service_id *id ); +static int __devinit aer_probe (struct pcie_device *dev); static void aer_remove(struct pcie_device *dev); static int aer_suspend(struct pcie_device *dev, pm_message_t state) {return 0;} @@ -207,8 +206,7 @@ static void aer_remove(struct pcie_device *dev) * * Invoked when PCI Express bus loads AER service driver. 
**/ -static int __devinit aer_probe (struct pcie_device *dev, - const struct pcie_port_service_id *id ) +static int __devinit aer_probe (struct pcie_device *dev) { int status; struct aer_rpc *rpc; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 91ecbc43155f..682524b0c93a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -402,7 +402,7 @@ static int pcie_port_probe_service(struct device *dev) return -ENODEV; pciedev = to_pcie_device(dev); - status = driver->probe(pciedev, driver->id_table); + status = driver->probe(pciedev); if (!status) { dev_printk(KERN_DEBUG, dev, "service driver %s loaded\n", driver->name); diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 8e1ae1fd92f6..59e90b8a7839 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -56,8 +56,7 @@ static inline void* get_service_data(struct pcie_device *dev) struct pcie_port_service_driver { const char *name; - int (*probe) (struct pcie_device *dev, - const struct pcie_port_service_id *id); + int (*probe) (struct pcie_device *dev); void (*remove) (struct pcie_device *dev); int (*suspend) (struct pcie_device *dev, pm_message_t state); int (*resume) (struct pcie_device *dev); -- cgit v1.2.3-71-gd317 From 22106368c999246c414610dcaacd485e741605b1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 13 Jan 2009 14:46:46 +0100 Subject: PCI: PCIe portdrv: Remove struct pcie_port_service_id The PCI Express port driver uses 'struct pcie_port_service_id' for matching port service devices and drivers, but this structure contains fields that duplicate information from the port device itself (vendor, device, subvendor, subdevice) and fields that are not used by any existing port service driver (class, class_mask, driver_data). Also, both existing port service drivers (AER and PCIe HP) don't even use the vendor and device fields for device matching. Therefore 'struct pcie_port_service_id' can be removed altogether and the only useful members of it (port_type, service) can be introduced directly into the port service device and port service driver structures. That simplifies the code quite a bit and reduces its size. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_acpi.c | 13 ++----------- drivers/pci/hotplug/pciehp_core.c | 12 ++---------- drivers/pci/pcie/aer/aerdrv.c | 16 ++-------------- drivers/pci/pcie/aer/aerdrv_core.c | 10 +++++----- drivers/pci/pcie/portdrv.h | 5 ----- drivers/pci/pcie/portdrv_bus.c | 18 ++++++++++-------- drivers/pci/pcie/portdrv_core.c | 5 +---- include/linux/pcieport_if.h | 17 ++++++++--------- 8 files changed, 30 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c index ad8835758a17..21734c311529 100644 --- a/drivers/pci/hotplug/pciehp_acpi.c +++ b/drivers/pci/hotplug/pciehp_acpi.c @@ -67,16 +67,6 @@ static int __init parse_detect_mode(void) return PCIEHP_DETECT_DEFAULT; } -static struct pcie_port_service_id __initdata port_pci_ids[] = { - { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_ANY_PORT, - .service_type = PCIE_PORT_SERVICE_HP, - .driver_data = 0, - }, { /* end: all zeroes */ } -}; - static int __initdata dup_slot_id; static int __initdata acpi_slot_detected; static struct list_head __initdata dummy_slots = LIST_HEAD_INIT(dummy_slots); @@ -110,7 +100,8 @@ static int __init dummy_probe(struct pcie_device *dev) static struct pcie_port_service_driver __initdata dummy_driver = { .name = "pciehp_dummy", - .id_table = port_pci_ids, + .port_type = PCIE_ANY_PORT, + .service = PCIE_PORT_SERVICE_HP, .probe = dummy_probe, }; diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 3429b21dbb53..3d21bbba3308 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -505,18 +505,10 @@ static int pciehp_resume (struct pcie_device *dev) } #endif -static struct pcie_port_service_id port_pci_ids[] = { { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_ANY_PORT, - .service_type = PCIE_PORT_SERVICE_HP, - .driver_data = 0, - }, { /* end: all zeroes */ } -}; - static struct pcie_port_service_driver hpdriver_portdrv = { .name = PCIE_MODULE_NAME, - .id_table = &port_pci_ids[0], + .port_type = PCIE_ANY_PORT, + .service = PCIE_PORT_SERVICE_HP, .probe = pciehp_probe, .remove = pciehp_remove, diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index 57c41204c549..e11c03194063 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -48,19 +48,6 @@ static pci_ers_result_t aer_error_detected(struct pci_dev *dev, static void aer_error_resume(struct pci_dev *dev); static pci_ers_result_t aer_root_reset(struct pci_dev *dev); -/* - * PCI Express bus's AER Root service driver data structure - */ -static struct pcie_port_service_id aer_id[] = { - { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_RC_PORT, - .service_type = PCIE_PORT_SERVICE_AER, - }, - { /* end: all zeroes */ } -}; - static struct pci_error_handlers aer_error_handlers = { .error_detected = aer_error_detected, .resume = aer_error_resume, @@ -68,7 +55,8 @@ static struct pci_error_handlers aer_error_handlers = { static struct pcie_port_service_driver aerdriver = { .name = "aer", - .id_table = &aer_id[0], + .port_type = PCIE_ANY_PORT, + .service = PCIE_PORT_SERVICE_AER, .probe = aer_probe, .remove = aer_remove, diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 382575007382..307452f30035 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -351,21 +351,21 @@ static int 
find_aer_service_iter(struct device *device, void *data) { struct device_driver *driver; struct pcie_port_service_driver *service_driver; - struct pcie_device *pcie_dev; struct find_aer_service_data *result; result = (struct find_aer_service_data *) data; if (device->bus == &pcie_port_bus_type) { - pcie_dev = to_pcie_device(device); - if (pcie_dev->id.port_type == PCIE_SW_DOWNSTREAM_PORT) + struct pcie_port_data *port_data; + + port_data = pci_get_drvdata(to_pcie_device(device)->port); + if (port_data->port_type == PCIE_SW_DOWNSTREAM_PORT) result->is_downstream = 1; driver = device->driver; if (driver) { service_driver = to_service_driver(driver); - if (service_driver->id_table->service_type == - PCIE_PORT_SERVICE_AER) { + if (service_driver->service == PCIE_PORT_SERVICE_AER) { result->aer_driver = service_driver; return 1; } diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index b0dcbc73415e..ad4d082a0344 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -28,11 +28,6 @@ #define get_descriptor_id(type, service) (((type - 4) << 4) | service) -struct pcie_port_data { - int port_type; /* Type of the port */ - int port_irq_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ -}; - extern struct bus_type pcie_port_bus_type; extern int pcie_port_device_probe(struct pci_dev *dev); extern int pcie_port_device_register(struct pci_dev *dev); diff --git a/drivers/pci/pcie/portdrv_bus.c b/drivers/pci/pcie/portdrv_bus.c index eec89b767f9f..ef3a4eeaebb4 100644 --- a/drivers/pci/pcie/portdrv_bus.c +++ b/drivers/pci/pcie/portdrv_bus.c @@ -26,20 +26,22 @@ EXPORT_SYMBOL_GPL(pcie_port_bus_type); static int pcie_port_bus_match(struct device *dev, struct device_driver *drv) { struct pcie_device *pciedev; + struct pcie_port_data *port_data; struct pcie_port_service_driver *driver; if (drv->bus != &pcie_port_bus_type || dev->bus != &pcie_port_bus_type) return 0; - + pciedev = to_pcie_device(dev); driver = to_service_driver(drv); - if ( (driver->id_table->vendor != PCI_ANY_ID && - driver->id_table->vendor != pciedev->id.vendor) || - (driver->id_table->device != PCI_ANY_ID && - driver->id_table->device != pciedev->id.device) || - (driver->id_table->port_type != PCIE_ANY_PORT && - driver->id_table->port_type != pciedev->id.port_type) || - driver->id_table->service_type != pciedev->id.service_type ) + + if (driver->service != pciedev->service) + return 0; + + port_data = pci_get_drvdata(pciedev->port); + + if (driver->port_type != PCIE_ANY_PORT + && driver->port_type != port_data->port_type) return 0; return 1; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 682524b0c93a..843d9e30dd3b 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -140,10 +140,7 @@ static void pcie_device_init(struct pci_dev *parent, struct pcie_device *dev, dev->port = parent; dev->irq = irq; - dev->id.vendor = parent->vendor; - dev->id.device = parent->device; - dev->id.port_type = port_type; - dev->id.service_type = service_type; + dev->service = service_type; /* Initialize generic device interface */ device = &dev->device; diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 59e90b8a7839..a3832079508e 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -27,18 +27,15 @@ #define PCIE_PORT_MSI_MODE 1 #define PCIE_PORT_MSIX_MODE 2 -struct pcie_port_service_id { - __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ - __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ - 
__u32 class, class_mask; /* (class,subclass,prog-if) triplet */ - __u32 port_type, service_type; /* Port Entity */ - kernel_ulong_t driver_data; +struct pcie_port_data { + int port_type; /* Type of the port */ + int port_irq_mode; /* [0:INTx | 1:MSI | 2:MSI-X] */ }; struct pcie_device { int irq; /* Service IRQ/MSI/MSI-X Vector */ - struct pcie_port_service_id id; /* Service ID */ - struct pci_dev *port; /* Root/Upstream/Downstream Port */ + struct pci_dev *port; /* Root/Upstream/Downstream Port */ + u32 service; /* Port service this device represents */ void *priv_data; /* Service Private Data */ struct device device; /* Generic Device Interface */ }; @@ -67,7 +64,9 @@ struct pcie_port_service_driver { /* Link Reset Capability - AER service driver specific */ pci_ers_result_t (*reset_link) (struct pci_dev *dev); - const struct pcie_port_service_id *id_table; + int port_type; /* Type of the port this driver can handle */ + u32 service; /* Port service this device represents */ + struct device_driver driver; }; #define to_service_driver(d) \ -- cgit v1.2.3-71-gd317 From a52e2e3513d4beafe8fe8699f1519b021c2d05ba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 24 Jan 2009 00:21:14 +0100 Subject: PCI/MSI: Introduce pci_msix_table_size() Introduce new function pci_msix_table_size() returning the size of the MSI-X table of given PCI device or 0 if the device doesn't support MSI-X. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hidetoshi Seto Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 24 +++++++++++++++++++----- include/linux/pci.h | 5 +++++ 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index baba2eb5367d..08aedd5875b0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -674,6 +674,23 @@ static int msi_free_irqs(struct pci_dev* dev) return 0; } +/** + * pci_msix_table_size - return the number of device's MSI-X table entries + * @dev: pointer to the pci_dev data structure of MSI-X device function + */ +int pci_msix_table_size(struct pci_dev *dev) +{ + int pos; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return 0; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + return multi_msix_capable(control); +} + /** * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function @@ -691,9 +708,8 @@ static int msi_free_irqs(struct pci_dev* dev) **/ int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) { - int status, pos, nr_entries; + int status, nr_entries; int i, j; - u16 control; if (!entries) return -EINVAL; @@ -702,9 +718,7 @@ int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) if (status) return status; - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - pci_read_config_word(dev, msi_control_reg(pos), &control); - nr_entries = multi_msix_capable(control); + nr_entries = pci_msix_table_size(dev); if (nvec > nr_entries) return -EINVAL; diff --git a/include/linux/pci.h b/include/linux/pci.h index 7bd624bfdcfd..b5d6d0e0f1cb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -799,6 +799,10 @@ static inline void pci_msi_shutdown(struct pci_dev *dev) static inline void pci_disable_msi(struct pci_dev *dev) { } +static inline int pci_msix_table_size(struct pci_dev *dev) +{ + return 0; +} static inline int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) { @@ -823,6 +827,7 @@ static inline 
int pci_msi_enabled(void) extern int pci_enable_msi(struct pci_dev *dev); extern void pci_msi_shutdown(struct pci_dev *dev); extern void pci_disable_msi(struct pci_dev *dev); +extern int pci_msix_table_size(struct pci_dev *dev); extern int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec); extern void pci_msix_shutdown(struct pci_dev *dev); -- cgit v1.2.3-71-gd317 From b43d451385ef833e0696032aac2629da04d46c59 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 24 Jan 2009 00:23:22 +0100 Subject: PCI/PCIe portdrv: Fix allocation of interrupts If MSI-X interrupt mode is used by the PCI Express port driver, too many vectors are allocated and it is not ensured that the right vectors will be used for the right services. Namely, the PCI Express specification states that both PCI Express native PME and PCI Express hotplug will always use the same MSI or MSI-X message for signalling interrupts, which implies that the same vector will be used by both of them. Also, the VC service does not use interrupts at all. Moreover, is not clear which of the vectors allocated by pci_enable_msix() in the current code will be used for PME and hotplug and which of them will be used for AER if all of these services are configured. For these reasons, rework the allocation of interrupts for PCI Express ports so that if MSI-X are enabled, the right vectors will be used for the right purposes. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hidetoshi Seto Signed-off-by: Jesse Barnes --- drivers/pci/pcie/portdrv.h | 6 ++ drivers/pci/pcie/portdrv_core.c | 206 ++++++++++++++++++++++++++++++++-------- include/linux/pcieport_if.h | 12 ++- 3 files changed, 181 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index ad4d082a0344..5b818bd835ef 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -25,6 +25,12 @@ #define PCIE_CAPABILITIES_REG 0x2 #define PCIE_SLOT_CAPABILITIES_REG 0x14 #define PCIE_PORT_DEVICE_MAXSERVICES 4 +#define PCIE_PORT_MSI_VECTOR_MASK 0x1f +/* + * According to the PCI Express Base Specification 2.0, the indices of the MSI-X + * table entires used by port services must not exceed 31 + */ +#define PCIE_PORT_MAX_MSIX_ENTRIES 32 #define get_descriptor_id(type, service) (((type - 4) << 4) | service) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 843d9e30dd3b..3aea92a92928 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -30,6 +30,152 @@ static void release_pcie_device(struct device *dev) kfree(to_pcie_device(dev)); } +/** + * pcie_port_msix_add_entry - add entry to given array of MSI-X entries + * @entries: Array of MSI-X entries + * @new_entry: Index of the entry to add to the array + * @nr_entries: Number of entries aleady in the array + * + * Return value: Position of the added entry in the array + */ +static int pcie_port_msix_add_entry( + struct msix_entry *entries, int new_entry, int nr_entries) +{ + int j; + + for (j = 0; j < nr_entries; j++) + if (entries[j].entry == new_entry) + return j; + + entries[j].entry = new_entry; + return j; +} + +/** + * pcie_port_enable_msix - try to set up MSI-X as interrupt mode for given port + * @dev: PCI Express port to handle + * @vectors: Array of interrupt vectors to populate + * @mask: Bitmask of port capabilities returned by get_port_device_capability() + * + * Return value: 0 on success, error code on failure + */ +static int pcie_port_enable_msix(struct 
pci_dev *dev, int *vectors, int mask) +{ + struct msix_entry *msix_entries; + int idx[PCIE_PORT_DEVICE_MAXSERVICES]; + int nr_entries, status, pos, i, nvec; + u16 reg16; + u32 reg32; + + nr_entries = pci_msix_table_size(dev); + if (!nr_entries) + return -EINVAL; + if (nr_entries > PCIE_PORT_MAX_MSIX_ENTRIES) + nr_entries = PCIE_PORT_MAX_MSIX_ENTRIES; + + msix_entries = kzalloc(sizeof(*msix_entries) * nr_entries, GFP_KERNEL); + if (!msix_entries) + return -ENOMEM; + + /* + * Allocate as many entries as the port wants, so that we can check + * which of them will be useful. Moreover, if nr_entries is correctly + * equal to the number of entries this port actually uses, we'll happily + * go through without any tricks. + */ + for (i = 0; i < nr_entries; i++) + msix_entries[i].entry = i; + + status = pci_enable_msix(dev, msix_entries, nr_entries); + if (status) + goto Exit; + + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + idx[i] = -1; + status = -EIO; + nvec = 0; + + if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) { + int entry; + + /* + * The code below follows the PCI Express Base Specification 2.0 + * stating in Section 6.1.6 that "PME and Hot-Plug Event + * interrupts (when both are implemented) always share the same + * MSI or MSI-X vector, as indicated by the Interrupt Message + * Number field in the PCI Express Capabilities register", where + * according to Section 7.8.2 of the specification "For MSI-X, + * the value in this field indicates which MSI-X Table entry is + * used to generate the interrupt message." + */ + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); + pci_read_config_word(dev, pos + PCIE_CAPABILITIES_REG, ®16); + entry = (reg16 >> 9) & PCIE_PORT_MSI_VECTOR_MASK; + if (entry >= nr_entries) + goto Error; + + i = pcie_port_msix_add_entry(msix_entries, entry, nvec); + if (i == nvec) + nvec++; + + idx[PCIE_PORT_SERVICE_PME_SHIFT] = i; + idx[PCIE_PORT_SERVICE_HP_SHIFT] = i; + } + + if (mask & PCIE_PORT_SERVICE_AER) { + int entry; + + /* + * The code below follows Section 7.10.10 of the PCI Express + * Base Specification 2.0 stating that bits 31-27 of the Root + * Error Status Register contain a value indicating which of the + * MSI/MSI-X vectors assigned to the port is going to be used + * for AER, where "For MSI-X, the value in this register + * indicates which MSI-X Table entry is used to generate the + * interrupt message." + */ + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, ®32); + entry = reg32 >> 27; + if (entry >= nr_entries) + goto Error; + + i = pcie_port_msix_add_entry(msix_entries, entry, nvec); + if (i == nvec) + nvec++; + + idx[PCIE_PORT_SERVICE_AER_SHIFT] = i; + } + + /* + * If nvec is equal to the allocated number of entries, we can just use + * what we have. Otherwise, the port has some extra entries not for the + * services we know and we need to work around that. + */ + if (nvec == nr_entries) { + status = 0; + } else { + /* Drop the temporary MSI-X setup */ + pci_disable_msix(dev); + + /* Now allocate the MSI-X vectors for real */ + status = pci_enable_msix(dev, msix_entries, nvec); + if (status) + goto Exit; + } + + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + vectors[i] = idx[i] >= 0 ? 
msix_entries[idx[i]].vector : -1; + + Exit: + kfree(msix_entries); + return status; + + Error: + pci_disable_msix(dev); + goto Exit; +} + /** * assign_interrupt_mode - choose interrupt mode for PCI Express port services * (INTx, MSI-X, MSI) and set up vectors @@ -42,49 +188,31 @@ static void release_pcie_device(struct device *dev) static int assign_interrupt_mode(struct pci_dev *dev, int *vectors, int mask) { struct pcie_port_data *port_data = pci_get_drvdata(dev); - int i, pos, nvec, status = -EINVAL; - int interrupt_mode = PCIE_PORT_NO_IRQ; - - /* Set INTx as default */ - for (i = 0, nvec = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { - if (mask & (1 << i)) - nvec++; - vectors[i] = dev->irq; - } - if (dev->pin) - interrupt_mode = PCIE_PORT_INTx_MODE; + int irq, interrupt_mode = PCIE_PORT_NO_IRQ; + int i; /* Check MSI quirk */ if (port_data->port_type == PCIE_RC_PORT && pcie_mch_quirk) - return interrupt_mode; + goto Fallback; + + /* Try to use MSI-X if supported */ + if (!pcie_port_enable_msix(dev, vectors, mask)) + return PCIE_PORT_MSIX_MODE; + + /* We're not going to use MSI-X, so try MSI and fall back to INTx */ + if (!pci_enable_msi(dev)) + interrupt_mode = PCIE_PORT_MSI_MODE; + + Fallback: + if (interrupt_mode == PCIE_PORT_NO_IRQ && dev->pin) + interrupt_mode = PCIE_PORT_INTx_MODE; + + irq = interrupt_mode != PCIE_PORT_NO_IRQ ? dev->irq : -1; + for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) + vectors[i] = irq; + + vectors[PCIE_PORT_SERVICE_VC_SHIFT] = -1; - /* Select MSI-X over MSI if supported */ - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (pos) { - struct msix_entry msix_entries[PCIE_PORT_DEVICE_MAXSERVICES] = - {{0, 0}, {0, 1}, {0, 2}, {0, 3}}; - status = pci_enable_msix(dev, msix_entries, nvec); - if (!status) { - int j = 0; - - interrupt_mode = PCIE_PORT_MSIX_MODE; - for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) { - if (mask & (1 << i)) - vectors[i] = msix_entries[j++].vector; - } - } - } - if (status) { - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (pos) { - status = pci_enable_msi(dev); - if (!status) { - interrupt_mode = PCIE_PORT_MSI_MODE; - for (i = 0;i < PCIE_PORT_DEVICE_MAXSERVICES;i++) - vectors[i] = dev->irq; - } - } - } return interrupt_mode; } diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index a3832079508e..5d2afcfa6bc1 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -16,10 +16,14 @@ #define PCIE_ANY_PORT 7 /* Service Type */ -#define PCIE_PORT_SERVICE_PME 1 /* Power Management Event */ -#define PCIE_PORT_SERVICE_AER 2 /* Advanced Error Reporting */ -#define PCIE_PORT_SERVICE_HP 4 /* Native Hotplug */ -#define PCIE_PORT_SERVICE_VC 8 /* Virtual Channel */ +#define PCIE_PORT_SERVICE_PME_SHIFT 0 /* Power Management Event */ +#define PCIE_PORT_SERVICE_PME (1 << PCIE_PORT_SERVICE_PME_SHIFT) +#define PCIE_PORT_SERVICE_AER_SHIFT 1 /* Advanced Error Reporting */ +#define PCIE_PORT_SERVICE_AER (1 << PCIE_PORT_SERVICE_AER_SHIFT) +#define PCIE_PORT_SERVICE_HP_SHIFT 2 /* Native Hotplug */ +#define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT) +#define PCIE_PORT_SERVICE_VC_SHIFT 3 /* Virtual Channel */ +#define PCIE_PORT_SERVICE_VC (1 << PCIE_PORT_SERVICE_VC_SHIFT) /* Root/Upstream/Downstream Port's Interrupt Mode */ #define PCIE_PORT_NO_IRQ (-1) -- cgit v1.2.3-71-gd317 From 63f10f0f6df4e4e860b790d64bebfde85b540b0a Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 9 Feb 2009 15:59:29 +0900 Subject: PCI/ACPI: move _OSC code to pci_root.c Move PCI _OSC management code from 
drivers/pci/pci-acpi.c to drivers/acpi/pci_root.c. The benefits are - We no longer need struct osc_data and its management code (contents are moved to struct acpi_pci_root). This simplify the code, and we no longer care about kmalloc() failure. - We can make pci_acpi_osc_support() be a static function, which is called only from drivers/acpi/pci_root.c. Signed-off-by: Kenji Kaneshige Reviewed-by: Andrew Patterson Tested-by: Andrew Patterson Acked-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/acpi/pci_root.c | 180 ++++++++++++++++++++++++++++++++++++++- drivers/pci/pci-acpi.c | 215 ----------------------------------------------- include/linux/pci-acpi.h | 1 - 3 files changed, 178 insertions(+), 218 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 5b38a026d122..979eccc82c5b 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -66,11 +66,18 @@ struct acpi_pci_root { struct acpi_device * device; struct acpi_pci_id id; struct pci_bus *bus; + + u32 osc_support_set; /* _OSC state of support bits */ + u32 osc_control_set; /* _OSC state of control bits */ + u32 osc_control_qry; /* the latest _OSC query result */ + + u32 osc_queried:1; /* has _OSC control been queried? */ }; static LIST_HEAD(acpi_pci_roots); static struct acpi_pci_driver *sub_driver; +static DEFINE_MUTEX(osc_lock); int acpi_pci_register_driver(struct acpi_pci_driver *driver) { @@ -185,6 +192,175 @@ static void acpi_pci_bridge_scan(struct acpi_device *device) } } +static u8 OSC_UUID[16] = {0x5B, 0x4D, 0xDB, 0x33, 0xF7, 0x1F, 0x1C, 0x40, + 0x96, 0x57, 0x74, 0x41, 0xC0, 0x3D, 0xD7, 0x66}; + +static acpi_status acpi_pci_run_osc(acpi_handle handle, + const u32 *capbuf, u32 *retval) +{ + acpi_status status; + struct acpi_object_list input; + union acpi_object in_params[4]; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *out_obj; + u32 errors; + + /* Setting up input parameters */ + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 3; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 12; + in_params[3].buffer.pointer = (u8 *)capbuf; + + status = acpi_evaluate_object(handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return status; + + if (!output.length) + return AE_NULL_OBJECT; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + printk(KERN_DEBUG "_OSC evaluation returned wrong type\n"); + status = AE_TYPE; + goto out_kfree; + } + /* Need to ignore the bit0 in result code */ + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + if (errors & OSC_REQUEST_ERROR) + printk(KERN_DEBUG "_OSC request failed\n"); + if (errors & OSC_INVALID_UUID_ERROR) + printk(KERN_DEBUG "_OSC invalid UUID\n"); + if (errors & OSC_INVALID_REVISION_ERROR) + printk(KERN_DEBUG "_OSC invalid revision\n"); + if (errors & OSC_CAPABILITIES_MASK_ERROR) { + if (capbuf[OSC_QUERY_TYPE] & OSC_QUERY_ENABLE) + goto out_success; + printk(KERN_DEBUG + "Firmware did not grant requested _OSC control\n"); + status = AE_SUPPORT; + goto out_kfree; + } + status = AE_ERROR; + goto out_kfree; + } +out_success: + *retval = *((u32 *)(out_obj->buffer.pointer + 8)); + status = AE_OK; + +out_kfree: + kfree(output.pointer); + return status; +} 
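/*
 * Editorial illustration, not part of the patch above: the buffer handed
 * to acpi_pci_run_osc() is three DWORDs, indexed by OSC_QUERY_TYPE,
 * OSC_SUPPORT_TYPE and OSC_CONTROL_TYPE (still defined in
 * include/linux/pci-acpi.h at this point in the series).  The sketch
 * below is a compact restatement of what acpi_pci_query_osc() following
 * it does; the function name example_osc_query() is invented here and
 * the snippet assumes the surrounding pci_root.c context.
 */
static acpi_status example_osc_query(struct acpi_pci_root *root)
{
	u32 result, capbuf[3];

	capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE;	/* query only, commit nothing */
	capbuf[OSC_SUPPORT_TYPE] = OSC_PCI_SEGMENT_GROUPS_SUPPORT;
	capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS;	/* ask about every control */

	/*
	 * DW0 of the returned buffer carries the error bits checked in
	 * acpi_pci_run_osc(); on success, 'result' (taken from offset 8,
	 * i.e. DW2) holds the controls firmware would be willing to grant.
	 */
	return acpi_pci_run_osc(root->device->handle, capbuf, &result);
}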
+ +static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root, u32 flags) +{ + acpi_status status; + u32 support_set, result, capbuf[3]; + + /* do _OSC query for all possible controls */ + support_set = root->osc_support_set | (flags & OSC_SUPPORT_MASKS); + capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; + capbuf[OSC_SUPPORT_TYPE] = support_set; + capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS; + + status = acpi_pci_run_osc(root->device->handle, capbuf, &result); + if (ACPI_SUCCESS(status)) { + root->osc_support_set = support_set; + root->osc_control_qry = result; + root->osc_queried = 1; + } + return status; +} + +static acpi_status acpi_pci_osc_support(struct acpi_pci_root *root, u32 flags) +{ + acpi_status status; + acpi_handle tmp; + + status = acpi_get_handle(root->device->handle, "_OSC", &tmp); + if (ACPI_FAILURE(status)) + return status; + mutex_lock(&osc_lock); + status = acpi_pci_query_osc(root, flags); + mutex_unlock(&osc_lock); + return status; +} + +static struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle) +{ + struct acpi_pci_root *root; + list_for_each_entry(root, &acpi_pci_roots, node) { + if (root->device->handle == handle) + return root; + } + return NULL; +} + +/** + * pci_osc_control_set - commit requested control to Firmware + * @handle: acpi_handle for the target ACPI object + * @flags: driver's requested control bits + * + * Attempt to take control from Firmware on requested control bits. + **/ +acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) +{ + acpi_status status; + u32 control_req, result, capbuf[3]; + acpi_handle tmp; + struct acpi_pci_root *root; + + status = acpi_get_handle(handle, "_OSC", &tmp); + if (ACPI_FAILURE(status)) + return status; + + control_req = (flags & OSC_CONTROL_MASKS); + if (!control_req) + return AE_TYPE; + + root = acpi_pci_find_root(handle); + if (!root) + return AE_NOT_EXIST; + + mutex_lock(&osc_lock); + /* No need to evaluate _OSC if the control was already granted. */ + if ((root->osc_control_set & control_req) == control_req) + goto out; + + /* Need to query controls first before requesting them */ + if (!root->osc_queried) { + status = acpi_pci_query_osc(root, root->osc_support_set); + if (ACPI_FAILURE(status)) + goto out; + } + if ((root->osc_control_qry & control_req) != control_req) { + printk(KERN_DEBUG + "Firmware did not grant requested _OSC control\n"); + status = AE_SUPPORT; + goto out; + } + + capbuf[OSC_QUERY_TYPE] = 0; + capbuf[OSC_SUPPORT_TYPE] = root->osc_support_set; + capbuf[OSC_CONTROL_TYPE] = root->osc_control_set | control_req; + status = acpi_pci_run_osc(handle, capbuf, &result); + if (ACPI_SUCCESS(status)) + root->osc_control_set = result; +out: + mutex_unlock(&osc_lock); + return status; +} +EXPORT_SYMBOL(pci_osc_control_set); + static int __devinit acpi_pci_root_add(struct acpi_device *device) { int result = 0; @@ -217,7 +393,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device) * PCI domains, so we indicate this in _OSC support capabilities. 
*/ flags = base_flags = OSC_PCI_SEGMENT_GROUPS_SUPPORT; - pci_acpi_osc_support(device->handle, flags); + acpi_pci_osc_support(root, flags); /* * Segment @@ -353,7 +529,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device) if (pci_msi_enabled()) flags |= OSC_MSI_SUPPORT; if (flags != base_flags) - pci_acpi_osc_support(device->handle, flags); + acpi_pci_osc_support(root, flags); end: if (result) { diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index deea8a187eb8..fac5eddcefd2 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -18,221 +18,6 @@ #include #include "pci.h" -struct acpi_osc_data { - acpi_handle handle; - u32 support_set; - u32 control_set; - u32 control_query; - int is_queried; - struct list_head sibiling; -}; -static LIST_HEAD(acpi_osc_data_list); - -struct acpi_osc_args { - u32 capbuf[3]; -}; - -static DEFINE_MUTEX(pci_acpi_lock); - -static struct acpi_osc_data *acpi_get_osc_data(acpi_handle handle) -{ - struct acpi_osc_data *data; - - list_for_each_entry(data, &acpi_osc_data_list, sibiling) { - if (data->handle == handle) - return data; - } - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return NULL; - INIT_LIST_HEAD(&data->sibiling); - data->handle = handle; - list_add_tail(&data->sibiling, &acpi_osc_data_list); - return data; -} - -static u8 OSC_UUID[16] = {0x5B, 0x4D, 0xDB, 0x33, 0xF7, 0x1F, 0x1C, 0x40, - 0x96, 0x57, 0x74, 0x41, 0xC0, 0x3D, 0xD7, 0x66}; - -static acpi_status acpi_run_osc(acpi_handle handle, - struct acpi_osc_args *osc_args, u32 *retval) -{ - acpi_status status; - struct acpi_object_list input; - union acpi_object in_params[4]; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object *out_obj; - u32 errors, flags = osc_args->capbuf[OSC_QUERY_TYPE]; - - /* Setting up input parameters */ - input.count = 4; - input.pointer = in_params; - in_params[0].type = ACPI_TYPE_BUFFER; - in_params[0].buffer.length = 16; - in_params[0].buffer.pointer = OSC_UUID; - in_params[1].type = ACPI_TYPE_INTEGER; - in_params[1].integer.value = 1; - in_params[2].type = ACPI_TYPE_INTEGER; - in_params[2].integer.value = 3; - in_params[3].type = ACPI_TYPE_BUFFER; - in_params[3].buffer.length = 12; - in_params[3].buffer.pointer = (u8 *)osc_args->capbuf; - - status = acpi_evaluate_object(handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return status; - - if (!output.length) - return AE_NULL_OBJECT; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - printk(KERN_DEBUG "Evaluate _OSC returns wrong type\n"); - status = AE_TYPE; - goto out_kfree; - } - /* Need to ignore the bit0 in result code */ - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - if (errors & OSC_REQUEST_ERROR) - printk(KERN_DEBUG "_OSC request fails\n"); - if (errors & OSC_INVALID_UUID_ERROR) - printk(KERN_DEBUG "_OSC invalid UUID\n"); - if (errors & OSC_INVALID_REVISION_ERROR) - printk(KERN_DEBUG "_OSC invalid revision\n"); - if (errors & OSC_CAPABILITIES_MASK_ERROR) { - if (flags & OSC_QUERY_ENABLE) - goto out_success; - printk(KERN_DEBUG "_OSC FW not grant req. 
control\n"); - status = AE_SUPPORT; - goto out_kfree; - } - status = AE_ERROR; - goto out_kfree; - } -out_success: - *retval = *((u32 *)(out_obj->buffer.pointer + 8)); - status = AE_OK; - -out_kfree: - kfree(output.pointer); - return status; -} - -static acpi_status __acpi_query_osc(u32 flags, struct acpi_osc_data *osc_data) -{ - acpi_status status; - u32 support_set, result; - struct acpi_osc_args osc_args; - - /* do _OSC query for all possible controls */ - support_set = osc_data->support_set | (flags & OSC_SUPPORT_MASKS); - osc_args.capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; - osc_args.capbuf[OSC_SUPPORT_TYPE] = support_set; - osc_args.capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS; - - status = acpi_run_osc(osc_data->handle, &osc_args, &result); - if (ACPI_SUCCESS(status)) { - osc_data->support_set = support_set; - osc_data->control_query = result; - osc_data->is_queried = 1; - } - - return status; -} - -/* - * pci_acpi_osc_support: Invoke _OSC indicating support for the given feature - * @flags: Bitmask of flags to support - * - * See the ACPI spec for the definition of the flags - */ -int pci_acpi_osc_support(acpi_handle handle, u32 flags) -{ - acpi_status status; - acpi_handle tmp; - struct acpi_osc_data *osc_data; - int rc = 0; - - status = acpi_get_handle(handle, "_OSC", &tmp); - if (ACPI_FAILURE(status)) - return -ENOTTY; - - mutex_lock(&pci_acpi_lock); - osc_data = acpi_get_osc_data(handle); - if (!osc_data) { - printk(KERN_ERR "acpi osc data array is full\n"); - rc = -ENOMEM; - goto out; - } - - __acpi_query_osc(flags, osc_data); -out: - mutex_unlock(&pci_acpi_lock); - return rc; -} - -/** - * pci_osc_control_set - commit requested control to Firmware - * @handle: acpi_handle for the target ACPI object - * @flags: driver's requested control bits - * - * Attempt to take control from Firmware on requested control bits. - **/ -acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) -{ - acpi_status status; - u32 control_req, control_set, result; - acpi_handle tmp; - struct acpi_osc_data *osc_data; - struct acpi_osc_args osc_args; - - status = acpi_get_handle(handle, "_OSC", &tmp); - if (ACPI_FAILURE(status)) - return status; - - mutex_lock(&pci_acpi_lock); - osc_data = acpi_get_osc_data(handle); - if (!osc_data) { - printk(KERN_ERR "acpi osc data array is full\n"); - status = AE_ERROR; - goto out; - } - - control_req = (flags & OSC_CONTROL_MASKS); - if (!control_req) { - status = AE_TYPE; - goto out; - } - - /* No need to evaluate _OSC if the control was already granted. */ - if ((osc_data->control_set & control_req) == control_req) - goto out; - - if (!osc_data->is_queried) { - status = __acpi_query_osc(osc_data->support_set, osc_data); - if (ACPI_FAILURE(status)) - goto out; - } - - if ((osc_data->control_query & control_req) != control_req) { - status = AE_SUPPORT; - goto out; - } - - control_set = osc_data->control_set | control_req; - osc_args.capbuf[OSC_QUERY_TYPE] = 0; - osc_args.capbuf[OSC_SUPPORT_TYPE] = osc_data->support_set; - osc_args.capbuf[OSC_CONTROL_TYPE] = control_set; - status = acpi_run_osc(handle, &osc_args, &result); - if (ACPI_SUCCESS(status)) - osc_data->control_set = result; -out: - mutex_unlock(&pci_acpi_lock); - return status; -} -EXPORT_SYMBOL(pci_osc_control_set); - /* * _SxD returns the D-state with the highest power * (lowest D-state number) supported in the S-state "x". 
diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 042c166f65d5..65cb103b21db 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -50,7 +50,6 @@ #ifdef CONFIG_ACPI extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags); -int pci_acpi_osc_support(acpi_handle handle, u32 flags); static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { /* Find root host bridge */ -- cgit v1.2.3-71-gd317 From 9f5404d8ea90bfa4d58a3936e5a3d0d28cecf60f Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 9 Feb 2009 16:00:04 +0900 Subject: PCI/ACPI: rename pci_osc_control_set() - Rename pci_osc_control_set() to acpi_pci_osc_control_set() according to the other API names in drivers/acpi/pci_root.c. - Move _OSC related definitions to include/linux/acpi.h because _OSC related API is implemented in drivers/acpi/pci_root.c now. Signed-off-by: Kenji Kaneshige Reviewed-by: Andrew Patterson Tested-by: Andrew Patterson Signed-off-by: Jesse Barnes --- drivers/acpi/pci_root.c | 6 ++--- drivers/pci/hotplug/acpi_pcihp.c | 5 ++--- drivers/pci/pcie/aer/aerdrv_acpi.c | 2 +- include/linux/acpi.h | 34 ++++++++++++++++++++++++++++ include/linux/pci-acpi.h | 45 -------------------------------------- 5 files changed, 40 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 979eccc82c5b..196f97d00956 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -306,13 +306,13 @@ static struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle) } /** - * pci_osc_control_set - commit requested control to Firmware + * acpi_pci_osc_control_set - commit requested control to Firmware * @handle: acpi_handle for the target ACPI object * @flags: driver's requested control bits * * Attempt to take control from Firmware on requested control bits. 
**/ -acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) +acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags) { acpi_status status; u32 control_req, result, capbuf[3]; @@ -359,7 +359,7 @@ out: mutex_unlock(&osc_lock); return status; } -EXPORT_SYMBOL(pci_osc_control_set); +EXPORT_SYMBOL(acpi_pci_osc_control_set); static int __devinit acpi_pci_root_add(struct acpi_device *device) { diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index 1c1141801060..f47bc74be567 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -30,9 +30,8 @@ #include #include #include +#include #include -#include -#include #define MY_NAME "acpi_pcihp" @@ -408,7 +407,7 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags) acpi_get_name(handle, ACPI_FULL_PATHNAME, &string); dbg("Trying to get hotplug control for %s\n", (char *)string.pointer); - status = pci_osc_control_set(handle, flags); + status = acpi_pci_osc_control_set(handle, flags); if (ACPI_SUCCESS(status)) goto got_one; kfree(string.pointer); diff --git a/drivers/pci/pcie/aer/aerdrv_acpi.c b/drivers/pci/pcie/aer/aerdrv_acpi.c index ebce26c37049..8edb2f300e8f 100644 --- a/drivers/pci/pcie/aer/aerdrv_acpi.c +++ b/drivers/pci/pcie/aer/aerdrv_acpi.c @@ -38,7 +38,7 @@ int aer_osc_setup(struct pcie_device *pciedev) handle = acpi_find_root_bridge_handle(pdev); if (handle) { - status = pci_osc_control_set(handle, + status = acpi_pci_osc_control_set(handle, OSC_PCI_EXPRESS_AER_CONTROL | OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL); } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6fce2fc2d124..2a3b189e3e26 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -256,6 +256,40 @@ void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); void __init acpi_s4_no_nvs(void); #endif /* CONFIG_PM_SLEEP */ + +#define OSC_QUERY_TYPE 0 +#define OSC_SUPPORT_TYPE 1 +#define OSC_CONTROL_TYPE 2 +#define OSC_SUPPORT_MASKS 0x1f + +/* _OSC DW0 Definition */ +#define OSC_QUERY_ENABLE 1 +#define OSC_REQUEST_ERROR 2 +#define OSC_INVALID_UUID_ERROR 4 +#define OSC_INVALID_REVISION_ERROR 8 +#define OSC_CAPABILITIES_MASK_ERROR 16 + +/* _OSC DW1 Definition (OS Support Fields) */ +#define OSC_EXT_PCI_CONFIG_SUPPORT 1 +#define OSC_ACTIVE_STATE_PWR_SUPPORT 2 +#define OSC_CLOCK_PWR_CAPABILITY_SUPPORT 4 +#define OSC_PCI_SEGMENT_GROUPS_SUPPORT 8 +#define OSC_MSI_SUPPORT 16 + +/* _OSC DW1 Definition (OS Control Fields) */ +#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 1 +#define OSC_SHPC_NATIVE_HP_CONTROL 2 +#define OSC_PCI_EXPRESS_PME_CONTROL 4 +#define OSC_PCI_EXPRESS_AER_CONTROL 8 +#define OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL 16 + +#define OSC_CONTROL_MASKS (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | \ + OSC_SHPC_NATIVE_HP_CONTROL | \ + OSC_PCI_EXPRESS_PME_CONTROL | \ + OSC_PCI_EXPRESS_AER_CONTROL | \ + OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL) + +extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags); #else /* CONFIG_ACPI */ static inline int early_acpi_boot_init(void) diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 65cb103b21db..20480b9f10c8 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -10,46 +10,7 @@ #include -#define OSC_QUERY_TYPE 0 -#define OSC_SUPPORT_TYPE 1 -#define OSC_CONTROL_TYPE 2 -#define OSC_SUPPORT_MASKS 0x1f - -/* - * _OSC DW0 Definition - */ -#define OSC_QUERY_ENABLE 1 -#define OSC_REQUEST_ERROR 2 -#define OSC_INVALID_UUID_ERROR 4 -#define OSC_INVALID_REVISION_ERROR 8 
-#define OSC_CAPABILITIES_MASK_ERROR 16 - -/* - * _OSC DW1 Definition (OS Support Fields) - */ -#define OSC_EXT_PCI_CONFIG_SUPPORT 1 -#define OSC_ACTIVE_STATE_PWR_SUPPORT 2 -#define OSC_CLOCK_PWR_CAPABILITY_SUPPORT 4 -#define OSC_PCI_SEGMENT_GROUPS_SUPPORT 8 -#define OSC_MSI_SUPPORT 16 - -/* - * _OSC DW1 Definition (OS Control Fields) - */ -#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 1 -#define OSC_SHPC_NATIVE_HP_CONTROL 2 -#define OSC_PCI_EXPRESS_PME_CONTROL 4 -#define OSC_PCI_EXPRESS_AER_CONTROL 8 -#define OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL 16 - -#define OSC_CONTROL_MASKS (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | \ - OSC_SHPC_NATIVE_HP_CONTROL | \ - OSC_PCI_EXPRESS_PME_CONTROL | \ - OSC_PCI_EXPRESS_AER_CONTROL | \ - OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL) - #ifdef CONFIG_ACPI -extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags); static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { /* Find root host bridge */ @@ -69,12 +30,6 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) return acpi_get_pci_rootbridge_handle(seg, busnr); } #else -#if !defined(AE_ERROR) -typedef u32 acpi_status; -#define AE_ERROR (acpi_status) (0x0001) -#endif -static inline acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) -{return AE_ERROR;} static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { return NULL; } #endif -- cgit v1.2.3-71-gd317 From c48f1670f42b71f39f4a3bfba01ffb691cc9206c Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Tue, 3 Feb 2009 15:45:26 -0800 Subject: PCI: constify pci_bus_add_devices() drivers/pci/hotplug/fakephp.c:283: warning: passing argument 1 of 'pci_bus_add_devices' discards qualifiers from pointer target type Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- drivers/pci/bus.c | 2 +- include/linux/pci.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 52b54f053be0..118c77778d29 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -133,7 +133,7 @@ int pci_bus_add_child(struct pci_bus *bus) * * Call hotplug for each new devices. 
*/ -void pci_bus_add_devices(struct pci_bus *bus) +void pci_bus_add_devices(const struct pci_bus *bus) { struct pci_dev *dev; struct pci_bus *child; diff --git a/include/linux/pci.h b/include/linux/pci.h index b5d6d0e0f1cb..a1af2fe00639 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -528,7 +528,7 @@ void pcibios_update_irq(struct pci_dev *, int irq); /* Generic PCI functions used internally */ extern struct pci_bus *pci_find_bus(int domain, int busnr); -void pci_bus_add_devices(struct pci_bus *bus); +void pci_bus_add_devices(const struct pci_bus *bus); struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); static inline struct pci_bus * __devinit pci_scan_bus(int bus, struct pci_ops *ops, -- cgit v1.2.3-71-gd317 From ea7415512a07add2b09c070c9a5d1950833cf9b3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 18 Feb 2009 10:44:29 -0800 Subject: PCI: constify pci_bus_assign_resources() drivers/pci/hotplug/fakephp.c: In function 'pci_rescan_bus': drivers/pci/hotplug/fakephp.c:271: warning: passing argument 1 of 'pci_bus_assign_resources' discards qualifiers from pointer target type Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- drivers/pci/setup-bus.c | 4 ++-- include/linux/pci.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 704608945780..170a3eda9dd3 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -27,7 +27,7 @@ #include -static void pbus_assign_resources_sorted(struct pci_bus *bus) +static void pbus_assign_resources_sorted(const struct pci_bus *bus) { struct pci_dev *dev; struct resource *res; @@ -495,7 +495,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus) } EXPORT_SYMBOL(pci_bus_size_bridges); -void __ref pci_bus_assign_resources(struct pci_bus *bus) +void __ref pci_bus_assign_resources(const struct pci_bus *bus) { struct pci_bus *b; struct pci_dev *dev; diff --git a/include/linux/pci.h b/include/linux/pci.h index a1af2fe00639..7baf2a5db12a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -708,7 +708,7 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void int pci_vpd_truncate(struct pci_dev *dev, size_t size); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ -void pci_bus_assign_resources(struct pci_bus *bus); +void pci_bus_assign_resources(const struct pci_bus *bus); void pci_bus_size_bridges(struct pci_bus *bus); int pci_claim_resource(struct pci_dev *, int); void pci_assign_unassigned_resources(void); -- cgit v1.2.3-71-gd317 From 2e1ab634bf013792d8803ec57c7a428a76f50028 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 19 Mar 2009 23:49:41 -0700 Subject: rtnetlink: add new value for DHCP added routes To improve manageability, it would be good to be able to disambiguate routes added by administrator from those added by DHCP client. The only necessary kernel change is to add value to rtnetlink include file so iproute2 utility can use it. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 35a07c830f79..ba3254ecf7fb 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -217,6 +217,7 @@ enum #define RTPROT_DNROUTED 13 /* DECnet routing daemon */ #define RTPROT_XORP 14 /* XORP */ #define RTPROT_NTK 15 /* Netsukuku */ +#define RTPROT_DHCP 16 /* DHCP client */ /* rtm_scope -- cgit v1.2.3-71-gd317 From 5e140dfc1fe87eae27846f193086724806b33c7d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Mar 2009 01:33:32 -0700 Subject: net: reorder struct Qdisc for better SMP performance dev_queue_xmit() needs to dirty fields "state", "q", "bstats" and "qstats" On x86_64 arch, they currently span three cache lines, involving more cache line ping pongs than necessary, making longer holding of queue spinlock. We can reduce this to one cache line, by grouping all read-mostly fields at the beginning of structure. (Or should I say, all highly modified fields at the end :) ) Before patch : offsetof(struct Qdisc, state)=0x38 offsetof(struct Qdisc, q)=0x48 offsetof(struct Qdisc, bstats)=0x80 offsetof(struct Qdisc, qstats)=0x90 sizeof(struct Qdisc)=0xc8 After patch : offsetof(struct Qdisc, state)=0x80 offsetof(struct Qdisc, q)=0x88 offsetof(struct Qdisc, bstats)=0xa0 offsetof(struct Qdisc, qstats)=0xac sizeof(struct Qdisc)=0xc0 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/gen_stats.h | 2 +- include/net/sch_generic.h | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gen_stats.h b/include/linux/gen_stats.h index 13f4e74609ac..0ffa41df0ee8 100644 --- a/include/linux/gen_stats.h +++ b/include/linux/gen_stats.h @@ -22,7 +22,7 @@ struct gnet_stats_basic { __u64 bytes; __u32 packets; -}; +} __attribute__ ((packed)); /** * struct gnet_stats_rate_est - rate estimator diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 3d78a4d22460..964ffa0d8815 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -49,18 +49,10 @@ struct Qdisc int padded; struct Qdisc_ops *ops; struct qdisc_size_table *stab; + struct list_head list; u32 handle; u32 parent; atomic_t refcnt; - unsigned long state; - struct sk_buff *gso_skb; - struct sk_buff_head q; - struct netdev_queue *dev_queue; - struct Qdisc *next_sched; - struct list_head list; - - struct gnet_stats_basic bstats; - struct gnet_stats_queue qstats; struct gnet_stats_rate_est rate_est; int (*reshape_fail)(struct sk_buff *skb, struct Qdisc *q); @@ -71,6 +63,17 @@ struct Qdisc * and it will live until better solution will be invented. */ struct Qdisc *__parent; + struct netdev_queue *dev_queue; + struct Qdisc *next_sched; + + struct sk_buff *gso_skb; + /* + * For performance sake on SMP, we put highly modified fields at the end + */ + unsigned long state; + struct sk_buff_head q; + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; }; struct Qdisc_class_ops -- cgit v1.2.3-71-gd317 From 3a3c244c9a355105bc193fde873c73727bf87192 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 15 Feb 2009 22:32:48 +0100 Subject: PCI: PCIe portdrv: Implement pm object Implement pm object for the PCI Express port driver in order to use the new power management framework and reduce the code size. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/pciehp_core.c | 4 ++-- drivers/pci/pcie/aer/aerdrv.c | 6 ------ drivers/pci/pcie/portdrv.h | 4 ++-- drivers/pci/pcie/portdrv_core.c | 14 ++++++-------- drivers/pci/pcie/portdrv_pci.c | 31 +++++++++++++++---------------- include/linux/pcieport_if.h | 2 +- 6 files changed, 26 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 3d21bbba3308..fb254b2454de 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -475,7 +475,7 @@ static void pciehp_remove (struct pcie_device *dev) } #ifdef CONFIG_PM -static int pciehp_suspend (struct pcie_device *dev, pm_message_t state) +static int pciehp_suspend (struct pcie_device *dev) { dev_info(&dev->device, "%s ENTRY\n", __func__); return 0; @@ -503,7 +503,7 @@ static int pciehp_resume (struct pcie_device *dev) } return 0; } -#endif +#endif /* PM */ static struct pcie_port_service_driver hpdriver_portdrv = { .name = PCIE_MODULE_NAME, diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index e11c03194063..32ade5af927e 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -40,9 +40,6 @@ MODULE_LICENSE("GPL"); static int __devinit aer_probe (struct pcie_device *dev); static void aer_remove(struct pcie_device *dev); -static int aer_suspend(struct pcie_device *dev, pm_message_t state) -{return 0;} -static int aer_resume(struct pcie_device *dev) {return 0;} static pci_ers_result_t aer_error_detected(struct pci_dev *dev, enum pci_channel_state error); static void aer_error_resume(struct pci_dev *dev); @@ -61,9 +58,6 @@ static struct pcie_port_service_driver aerdriver = { .probe = aer_probe, .remove = aer_remove, - .suspend = aer_suspend, - .resume = aer_resume, - .err_handler = &aer_error_handlers, .reset_link = aer_root_reset, diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 5b818bd835ef..17ad53868f9f 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -38,8 +38,8 @@ extern struct bus_type pcie_port_bus_type; extern int pcie_port_device_probe(struct pci_dev *dev); extern int pcie_port_device_register(struct pci_dev *dev); #ifdef CONFIG_PM -extern int pcie_port_device_suspend(struct pci_dev *dev, pm_message_t state); -extern int pcie_port_device_resume(struct pci_dev *dev); +extern int pcie_port_device_suspend(struct device *dev); +extern int pcie_port_device_resume(struct device *dev); #endif extern void pcie_port_device_remove(struct pci_dev *dev); extern int __must_check pcie_port_bus_register(void); diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 569af0015fce..5a5bfe7cdf5f 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -410,13 +410,12 @@ int pcie_port_device_register(struct pci_dev *dev) static int suspend_iter(struct device *dev, void *data) { struct pcie_port_service_driver *service_driver; - pm_message_t state = * (pm_message_t *) data; if ((dev->bus == &pcie_port_bus_type) && (dev->driver)) { service_driver = to_service_driver(dev->driver); if (service_driver->suspend) - service_driver->suspend(to_pcie_device(dev), state); + service_driver->suspend(to_pcie_device(dev)); } return 0; } @@ -424,11 +423,10 @@ static int suspend_iter(struct device *dev, void *data) /** * pcie_port_device_suspend - suspend port services associated with a PCIe port * @dev: PCI Express port to handle - * @state: 
Representation of system power management transition in progress */ -int pcie_port_device_suspend(struct pci_dev *dev, pm_message_t state) +int pcie_port_device_suspend(struct device *dev) { - return device_for_each_child(&dev->dev, &state, suspend_iter); + return device_for_each_child(dev, NULL, suspend_iter); } static int resume_iter(struct device *dev, void *data) @@ -448,11 +446,11 @@ static int resume_iter(struct device *dev, void *data) * pcie_port_device_suspend - resume port services associated with a PCIe port * @dev: PCI Express port to handle */ -int pcie_port_device_resume(struct pci_dev *dev) +int pcie_port_device_resume(struct device *dev) { - return device_for_each_child(&dev->dev, NULL, resume_iter); + return device_for_each_child(dev, NULL, resume_iter); } -#endif +#endif /* PM */ static int remove_iter(struct device *dev, void *data) { diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 94d0e2af9bad..a61f4930d676 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -44,21 +44,21 @@ static int pcie_portdrv_restore_config(struct pci_dev *dev) } #ifdef CONFIG_PM -static int pcie_portdrv_suspend(struct pci_dev *dev, pm_message_t state) -{ - return pcie_port_device_suspend(dev, state); +static struct dev_pm_ops pcie_portdrv_pm_ops = { + .suspend = pcie_port_device_suspend, + .resume = pcie_port_device_resume, + .freeze = pcie_port_device_suspend, + .thaw = pcie_port_device_resume, + .poweroff = pcie_port_device_suspend, + .restore = pcie_port_device_resume, +}; -} +#define PCIE_PORTDRV_PM_OPS (&pcie_portdrv_pm_ops) -static int pcie_portdrv_resume(struct pci_dev *dev) -{ - pci_set_master(dev); - return pcie_port_device_resume(dev); -} -#else -#define pcie_portdrv_suspend NULL -#define pcie_portdrv_resume NULL -#endif +#else /* !PM */ + +#define PCIE_PORTDRV_PM_OPS NULL +#endif /* !PM */ /* * pcie_portdrv_probe - Probe PCI-Express port devices @@ -268,10 +268,9 @@ static struct pci_driver pcie_portdriver = { .probe = pcie_portdrv_probe, .remove = pcie_portdrv_remove, - .suspend = pcie_portdrv_suspend, - .resume = pcie_portdrv_resume, - .err_handler = &pcie_portdrv_err_handler, + + .driver.pm = PCIE_PORTDRV_PM_OPS, }; static int __init pcie_portdrv_init(void) diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index 5d2afcfa6bc1..b4c79545330b 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -59,7 +59,7 @@ struct pcie_port_service_driver { const char *name; int (*probe) (struct pcie_device *dev); void (*remove) (struct pcie_device *dev); - int (*suspend) (struct pcie_device *dev, pm_message_t state); + int (*suspend) (struct pcie_device *dev); int (*resume) (struct pcie_device *dev); /* Service Error Recovery Handler */ -- cgit v1.2.3-71-gd317 From 0747aaf42d78d26684c6f6b34a4103ff81f571f8 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:11:56 +0900 Subject: PCI/ACPI: fix wrong assumption in acpi_pci_get_bridge_handle Current acpi_pci_get_bridge_handle() has an assumption that pci_bus->self is NULL on the root pci bus. But it might not true on some platforms. Because of this wrong assumption, current acpi_pci_get_bridge_handle() might return improper ACPI handle. We must check pci_bus->parent instead. This bug is the root cause of the following kernel panic reported by James Bottomley. This problem was introduced by the commit e8c331e963c58b83db24b7d0e39e8c07f687dbc6. 
The immediate cause was acpi_pci_get_bridge_handle() returned NULL unexpectedly and it was passed as the second argument of acpi_walk_namespace(). pci_hotplug: PCI Hot Plug PCI Core version: 0.5 acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5 BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] acpi_ns_get_next_node+0xb/0x3c PGD 0 Oops: 0000 [#1] SMP last sysfs file: CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.28 #1 RIP: 0010:[] [] acpi_ns_get_next_node+0xb/0x3c RSP: 0018:ffff88007f87fd30 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffffffff8037d260 R09: ffff88007f87fdfc R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffffffff80742040(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000000000000010 CR3: 0000000000201000 CR4: 00000000000006a0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff88007f87e000, task ffff88007f875040) Stack: 0000000000000000 ffffffff803964f5 ffff88007f81b728 0000000000001001 ffff88007f87fdfc ffffffff8037d260 0000000600000001 0000000000000000 ffffffff8037d260 0000000000000000 0000000000000001 ffff88007f87fdfc Call Trace: [] acpi_ns_walk_namespace+0x55/0x138 [] is_pci_dock_device+0x0/0x20 [] is_pci_dock_device+0x0/0x20 [] acpi_walk_namespace+0x5f/0x83 [] detect_ejectable_slots+0x53/0x70 [] add_bridge+0xe8/0x200 [] acpi_walk_namespace+0x6b/0x83 [] acpi_pci_register_driver+0x48/0x61 [] acpiphp_init+0x0/0x58 [] acpiphp_glue_init+0x4c/0x5a [] acpiphp_init+0x37/0x58 [] _stext+0x3b/0x180 [] create_proc_entry+0x58/0xa0 [] register_irq_proc+0xc1/0xe0 [] kernel_init+0x152/0x1ac [] finish_task_switch+0x0/0x110 [] child_rip+0xa/0x20 [] restore_args+0x0/0x30 [] kernel_init+0x0/0x1ac [] child_rip+0x0/0x20 Code: 89 c2 48 8b 00 48 85 c0 75 f5 48 8b 45 00 48 89 02 44 88 65 09 48 89 5d 00 31 c0 5b 5d 41 5c c3 53 48 85 d2 89 fb 48 89 d7 75 06 <48> 8b 56 10 eb 08 e8 73 f1 ff ff 48 89 c2 85 db 74 1a eb 13 0f RIP [] acpi_ns_get_next_node+0xb/0x3c RSP CR2: 0000000000000010 ---[ end trace a7919e7f17c0a725 ]--- Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- include/linux/pci-acpi.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 20480b9f10c8..3cee2367459f 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -23,11 +23,10 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) { - int seg = pci_domain_nr(pbus), busnr = pbus->number; - struct pci_dev *bridge = pbus->self; - if (bridge) - return DEVICE_ACPI_HANDLE(&(bridge->dev)); - return acpi_get_pci_rootbridge_handle(seg, busnr); + if (pbus->parent) + return DEVICE_ACPI_HANDLE(&(pbus->self->dev)); + return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), + pbus->number); } #else static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) -- cgit v1.2.3-71-gd317 From d18690af626b83fef1d1953b9f70e09497060586 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 17 Feb 2009 14:12:36 +0900 Subject: PCI/ACPI: fix wrong 
assumption in acpi_find_root_bridge_handle Current acpi_find_root_bridge_handle() has a assumption that pci_bus->self is NULL on the root pci bus. But it might not be true on some platforms. Because of this wrong assumption, current acpi_find_root_bridge_handle() might cause endless loop. We must check pci_bus->parent instead. Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes --- include/linux/pci-acpi.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 3cee2367459f..092e82e0048c 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -13,12 +13,12 @@ #ifdef CONFIG_ACPI static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { - /* Find root host bridge */ - while (pdev->bus->self) - pdev = pdev->bus->self; - - return acpi_get_pci_rootbridge_handle(pci_domain_nr(pdev->bus), - pdev->bus->number); + struct pci_bus *pbus = pdev->bus; + /* Find a PCI root bus */ + while (pbus->parent) + pbus = pbus->parent; + return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), + pbus->number); } static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) -- cgit v1.2.3-71-gd317 From 998dd7c719f62dcfa91d7bf7f4eb9c160e03d817 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 25 Feb 2009 13:15:52 +0800 Subject: PCI: fix incorrect mask of PM No_Soft_Reset bit Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- include/linux/pci_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index 027815b4635e..b647a4df59fc 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -235,7 +235,7 @@ #define PCI_PM_CAP_PME_SHIFT 11 /* Start of the PME Mask in PMC */ #define PCI_PM_CTRL 4 /* PM control and status register */ #define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */ -#define PCI_PM_CTRL_NO_SOFT_RESET 0x0004 /* No reset for D3hot->D0 */ +#define PCI_PM_CTRL_NO_SOFT_RESET 0x0008 /* No reset for D3hot->D0 */ #define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */ #define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */ #define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) */ -- cgit v1.2.3-71-gd317 From 24d27553390c69d11cdbd930d635193956fc295f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:06 -0400 Subject: PCI MSI: Replace 'type' with 'is_msix' By changing from a 5-bit field to a 1-bit field, we free up some bits that can be used by a later patch. Also rearrange the fields for better packing on 64-bit platforms (reducing the size of msi_desc from 72 bytes to 64 bytes). 
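
As a rough illustration of the packing effect described in the message above, here is a small stand-alone C sketch. The two structures are simplified stand-ins for the before/after msi_attrib layouts shown in the diff further down, not the real msi_desc, and the exact sizes depend on the ABI.

#include <stdio.h>

/* Simplified "before": a 5-bit type field, with a 32-bit member sitting
 * between the 8-bit and 16-bit members, which forces extra padding. */
struct attrib_before {
        unsigned char  type : 5;
        unsigned char  maskbit : 1;
        unsigned char  masked : 1;
        unsigned char  is_64 : 1;
        unsigned char  pos;
        unsigned int   maskbits_mask;
        unsigned short entry_nr;
        unsigned int   default_irq;
};

/* Simplified "after": a 1-bit is_msix flag, and entry_nr moved ahead of
 * maskbits_mask so same-sized members pack together. */
struct attrib_after {
        unsigned char  is_msix : 1;
        unsigned char  maskbit : 1;
        unsigned char  masked : 1;
        unsigned char  is_64 : 1;
        unsigned char  pos;
        unsigned short entry_nr;
        unsigned int   maskbits_mask;
        unsigned int   default_irq;
};

int main(void)
{
        printf("before: %zu bytes, after: %zu bytes\n",
               sizeof(struct attrib_before), sizeof(struct attrib_after));
        return 0;
}

On a typical LP64 ABI the first layout needs two runs of padding while the second needs none; the same mechanism is what shrinks the full msi_desc from 72 to 64 bytes in the patch below.
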
Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 115 ++++++++++++++++++---------------------------------- include/linux/msi.h | 4 +- 2 files changed, 41 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index dceea56f7342..b3db4388f974 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -111,20 +111,10 @@ static void msix_flush_writes(struct irq_desc *desc) entry = get_irq_desc_msi(desc); BUG_ON(!entry || !entry->dev); - switch (entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - /* nothing to do */ - break; - case PCI_CAP_ID_MSIX: - { + if (entry->msi_attrib.is_msix) { int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; readl(entry->mask_base + offset); - break; - } - default: - BUG(); - break; } } @@ -143,32 +133,23 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) entry = get_irq_desc_msi(desc); BUG_ON(!entry || !entry->dev); - switch (entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - if (entry->msi_attrib.maskbit) { - int pos; - u32 mask_bits; - - pos = (long)entry->mask_base; - pci_read_config_dword(entry->dev, pos, &mask_bits); - mask_bits &= ~(mask); - mask_bits |= flag & mask; - pci_write_config_dword(entry->dev, pos, mask_bits); - } else { - return 0; - } - break; - case PCI_CAP_ID_MSIX: - { + if (entry->msi_attrib.is_msix) { int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; writel(flag, entry->mask_base + offset); readl(entry->mask_base + offset); - break; - } - default: - BUG(); - break; + } else { + int pos; + u32 mask_bits; + + if (!entry->msi_attrib.maskbit) + return 0; + + pos = (long)entry->mask_base; + pci_read_config_dword(entry->dev, pos, &mask_bits); + mask_bits &= ~mask; + mask_bits |= flag & mask; + pci_write_config_dword(entry->dev, pos, mask_bits); } entry->msi_attrib.masked = !!flag; return 1; @@ -177,9 +158,14 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) { struct msi_desc *entry = get_irq_desc_msi(desc); - switch(entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - { + if (entry->msi_attrib.is_msix) { + void __iomem *base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); + } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; u16 data; @@ -195,21 +181,6 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) pci_read_config_word(dev, msi_data_reg(pos, 0), &data); } msg->data = data; - break; - } - case PCI_CAP_ID_MSIX: - { - void __iomem *base; - base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; - - msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); - break; - } - default: - BUG(); } } @@ -223,9 +194,17 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg) void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) { struct msi_desc *entry = get_irq_desc_msi(desc); - switch (entry->msi_attrib.type) { - case PCI_CAP_ID_MSI: - { + if (entry->msi_attrib.is_msix) { + void __iomem *base; + base = entry->mask_base + + 
entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + writel(msg->address_lo, + base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + writel(msg->address_hi, + base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); + } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; @@ -240,23 +219,6 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) pci_write_config_word(dev, msi_data_reg(pos, 0), msg->data); } - break; - } - case PCI_CAP_ID_MSIX: - { - void __iomem *base; - base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; - - writel(msg->address_lo, - base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - writel(msg->address_hi, - base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); - break; - } - default: - BUG(); } entry->msg = *msg; } @@ -393,7 +355,7 @@ static int msi_capability_init(struct pci_dev *dev) if (!entry) return -ENOMEM; - entry->msi_attrib.type = PCI_CAP_ID_MSI; + entry->msi_attrib.is_msix = 0; entry->msi_attrib.is_64 = is_64bit_address(control); entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = is_mask_bit_support(control); @@ -475,7 +437,7 @@ static int msix_capability_init(struct pci_dev *dev, break; j = entries[i].entry; - entry->msi_attrib.type = PCI_CAP_ID_MSIX; + entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = j; entry->msi_attrib.maskbit = 1; @@ -619,12 +581,13 @@ void pci_msi_shutdown(struct pci_dev* dev) struct irq_desc *desc = irq_to_desc(dev->irq); msi_set_mask_bits(desc, mask, ~mask); } - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) + if (!entry->dev || entry->msi_attrib.is_msix) return; /* Restore dev->irq to its default pin-assertion irq */ dev->irq = entry->msi_attrib.default_irq; } + void pci_disable_msi(struct pci_dev* dev) { struct msi_desc *entry; @@ -635,7 +598,7 @@ void pci_disable_msi(struct pci_dev* dev) pci_msi_shutdown(dev); entry = list_entry(dev->msi_list.next, struct msi_desc, list); - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) + if (!entry->dev || entry->msi_attrib.is_msix) return; msi_free_irqs(dev); @@ -654,7 +617,7 @@ static int msi_free_irqs(struct pci_dev* dev) arch_teardown_msi_irqs(dev); list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) { - if (entry->msi_attrib.type == PCI_CAP_ID_MSIX) { + if (entry->msi_attrib.is_msix) { writel(1, entry->mask_base + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); diff --git a/include/linux/msi.h b/include/linux/msi.h index d2b8a1e8ca11..9c5ce214fbf4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -20,13 +20,13 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); struct msi_desc { struct { - __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */ + __u8 is_msix : 1; __u8 maskbit : 1; /* mask-pending bit supported ? */ __u8 masked : 1; __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ - __u32 maskbits_mask; /* mask bits mask */ __u16 entry_nr; /* specific enabled entry */ + __u32 maskbits_mask; /* mask bits mask */ unsigned default_irq; /* default pre-assigned irq */ }msi_attrib; -- cgit v1.2.3-71-gd317 From 264d9caaa1c574c0274b019a810abfe957391005 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:08 -0400 Subject: PCI MSI: Use mask_pos instead of mask_base when appropriate MSI interrupts have a mask_pos where MSI-X have a mask_base. 
Use a transparent union to get rid of some ugly casts. Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 5 ++--- include/linux/msi.h | 5 ++++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index a658c0f34e16..fcde04df6dfe 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -145,7 +145,7 @@ static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) if (!entry->msi_attrib.maskbit) return 0; - pos = (long)entry->mask_base; + pos = entry->mask_pos; pci_read_config_dword(entry->dev, pos, &mask_bits); mask_bits &= ~mask; mask_bits |= flag & mask; @@ -363,8 +363,7 @@ static int msi_capability_init(struct pci_dev *dev) unsigned int base, maskbits, temp; base = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); - entry->mask_base = (void __iomem *)(long)base; - + entry->mask_pos = base; /* All MSIs are unmasked by default, Mask them all */ pci_read_config_dword(dev, base, &maskbits); temp = msi_mask((control & PCI_MSI_FLAGS_QMASK) >> 1); diff --git a/include/linux/msi.h b/include/linux/msi.h index 9c5ce214fbf4..5025ca4d91e4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -33,7 +33,10 @@ struct msi_desc { unsigned int irq; struct list_head list; - void __iomem *mask_base; + union { + void __iomem *mask_base; + u8 mask_pos; + }; struct pci_dev *dev; /* Last set MSI message */ -- cgit v1.2.3-71-gd317 From f2440d9acbe866b917b16cc0f927366341ce9215 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:09 -0400 Subject: PCI MSI: Refactor interrupt masking code Since most of the callers already know whether they have an MSI or an MSI-X capability, split msi_set_mask_bits() into msi_mask_irq() and msix_mask_irq(). The only callers which don't (mask_msi_irq() and unmask_msi_irq()) can share code in msi_set_mask_bit(). This then becomes the only caller of msix_flush_writes(), so we can inline it. The flushing read can be to any address that belongs to the device, so we can eliminate the calculation too. We can also get rid of maskbits_mask from struct msi_desc and simply recalculate it on the rare occasion that we need it. The single-bit 'masked' element is replaced by a copy of the 32-bit 'masked' register, so this patch does not affect the size of msi_desc. 
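
The mask arithmetic this refactor leans on is compact enough to try in isolation. The user-space sketch below mirrors the msi_mask()/msi_capable_mask() helpers visible in the hunk that follows; the sample control-word values are made up for illustration and do not come from real hardware.

#include <stdio.h>
#include <stdint.h>

/* One mask bit per supported vector; x is the log2-encoded vector count
 * taken from the MSI control word. */
static uint32_t msi_mask(unsigned int x)
{
        return (1u << (1u << x)) - 1;
}

/* Multiple Message Capable lives in bits 3:1 of the control word. */
static uint32_t msi_capable_mask(uint16_t control)
{
        return msi_mask((control >> 1) & 7);
}

int main(void)
{
        /* Hypothetical control words advertising 1, 4 and 8 vectors. */
        const uint16_t samples[] = { 0x0000, 0x0004, 0x0006 };
        unsigned int i;

        for (i = 0; i < 3; i++)
                printf("control=0x%04x -> capable mask=0x%02x\n",
                       samples[i], msi_capable_mask(samples[i]));
        return 0;
}

With those helpers, msi_mask_irq() only has to clear the requested bits, OR in the new flags and write back the cached desc->masked value, while msix_mask_irq() toggles a single per-entry control bit, which is exactly the split the diff below introduces.
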
Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 155 +++++++++++++++++++++++++--------------------------- include/linux/msi.h | 5 +- 2 files changed, 77 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index fcde04df6dfe..adcc78242571 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -105,17 +105,14 @@ static inline __attribute_const__ u32 msi_mask(unsigned x) return (1 << (1 << x)) - 1; } -static void msix_flush_writes(struct irq_desc *desc) +static inline __attribute_const__ u32 msi_capable_mask(u16 control) { - struct msi_desc *entry; + return msi_mask((control >> 1) & 7); +} - entry = get_irq_desc_msi(desc); - BUG_ON(!entry); - if (entry->msi_attrib.is_msix) { - int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; - readl(entry->mask_base + offset); - } +static inline __attribute_const__ u32 msi_enabled_mask(u16 control) +{ + return msi_mask((control >> 4) & 7); } /* @@ -127,32 +124,57 @@ static void msix_flush_writes(struct irq_desc *desc) * Returns 1 if it succeeded in masking the interrupt and 0 if the device * doesn't support MSI masking. */ -static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) +static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - struct msi_desc *entry; + u32 mask_bits = desc->masked; - entry = get_irq_desc_msi(desc); - BUG_ON(!entry); - if (entry->msi_attrib.is_msix) { - int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; - writel(flag, entry->mask_base + offset); - readl(entry->mask_base + offset); - } else { - int pos; - u32 mask_bits; + if (!desc->msi_attrib.maskbit) + return; + + mask_bits &= ~mask; + mask_bits |= flag; + pci_write_config_dword(desc->dev, desc->mask_pos, mask_bits); + desc->masked = mask_bits; +} + +/* + * This internal function does not flush PCI writes to the device. + * All users must ensure that they read from the device before either + * assuming that the device state is up to date, or returning out of this + * file. This saves a few milliseconds when initialising devices with lots + * of MSI-X interrupts. 
+ */ +static void msix_mask_irq(struct msi_desc *desc, u32 flag) +{ + u32 mask_bits = desc->masked; + unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; + mask_bits &= ~1; + mask_bits |= flag; + writel(mask_bits, desc->mask_base + offset); + desc->masked = mask_bits; +} - if (!entry->msi_attrib.maskbit) - return 0; +static void msi_set_mask_bit(unsigned irq, u32 flag) +{ + struct msi_desc *desc = get_irq_msi(irq); - pos = entry->mask_pos; - pci_read_config_dword(entry->dev, pos, &mask_bits); - mask_bits &= ~mask; - mask_bits |= flag & mask; - pci_write_config_dword(entry->dev, pos, mask_bits); + if (desc->msi_attrib.is_msix) { + msix_mask_irq(desc, flag); + readl(desc->mask_base); /* Flush write to device */ + } else { + msi_mask_irq(desc, 1, flag); } - entry->msi_attrib.masked = !!flag; - return 1; +} + +void mask_msi_irq(unsigned int irq) +{ + msi_set_mask_bit(irq, 1); +} + +void unmask_msi_irq(unsigned int irq) +{ + msi_set_mask_bit(irq, 0); } void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) @@ -230,22 +252,6 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg) write_msi_msg_desc(desc, msg); } -void mask_msi_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - msi_set_mask_bits(desc, 1, 1); - msix_flush_writes(desc); -} - -void unmask_msi_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - msi_set_mask_bits(desc, 1, 0); - msix_flush_writes(desc); -} - static int msi_free_irqs(struct pci_dev* dev); static struct msi_desc *alloc_msi_entry(struct pci_dev *dev) @@ -281,13 +287,9 @@ static void __pci_restore_msi_state(struct pci_dev *dev) pci_intx_for_msi(dev, 0); msi_set_enable(dev, 0); write_msi_msg(dev->irq, &entry->msg); - if (entry->msi_attrib.maskbit) { - struct irq_desc *desc = irq_to_desc(dev->irq); - msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask, - entry->msi_attrib.masked); - } pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); + msi_mask_irq(entry, msi_capable_mask(control), entry->masked); control &= ~PCI_MSI_FLAGS_QSIZE; control |= PCI_MSI_FLAGS_ENABLE; pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); @@ -307,9 +309,8 @@ static void __pci_restore_msix_state(struct pci_dev *dev) msix_set_enable(dev, 0); list_for_each_entry(entry, &dev->msi_list, list) { - struct irq_desc *desc = irq_to_desc(entry->irq); write_msi_msg(entry->irq, &entry->msg); - msi_set_mask_bits(desc, 1, entry->msi_attrib.masked); + msix_mask_irq(entry, entry->masked); } BUG_ON(list_empty(&dev->msi_list)); @@ -342,6 +343,7 @@ static int msi_capability_init(struct pci_dev *dev) struct msi_desc *entry; int pos, ret; u16 control; + unsigned mask; msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ @@ -356,21 +358,16 @@ static int msi_capability_init(struct pci_dev *dev) entry->msi_attrib.is_64 = is_64bit_address(control); entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = is_mask_bit_support(control); - entry->msi_attrib.masked = 1; entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ entry->msi_attrib.pos = pos; - if (entry->msi_attrib.maskbit) { - unsigned int base, maskbits, temp; - - base = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); - entry->mask_pos = base; - /* All MSIs are unmasked by default, Mask them all */ - pci_read_config_dword(dev, base, &maskbits); - temp = msi_mask((control & PCI_MSI_FLAGS_QMASK) >> 1); - maskbits |= temp; - pci_write_config_dword(dev, base, maskbits); - entry->msi_attrib.maskbits_mask = temp; - } 
+ + entry->mask_pos = msi_mask_bits_reg(pos, entry->msi_attrib.is_64); + /* All MSIs are unmasked by default, Mask them all */ + if (entry->msi_attrib.maskbit) + pci_read_config_dword(dev, entry->mask_pos, &entry->masked); + mask = msi_capable_mask(control); + msi_mask_irq(entry, mask, mask); + list_add_tail(&entry->list, &dev->msi_list); /* Configure MSI capability structure */ @@ -435,11 +432,12 @@ static int msix_capability_init(struct pci_dev *dev, entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = j; - entry->msi_attrib.maskbit = 1; - entry->msi_attrib.masked = 1; entry->msi_attrib.default_irq = dev->irq; entry->msi_attrib.pos = pos; entry->mask_base = base; + entry->masked = readl(base + j * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + msix_mask_irq(entry, 1); list_add_tail(&entry->list, &dev->msi_list); } @@ -556,9 +554,11 @@ int pci_enable_msi(struct pci_dev* dev) } EXPORT_SYMBOL(pci_enable_msi); -void pci_msi_shutdown(struct pci_dev* dev) +void pci_msi_shutdown(struct pci_dev *dev) { - struct msi_desc *entry; + struct msi_desc *desc; + u32 mask; + u16 ctrl; if (!pci_msi_enable || !dev || !dev->msi_enabled) return; @@ -568,18 +568,13 @@ void pci_msi_shutdown(struct pci_dev* dev) dev->msi_enabled = 0; BUG_ON(list_empty(&dev->msi_list)); - entry = list_entry(dev->msi_list.next, struct msi_desc, list); - /* Return the the pci reset with msi irqs unmasked */ - if (entry->msi_attrib.maskbit) { - u32 mask = entry->msi_attrib.maskbits_mask; - struct irq_desc *desc = irq_to_desc(dev->irq); - msi_set_mask_bits(desc, mask, ~mask); - } - if (entry->msi_attrib.is_msix) - return; + desc = list_first_entry(&dev->msi_list, struct msi_desc, list); + pci_read_config_word(dev, desc->msi_attrib.pos + PCI_MSI_FLAGS, &ctrl); + mask = msi_capable_mask(ctrl); + msi_mask_irq(desc, mask, ~mask); /* Restore dev->irq to its default pin-assertion irq */ - dev->irq = entry->msi_attrib.default_irq; + dev->irq = desc->msi_attrib.default_irq; } void pci_disable_msi(struct pci_dev* dev) diff --git a/include/linux/msi.h b/include/linux/msi.h index 5025ca4d91e4..37c1bbe546e5 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -22,14 +22,13 @@ struct msi_desc { struct { __u8 is_msix : 1; __u8 maskbit : 1; /* mask-pending bit supported ? */ - __u8 masked : 1; __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ __u16 entry_nr; /* specific enabled entry */ - __u32 maskbits_mask; /* mask bits mask */ unsigned default_irq; /* default pre-assigned irq */ - }msi_attrib; + } msi_attrib; + u32 masked; /* mask bits */ unsigned int irq; struct list_head list; -- cgit v1.2.3-71-gd317 From 1c8d7b0a562da06d3ebe83f01b1ed553205d1ae4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:10 -0400 Subject: PCI MSI: Add support for multiple MSI Add the new API pci_enable_msi_block() to allow drivers to request multiple MSI and reimplement pci_enable_msi in terms of pci_enable_msi_block. Ensure that the architecture back ends don't have to know about multiple MSI. 
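
A hedged sketch of how a driver could consume the new call, following the return-value contract spelled out in the MSI-HOWTO hunk below; the function, device and vector-count names here are hypothetical and not part of the patch.

#include <linux/pci.h>

/* Hypothetical probe-path helper: ask for up to 16 vectors and settle for
 * whatever pci_enable_msi_block() says is achievable. Returns the number
 * of vectors enabled, or a negative errno. */
static int foo_enable_vectors(struct pci_dev *pdev)
{
        int nvec = 16;
        int rc;

        for (;;) {
                rc = pci_enable_msi_block(pdev, nvec);
                if (rc == 0)
                        return nvec;    /* dev->irq .. dev->irq + nvec - 1 */
                if (rc < 0)
                        return rc;      /* hard failure: fall back to INTx */
                nvec = rc;              /* positive: retry with fewer vectors */
        }
}

The driver would then request_irq() each of pdev->irq + 0 .. pdev->irq + nvec - 1 before letting the device raise interrupts.
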
Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- Documentation/PCI/MSI-HOWTO.txt | 45 +++++++++++++++++--- arch/powerpc/kernel/msi.c | 4 ++ arch/x86/kernel/io_apic.c | 4 ++ drivers/pci/msi.c | 91 +++++++++++++++++++++++++++++------------ drivers/pci/msi.h | 6 --- include/linux/msi.h | 1 + include/linux/pci.h | 6 ++- 7 files changed, 116 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1c02431f1d1a..9494f6dc38eb 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -94,15 +94,48 @@ This function should be called before the driver calls request_irq() since enabling MSIs disables the pin-based IRQ and the driver will not receive interrupts on the old interrupt. -4.2.2 pci_disable_msi +4.2.2 pci_enable_msi_block + +int pci_enable_msi_block(struct pci_dev *dev, int count) + +This variation on the above call allows a device driver to request multiple +MSIs. The MSI specification only allows interrupts to be allocated in +powers of two, up to a maximum of 2^5 (32). + +If this function returns 0, it has succeeded in allocating at least as many +interrupts as the driver requested (it may have allocated more in order +to satisfy the power-of-two requirement). In this case, the function +enables MSI on this device and updates dev->irq to be the lowest of +the new interrupts assigned to it. The other interrupts assigned to +the device are in the range dev->irq to dev->irq + count - 1. + +If this function returns a negative number, it indicates an error and +the driver should not attempt to request any more MSI interrupts for +this device. If this function returns a positive number, it will be +less than 'count' and indicate the number of interrupts that could have +been allocated. In neither case will the irq value have been +updated, nor will the device have been switched into MSI mode. + +The device driver must decide what action to take if +pci_enable_msi_block() returns a value less than the number asked for. +Some devices can make use of fewer interrupts than the maximum they +request; in this case the driver should call pci_enable_msi_block() +again. Note that it is not guaranteed to succeed, even when the +'count' has been reduced to the value returned from a previous call to +pci_enable_msi_block(). This is because there are multiple constraints +on the number of vectors that can be allocated; pci_enable_msi_block() +will return as soon as it finds any constraint that doesn't allow the +call to succeed. + +4.2.3 pci_disable_msi void pci_disable_msi(struct pci_dev *dev) -This function should be used to undo the effect of pci_enable_msi(). -Calling it restores dev->irq to the pin-based interrupt number and frees -the previously allocated message signaled interrupt(s). The interrupt -may subsequently be assigned to another device, so drivers should not -cache the value of dev->irq. +This function should be used to undo the effect of pci_enable_msi() or +pci_enable_msi_block(). Calling it restores dev->irq to the pin-based +interrupt number and frees the previously allocated message signaled +interrupt(s). The interrupt may subsequently be assigned to another +device, so drivers should not cache the value of dev->irq. A device driver must always call free_irq() on the interrupt(s) for which it has called request_irq() before calling this function. 
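
For symmetry with the allocation sketch above, here is a teardown sketch that follows the free_irq()-before-pci_disable_msi() rule just stated in the HOWTO text; the names are again hypothetical, and nvec is whatever the allocation helper returned.

#include <linux/interrupt.h>
#include <linux/pci.h>

/* Hypothetical remove-path helper: release every requested vector before
 * switching the device back to its pin-based interrupt. */
static void foo_disable_vectors(struct pci_dev *pdev, int nvec, void *drvdata)
{
        int i;

        for (i = 0; i < nvec; i++)
                free_irq(pdev->irq + i, drvdata);

        /* Restores the pin-based dev->irq; never cache the old value. */
        pci_disable_msi(pdev);
}
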
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c index 3bb7d3dd28be..0c16e2a854e5 100644 --- a/arch/powerpc/kernel/msi.c +++ b/arch/powerpc/kernel/msi.c @@ -19,6 +19,10 @@ int arch_msi_check_device(struct pci_dev* dev, int nvec, int type) return -ENOSYS; } + /* PowerPC doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + if (ppc_md.msi_check_device) { pr_debug("msi: Using platform check routine.\n"); return ppc_md.msi_check_device(dev, nvec, type); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bc7ac4da90d7..a09549a6321b 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3510,6 +3510,10 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int index = 0; #endif + /* x86 doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index adcc78242571..6f2e6295e773 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -40,6 +40,13 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) struct msi_desc *entry; int ret; + /* + * If an architecture wants to support multiple MSI, it needs to + * override arch_setup_msi_irqs() + */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + list_for_each_entry(entry, &dev->msi_list, list) { ret = arch_setup_msi_irq(dev, entry); if (ret < 0) @@ -58,8 +65,12 @@ void arch_teardown_msi_irqs(struct pci_dev *dev) struct msi_desc *entry; list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq != 0) - arch_teardown_msi_irq(entry->irq); + int i, nvec; + if (entry->irq == 0) + continue; + nvec = 1 << entry->msi_attrib.multiple; + for (i = 0; i < nvec; i++) + arch_teardown_msi_irq(entry->irq + i); } } #endif @@ -163,7 +174,8 @@ static void msi_set_mask_bit(unsigned irq, u32 flag) msix_mask_irq(desc, flag); readl(desc->mask_base); /* Flush write to device */ } else { - msi_mask_irq(desc, 1, flag); + unsigned offset = irq - desc->dev->irq; + msi_mask_irq(desc, 1 << offset, flag << offset); } } @@ -229,6 +241,12 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; + u16 msgctl; + + pci_read_config_word(dev, msi_control_reg(pos), &msgctl); + msgctl &= ~PCI_MSI_FLAGS_QSIZE; + msgctl |= entry->msi_attrib.multiple << 4; + pci_write_config_word(dev, msi_control_reg(pos), msgctl); pci_write_config_dword(dev, msi_lower_address_reg(pos), msg->address_lo); @@ -291,7 +309,7 @@ static void __pci_restore_msi_state(struct pci_dev *dev) pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); msi_mask_irq(entry, msi_capable_mask(control), entry->masked); control &= ~PCI_MSI_FLAGS_QSIZE; - control |= PCI_MSI_FLAGS_ENABLE; + control |= (entry->msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE; pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); } @@ -332,13 +350,15 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state); /** * msi_capability_init - configure device's MSI capability structure * @dev: pointer to the pci_dev data structure of MSI device function + * @nvec: number of interrupts to allocate * - * Setup the MSI capability structure of device function with a single - * MSI irq, regardless of device function is capable of handling - * multiple messages. A return of zero indicates the successful setup - * of an entry zero with the new MSI irq or non-zero for otherwise. 
- **/ -static int msi_capability_init(struct pci_dev *dev) + * Setup the MSI capability structure of the device with the requested + * number of interrupts. A return value of zero indicates the successful + * setup of an entry with the new MSI irq. A negative return value indicates + * an error, and a positive return value indicates the number of interrupts + * which could have been allocated. + */ +static int msi_capability_init(struct pci_dev *dev, int nvec) { struct msi_desc *entry; int pos, ret; @@ -371,7 +391,7 @@ static int msi_capability_init(struct pci_dev *dev) list_add_tail(&entry->list, &dev->msi_list); /* Configure MSI capability structure */ - ret = arch_setup_msi_irqs(dev, 1, PCI_CAP_ID_MSI); + ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); if (ret) { msi_free_irqs(dev); return ret; @@ -524,35 +544,48 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type) } /** - * pci_enable_msi - configure device's MSI capability structure - * @dev: pointer to the pci_dev data structure of MSI device function + * pci_enable_msi_block - configure device's MSI capability structure + * @dev: device to configure + * @nvec: number of interrupts to configure * - * Setup the MSI capability structure of device function with - * a single MSI irq upon its software driver call to request for - * MSI mode enabled on its hardware device function. A return of zero - * indicates the successful setup of an entry zero with the new MSI - * irq or non-zero for otherwise. - **/ -int pci_enable_msi(struct pci_dev* dev) + * Allocate IRQs for a device with the MSI capability. + * This function returns a negative errno if an error occurs. If it + * is unable to allocate the number of interrupts requested, it returns + * the number of interrupts it might be able to allocate. If it successfully + * allocates at least the number of interrupts requested, it returns 0 and + * updates the @dev's irq member to the lowest new interrupt number; the + * other interrupt numbers allocated to this device are consecutive. 
+ */ +int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec) { - int status; + int status, pos, maxvec; + u16 msgctl; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!pos) + return -EINVAL; + pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); + maxvec = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1); + if (nvec > maxvec) + return maxvec; - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI); + status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI); if (status) return status; WARN_ON(!!dev->msi_enabled); - /* Check whether driver already requested for MSI-X irqs */ + /* Check whether driver already requested MSI-X irqs */ if (dev->msix_enabled) { dev_info(&dev->dev, "can't enable MSI " "(MSI-X already enabled)\n"); return -EINVAL; } - status = msi_capability_init(dev); + + status = msi_capability_init(dev, nvec); return status; } -EXPORT_SYMBOL(pci_enable_msi); +EXPORT_SYMBOL(pci_enable_msi_block); void pci_msi_shutdown(struct pci_dev *dev) { @@ -599,8 +632,12 @@ static int msi_free_irqs(struct pci_dev* dev) struct msi_desc *entry, *tmp; list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq) - BUG_ON(irq_has_action(entry->irq)); + int i, nvec; + if (!entry->irq) + continue; + nvec = 1 << entry->msi_attrib.multiple; + for (i = 0; i < nvec; i++) + BUG_ON(irq_has_action(entry->irq + i)); } arch_teardown_msi_irqs(dev); diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h index 3898f5237144..71f4df2ef654 100644 --- a/drivers/pci/msi.h +++ b/drivers/pci/msi.h @@ -20,14 +20,8 @@ #define msi_mask_bits_reg(base, is64bit) \ ( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4) #define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE -#define multi_msi_capable(control) \ - (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1)) -#define multi_msi_enable(control, num) \ - control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE); #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) -#define msi_enable(control, num) multi_msi_enable(control, num); \ - control |= PCI_MSI_FLAGS_ENABLE #define msix_table_offset_reg(base) (base + 0x04) #define msix_pba_offset_reg(base) (base + 0x08) diff --git a/include/linux/msi.h b/include/linux/msi.h index 37c1bbe546e5..6991ab5b24d1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -21,6 +21,7 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); struct msi_desc { struct { __u8 is_msix : 1; + __u8 multiple: 3; /* log2 number of messages */ __u8 maskbit : 1; /* mask-pending bit supported ? 
*/ __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 7baf2a5db12a..1f6c5ddaae36 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -789,7 +789,7 @@ struct msix_entry { #ifndef CONFIG_PCI_MSI -static inline int pci_enable_msi(struct pci_dev *dev) +static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec) { return -1; } @@ -824,7 +824,7 @@ static inline int pci_msi_enabled(void) return 0; } #else -extern int pci_enable_msi(struct pci_dev *dev); +extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec); extern void pci_msi_shutdown(struct pci_dev *dev); extern void pci_disable_msi(struct pci_dev *dev); extern int pci_msix_table_size(struct pci_dev *dev); @@ -846,6 +846,8 @@ static inline int pcie_aspm_enabled(void) extern int pcie_aspm_enabled(void); #endif +#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1) + #ifdef CONFIG_HT_IRQ /* The functions a driver should call */ int ht_create_irq(struct pci_dev *dev, int idx); -- cgit v1.2.3-71-gd317 From 8293b0f629095efbe7c7e3f9b437f8c040c19eb5 Mon Sep 17 00:00:00 2001 From: David O'Shea Date: Mon, 2 Mar 2009 09:51:13 +0100 Subject: PCI: Compaq Evo D510 SMBus quirk using USB instead of VGA On the Compaq Evo D510 SFF/CMT, a PCI quirk activated the SMBus device based on detection of the on-board VGA controller, but the on-board VGA is disabled if an AGP card is inserted, so look for one of the USB controllers instead. Signed-off-by: David O'Shea Signed-off-by: Jean Delvare Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 9 +++++++-- include/linux/pci_ids.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 50233818a763..7ddcfc65e790 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1186,10 +1186,15 @@ static void __init asus_hides_smbus_hostbridge(struct pci_dev *dev) * its on-board VGA controller */ asus_hides_smbus = 1; } - else if (dev->device == PCI_DEVICE_ID_INTEL_82845G_IG) + else if (dev->device == PCI_DEVICE_ID_INTEL_82801DB_2) switch(dev->subsystem_device) { case 0x00b8: /* Compaq Evo D510 CMT */ case 0x00b9: /* Compaq Evo D510 SFF */ + /* Motherboard doesn't have Host bridge + * subvendor/subdevice IDs and on-board VGA + * controller is disabled if an AGP card is + * inserted, therefore checking USB UHCI + * Controller #1 */ asus_hides_smbus = 1; } else if (dev->device == PCI_DEVICE_ID_INTEL_82815_CGC) @@ -1214,7 +1219,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82855GM_HB, as DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82915GM_HB, asus_hides_smbus_hostbridge); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82810_IG3, asus_hides_smbus_hostbridge); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82845G_IG, asus_hides_smbus_hostbridge); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_2, asus_hides_smbus_hostbridge); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82815_CGC, asus_hides_smbus_hostbridge); static void asus_hides_smbus_lpc(struct pci_dev *dev) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index aca8c458aa8a..3ddf8beabdf8 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2373,6 +2373,7 @@ #define PCI_DEVICE_ID_INTEL_82801CA_12 0x248c #define PCI_DEVICE_ID_INTEL_82801DB_0 0x24c0 #define 
PCI_DEVICE_ID_INTEL_82801DB_1 0x24c1 +#define PCI_DEVICE_ID_INTEL_82801DB_2 0x24c2 #define PCI_DEVICE_ID_INTEL_82801DB_3 0x24c3 #define PCI_DEVICE_ID_INTEL_82801DB_5 0x24c5 #define PCI_DEVICE_ID_INTEL_82801DB_6 0x24c6 -- cgit v1.2.3-71-gd317 From d1b054da8f599905f3c18a218961dcf17f9d5f13 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:11 +0800 Subject: PCI: initialize and release SR-IOV capability If a device has the SR-IOV capability, initialize it (set the ARI Capable Hierarchy in the lowest numbered PF if necessary; calculate the System Page Size for the VF MMIO, probe the VF Offset, Stride and BARs). A lock for the VF bus allocation is also initialized if a PF is the lowest numbered PF. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/Kconfig | 10 +++ drivers/pci/Makefile | 2 + drivers/pci/iov.c | 182 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.c | 7 ++ drivers/pci/pci.h | 37 ++++++++++ drivers/pci/probe.c | 4 ++ include/linux/pci.h | 11 +++ include/linux/pci_regs.h | 33 +++++++++ 8 files changed, 286 insertions(+) create mode 100644 drivers/pci/iov.c (limited to 'include/linux') diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2a4501dd2515..fdc864f9cf23 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -59,3 +59,13 @@ config HT_IRQ This allows native hypertransport devices to use interrupts. If unsure say Y. + +config PCI_IOV + bool "PCI IOV support" + depends on PCI + help + I/O Virtualization is a PCI feature supported by some devices + which allows them to create virtual devices which share their + physical resources. + + If unsure, say N. diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 3d07ce24f6a8..ba6af162fd39 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -29,6 +29,8 @@ obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o +obj-$(CONFIG_PCI_IOV) += iov.o + # # Some architectures use the generic PCI setup functions # diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c new file mode 100644 index 000000000000..66cc414ed15f --- /dev/null +++ b/drivers/pci/iov.c @@ -0,0 +1,182 @@ +/* + * drivers/pci/iov.c + * + * Copyright (C) 2009 Intel Corporation, Yu Zhao + * + * PCI Express I/O Virtualization (IOV) support. 
+ * Single Root IOV 1.0 + */ + +#include +#include +#include +#include +#include "pci.h" + + +static int sriov_init(struct pci_dev *dev, int pos) +{ + int i; + int rc; + int nres; + u32 pgsz; + u16 ctrl, total, offset, stride; + struct pci_sriov *iov; + struct resource *res; + struct pci_dev *pdev; + + if (dev->pcie_type != PCI_EXP_TYPE_RC_END && + dev->pcie_type != PCI_EXP_TYPE_ENDPOINT) + return -ENODEV; + + pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl); + if (ctrl & PCI_SRIOV_CTRL_VFE) { + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0); + ssleep(1); + } + + pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total); + if (!total) + return 0; + + ctrl = 0; + list_for_each_entry(pdev, &dev->bus->devices, bus_list) + if (pdev->is_physfn) + goto found; + + pdev = NULL; + if (pci_ari_enabled(dev->bus)) + ctrl |= PCI_SRIOV_CTRL_ARI; + +found: + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl); + pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total); + pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset); + pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride); + if (!offset || (total > 1 && !stride)) + return -EIO; + + pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz); + i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0; + pgsz &= ~((1 << i) - 1); + if (!pgsz) + return -EIO; + + pgsz &= ~(pgsz - 1); + pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz); + + nres = 0; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + i += __pci_read_base(dev, pci_bar_unknown, res, + pos + PCI_SRIOV_BAR + i * 4); + if (!res->flags) + continue; + if (resource_size(res) & (PAGE_SIZE - 1)) { + rc = -EIO; + goto failed; + } + res->end = res->start + resource_size(res) * total - 1; + nres++; + } + + iov = kzalloc(sizeof(*iov), GFP_KERNEL); + if (!iov) { + rc = -ENOMEM; + goto failed; + } + + iov->pos = pos; + iov->nres = nres; + iov->ctrl = ctrl; + iov->total = total; + iov->offset = offset; + iov->stride = stride; + iov->pgsz = pgsz; + iov->self = dev; + pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap); + pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link); + + if (pdev) + iov->dev = pci_dev_get(pdev); + else { + iov->dev = dev; + mutex_init(&iov->lock); + } + + dev->sriov = iov; + dev->is_physfn = 1; + + return 0; + +failed: + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + res->flags = 0; + } + + return rc; +} + +static void sriov_release(struct pci_dev *dev) +{ + if (dev == dev->sriov->dev) + mutex_destroy(&dev->sriov->lock); + else + pci_dev_put(dev->sriov->dev); + + kfree(dev->sriov); + dev->sriov = NULL; +} + +/** + * pci_iov_init - initialize the IOV capability + * @dev: the PCI device + * + * Returns 0 on success, or negative on failure. + */ +int pci_iov_init(struct pci_dev *dev) +{ + int pos; + + if (!dev->is_pcie) + return -ENODEV; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV); + if (pos) + return sriov_init(dev, pos); + + return -ENODEV; +} + +/** + * pci_iov_release - release resources used by the IOV capability + * @dev: the PCI device + */ +void pci_iov_release(struct pci_dev *dev) +{ + if (dev->is_physfn) + sriov_release(dev); +} + +/** + * pci_iov_resource_bar - get position of the SR-IOV BAR + * @dev: the PCI device + * @resno: the resource number + * @type: the BAR type to be filled in + * + * Returns position of the BAR encapsulated in the SR-IOV capability. 
+ */ +int pci_iov_resource_bar(struct pci_dev *dev, int resno, + enum pci_bar_type *type) +{ + if (resno < PCI_IOV_RESOURCES || resno > PCI_IOV_RESOURCE_END) + return 0; + + BUG_ON(!dev->is_physfn); + + *type = pci_bar_unknown; + + return dev->sriov->pos + PCI_SRIOV_BAR + + 4 * (resno - PCI_IOV_RESOURCES); +} diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a35a8b2ba631..2b3201ec2b05 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2360,12 +2360,19 @@ int pci_select_bars(struct pci_dev *dev, unsigned long flags) */ int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type) { + int reg; + if (resno < PCI_ROM_RESOURCE) { *type = pci_bar_unknown; return PCI_BASE_ADDRESS_0 + 4 * resno; } else if (resno == PCI_ROM_RESOURCE) { *type = pci_bar_mem32; return dev->rom_base_reg; + } else if (resno < PCI_BRIDGE_RESOURCES) { + /* device specific resource */ + reg = pci_iov_resource_bar(dev, resno, type); + if (reg) + return reg; } dev_err(&dev->dev, "BAR: invalid resource #%d\n", resno); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2cd1cba7236f..7d5327c986f5 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -201,4 +201,41 @@ resource_size_t pci_specified_resource_alignment(struct pci_dev *dev); extern void pci_disable_bridge_window(struct pci_dev *dev); #endif +/* Single Root I/O Virtualization */ +struct pci_sriov { + int pos; /* capability position */ + int nres; /* number of resources */ + u32 cap; /* SR-IOV Capabilities */ + u16 ctrl; /* SR-IOV Control */ + u16 total; /* total VFs associated with the PF */ + u16 offset; /* first VF Routing ID offset */ + u16 stride; /* following VF stride */ + u32 pgsz; /* page size for BAR alignment */ + u8 link; /* Function Dependency Link */ + struct pci_dev *dev; /* lowest numbered PF */ + struct pci_dev *self; /* this PF */ + struct mutex lock; /* lock for VF bus */ +}; + +#ifdef CONFIG_PCI_IOV +extern int pci_iov_init(struct pci_dev *dev); +extern void pci_iov_release(struct pci_dev *dev); +extern int pci_iov_resource_bar(struct pci_dev *dev, int resno, + enum pci_bar_type *type); +#else +static inline int pci_iov_init(struct pci_dev *dev) +{ + return -ENODEV; +} +static inline void pci_iov_release(struct pci_dev *dev) + +{ +} +static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno, + enum pci_bar_type *type) +{ + return 0; +} +#endif /* CONFIG_PCI_IOV */ + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 579a56c8181f..0471f6ea1466 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -785,6 +785,7 @@ static int pci_setup_device(struct pci_dev * dev) static void pci_release_capabilities(struct pci_dev *dev) { pci_vpd_release(dev); + pci_iov_release(dev); } /** @@ -979,6 +980,9 @@ static void pci_init_capabilities(struct pci_dev *dev) /* Alternative Routing-ID Forwarding */ pci_enable_ari(dev); + + /* Single Root I/O Virtualization */ + pci_iov_init(dev); } void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) diff --git a/include/linux/pci.h b/include/linux/pci.h index 1f6c5ddaae36..8ce2f2d9ab63 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -93,6 +93,12 @@ enum { /* #6: expansion ROM resource */ PCI_ROM_RESOURCE, + /* device specific resources */ +#ifdef CONFIG_PCI_IOV + PCI_IOV_RESOURCES, + PCI_IOV_RESOURCE_END = PCI_IOV_RESOURCES + PCI_SRIOV_NUM_BARS - 1, +#endif + /* resources assigned to buses behind the bridge */ #define PCI_BRIDGE_RESOURCE_NUM 4 @@ -180,6 +186,7 @@ struct pci_cap_saved_state { struct 
pcie_link_state; struct pci_vpd; +struct pci_sriov; /* * The pci_dev structure is used to describe PCI devices. @@ -257,6 +264,7 @@ struct pci_dev { unsigned int is_managed:1; unsigned int is_pcie:1; unsigned int state_saved:1; + unsigned int is_physfn:1; pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ @@ -270,6 +278,9 @@ struct pci_dev { struct list_head msi_list; #endif struct pci_vpd *vpd; +#ifdef CONFIG_PCI_IOV + struct pci_sriov *sriov; /* SR-IOV capability related */ +#endif }; extern struct pci_dev *alloc_pci_dev(void); diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index b647a4df59fc..d4e663877f45 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -375,6 +375,7 @@ #define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */ #define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */ #define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */ +#define PCI_EXP_TYPE_RC_END 0x9 /* Root Complex Integrated Endpoint */ #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ #define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ #define PCI_EXP_DEVCAP 4 /* Device capabilities */ @@ -498,6 +499,7 @@ #define PCI_EXT_CAP_ID_DSN 3 #define PCI_EXT_CAP_ID_PWR 4 #define PCI_EXT_CAP_ID_ARI 14 +#define PCI_EXT_CAP_ID_SRIOV 16 /* Advanced Error Reporting */ #define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ @@ -615,4 +617,35 @@ #define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */ #define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */ +/* Single Root I/O Virtualization */ +#define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */ +#define PCI_SRIOV_CAP_VFM 0x01 /* VF Migration Capable */ +#define PCI_SRIOV_CAP_INTR(x) ((x) >> 21) /* Interrupt Message Number */ +#define PCI_SRIOV_CTRL 0x08 /* SR-IOV Control */ +#define PCI_SRIOV_CTRL_VFE 0x01 /* VF Enable */ +#define PCI_SRIOV_CTRL_VFM 0x02 /* VF Migration Enable */ +#define PCI_SRIOV_CTRL_INTR 0x04 /* VF Migration Interrupt Enable */ +#define PCI_SRIOV_CTRL_MSE 0x08 /* VF Memory Space Enable */ +#define PCI_SRIOV_CTRL_ARI 0x10 /* ARI Capable Hierarchy */ +#define PCI_SRIOV_STATUS 0x0a /* SR-IOV Status */ +#define PCI_SRIOV_STATUS_VFM 0x01 /* VF Migration Status */ +#define PCI_SRIOV_INITIAL_VF 0x0c /* Initial VFs */ +#define PCI_SRIOV_TOTAL_VF 0x0e /* Total VFs */ +#define PCI_SRIOV_NUM_VF 0x10 /* Number of VFs */ +#define PCI_SRIOV_FUNC_LINK 0x12 /* Function Dependency Link */ +#define PCI_SRIOV_VF_OFFSET 0x14 /* First VF Offset */ +#define PCI_SRIOV_VF_STRIDE 0x16 /* Following VF Stride */ +#define PCI_SRIOV_VF_DID 0x1a /* VF Device ID */ +#define PCI_SRIOV_SUP_PGSIZE 0x1c /* Supported Page Sizes */ +#define PCI_SRIOV_SYS_PGSIZE 0x20 /* System Page Size */ +#define PCI_SRIOV_BAR 0x24 /* VF BAR0 */ +#define PCI_SRIOV_NUM_BARS 6 /* Number of VF BARs */ +#define PCI_SRIOV_VFM 0x3c /* VF Migration State Array Offset*/ +#define PCI_SRIOV_VFM_BIR(x) ((x) & 7) /* State BIR */ +#define PCI_SRIOV_VFM_OFFSET(x) ((x) & ~7) /* State Offset */ +#define PCI_SRIOV_VFM_UA 0x0 /* Inactive.Unavailable */ +#define PCI_SRIOV_VFM_MI 0x1 /* Dormant.MigrateIn */ +#define PCI_SRIOV_VFM_MO 0x2 /* Active.MigrateOut */ +#define PCI_SRIOV_VFM_AV 0x3 /* Active.Available */ + #endif /* LINUX_PCI_REGS_H */ -- cgit v1.2.3-71-gd317 From dd7cc44d0bcec5e9c42fe52e88dc254ae62eac8d Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:15 +0800 Subject: PCI: add SR-IOV API for Physical Function driver Add or remove the Virtual Function when the SR-IOV is enabled or 
disabled by the device driver. This can happen anytime rather than only at the device probe stage. Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/iov.c | 314 ++++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 2 + include/linux/pci.h | 19 +++- 3 files changed, 334 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 5ddfc09a8d3f..d0ff8ad8f7ba 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -13,6 +13,7 @@ #include #include "pci.h" +#define VIRTFN_ID_LEN 16 static inline u8 virtfn_bus(struct pci_dev *dev, int id) { @@ -26,6 +27,284 @@ static inline u8 virtfn_devfn(struct pci_dev *dev, int id) dev->sriov->stride * id) & 0xff; } +static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr) +{ + int rc; + struct pci_bus *child; + + if (bus->number == busnr) + return bus; + + child = pci_find_bus(pci_domain_nr(bus), busnr); + if (child) + return child; + + child = pci_add_new_bus(bus, NULL, busnr); + if (!child) + return NULL; + + child->subordinate = busnr; + child->dev.parent = bus->bridge; + rc = pci_bus_add_child(child); + if (rc) { + pci_remove_bus(child); + return NULL; + } + + return child; +} + +static void virtfn_remove_bus(struct pci_bus *bus, int busnr) +{ + struct pci_bus *child; + + if (bus->number == busnr) + return; + + child = pci_find_bus(pci_domain_nr(bus), busnr); + BUG_ON(!child); + + if (list_empty(&child->devices)) + pci_remove_bus(child); +} + +static int virtfn_add(struct pci_dev *dev, int id, int reset) +{ + int i; + int rc; + u64 size; + char buf[VIRTFN_ID_LEN]; + struct pci_dev *virtfn; + struct resource *res; + struct pci_sriov *iov = dev->sriov; + + virtfn = alloc_pci_dev(); + if (!virtfn) + return -ENOMEM; + + mutex_lock(&iov->dev->sriov->lock); + virtfn->bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id)); + if (!virtfn->bus) { + kfree(virtfn); + mutex_unlock(&iov->dev->sriov->lock); + return -ENOMEM; + } + virtfn->devfn = virtfn_devfn(dev, id); + virtfn->vendor = dev->vendor; + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device); + pci_setup_device(virtfn); + virtfn->dev.parent = dev->dev.parent; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + if (!res->parent) + continue; + virtfn->resource[i].name = pci_name(virtfn); + virtfn->resource[i].flags = res->flags; + size = resource_size(res); + do_div(size, iov->total); + virtfn->resource[i].start = res->start + size * id; + virtfn->resource[i].end = virtfn->resource[i].start + size - 1; + rc = request_resource(res, &virtfn->resource[i]); + BUG_ON(rc); + } + + if (reset) + pci_execute_reset_function(virtfn); + + pci_device_add(virtfn, virtfn->bus); + mutex_unlock(&iov->dev->sriov->lock); + + virtfn->physfn = pci_dev_get(dev); + virtfn->is_virtfn = 1; + + rc = pci_bus_add_device(virtfn); + if (rc) + goto failed1; + sprintf(buf, "virtfn%u", id); + rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf); + if (rc) + goto failed1; + rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn"); + if (rc) + goto failed2; + + kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE); + + return 0; + +failed2: + sysfs_remove_link(&dev->dev.kobj, buf); +failed1: + pci_dev_put(dev); + mutex_lock(&iov->dev->sriov->lock); + pci_remove_bus_device(virtfn); + virtfn_remove_bus(dev->bus, virtfn_bus(dev, id)); + mutex_unlock(&iov->dev->sriov->lock); + + return rc; +} + +static void virtfn_remove(struct pci_dev *dev, 
int id, int reset) +{ + char buf[VIRTFN_ID_LEN]; + struct pci_bus *bus; + struct pci_dev *virtfn; + struct pci_sriov *iov = dev->sriov; + + bus = pci_find_bus(pci_domain_nr(dev->bus), virtfn_bus(dev, id)); + if (!bus) + return; + + virtfn = pci_get_slot(bus, virtfn_devfn(dev, id)); + if (!virtfn) + return; + + pci_dev_put(virtfn); + + if (reset) { + device_release_driver(&virtfn->dev); + pci_execute_reset_function(virtfn); + } + + sprintf(buf, "virtfn%u", id); + sysfs_remove_link(&dev->dev.kobj, buf); + sysfs_remove_link(&virtfn->dev.kobj, "physfn"); + + mutex_lock(&iov->dev->sriov->lock); + pci_remove_bus_device(virtfn); + virtfn_remove_bus(dev->bus, virtfn_bus(dev, id)); + mutex_unlock(&iov->dev->sriov->lock); + + pci_dev_put(dev); +} + +static int sriov_enable(struct pci_dev *dev, int nr_virtfn) +{ + int rc; + int i, j; + int nres; + u16 offset, stride, initial; + struct resource *res; + struct pci_dev *pdev; + struct pci_sriov *iov = dev->sriov; + + if (!nr_virtfn) + return 0; + + if (iov->nr_virtfn) + return -EINVAL; + + pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial); + if (initial > iov->total || + (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total))) + return -EIO; + + if (nr_virtfn < 0 || nr_virtfn > iov->total || + (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial))) + return -EINVAL; + + pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset); + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride); + if (!offset || (nr_virtfn > 1 && !stride)) + return -EIO; + + nres = 0; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = dev->resource + PCI_IOV_RESOURCES + i; + if (res->parent) + nres++; + } + if (nres != iov->nres) { + dev_err(&dev->dev, "not enough MMIO resources for SR-IOV\n"); + return -ENOMEM; + } + + iov->offset = offset; + iov->stride = stride; + + if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->subordinate) { + dev_err(&dev->dev, "SR-IOV: bus number out of range\n"); + return -ENOMEM; + } + + if (iov->link != dev->devfn) { + pdev = pci_get_slot(dev->bus, iov->link); + if (!pdev) + return -ENODEV; + + pci_dev_put(pdev); + + if (!pdev->is_physfn) + return -ENODEV; + + rc = sysfs_create_link(&dev->dev.kobj, + &pdev->dev.kobj, "dep_link"); + if (rc) + return rc; + } + + iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE; + pci_block_user_cfg_access(dev); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + msleep(100); + pci_unblock_user_cfg_access(dev); + + iov->initial = initial; + if (nr_virtfn < initial) + initial = nr_virtfn; + + for (i = 0; i < initial; i++) { + rc = virtfn_add(dev, i, 0); + if (rc) + goto failed; + } + + kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE); + iov->nr_virtfn = nr_virtfn; + + return 0; + +failed: + for (j = 0; j < i; j++) + virtfn_remove(dev, j, 0); + + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); + pci_block_user_cfg_access(dev); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + ssleep(1); + pci_unblock_user_cfg_access(dev); + + if (iov->link != dev->devfn) + sysfs_remove_link(&dev->dev.kobj, "dep_link"); + + return rc; +} + +static void sriov_disable(struct pci_dev *dev) +{ + int i; + struct pci_sriov *iov = dev->sriov; + + if (!iov->nr_virtfn) + return; + + for (i = 0; i < iov->nr_virtfn; i++) + virtfn_remove(dev, i, 0); + + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); + pci_block_user_cfg_access(dev); + pci_write_config_word(dev, iov->pos + 
PCI_SRIOV_CTRL, iov->ctrl); + ssleep(1); + pci_unblock_user_cfg_access(dev); + + if (iov->link != dev->devfn) + sysfs_remove_link(&dev->dev.kobj, "dep_link"); + + iov->nr_virtfn = 0; +} + static int sriov_init(struct pci_dev *dev, int pos) { int i; @@ -132,6 +411,8 @@ failed: static void sriov_release(struct pci_dev *dev) { + BUG_ON(dev->sriov->nr_virtfn); + if (dev == dev->sriov->dev) mutex_destroy(&dev->sriov->lock); else @@ -155,6 +436,7 @@ static void sriov_restore_state(struct pci_dev *dev) pci_update_resource(dev, i); pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->nr_virtfn); pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); if (iov->ctrl & PCI_SRIOV_CTRL_VFE) msleep(100); @@ -245,3 +527,35 @@ int pci_iov_bus_range(struct pci_bus *bus) return max ? max - bus->number : 0; } + +/** + * pci_enable_sriov - enable the SR-IOV capability + * @dev: the PCI device + * + * Returns 0 on success, or negative on failure. + */ +int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) +{ + might_sleep(); + + if (!dev->is_physfn) + return -ENODEV; + + return sriov_enable(dev, nr_virtfn); +} +EXPORT_SYMBOL_GPL(pci_enable_sriov); + +/** + * pci_disable_sriov - disable the SR-IOV capability + * @dev: the PCI device + */ +void pci_disable_sriov(struct pci_dev *dev) +{ + might_sleep(); + + if (!dev->is_physfn) + return; + + sriov_disable(dev); +} +EXPORT_SYMBOL_GPL(pci_disable_sriov); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index f4fc10fc5872..0f1c7d103509 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -209,6 +209,8 @@ struct pci_sriov { u32 cap; /* SR-IOV Capabilities */ u16 ctrl; /* SR-IOV Control */ u16 total; /* total VFs associated with the PF */ + u16 initial; /* initial VFs associated with the PF */ + u16 nr_virtfn; /* number of VFs available */ u16 offset; /* first VF Routing ID offset */ u16 stride; /* following VF stride */ u32 pgsz; /* page size for BAR alignment */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 8ce2f2d9ab63..c2e491e04063 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -265,6 +265,7 @@ struct pci_dev { unsigned int is_pcie:1; unsigned int state_saved:1; unsigned int is_physfn:1; + unsigned int is_virtfn:1; pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ @@ -279,7 +280,10 @@ struct pci_dev { #endif struct pci_vpd *vpd; #ifdef CONFIG_PCI_IOV - struct pci_sriov *sriov; /* SR-IOV capability related */ + union { + struct pci_sriov *sriov; /* SR-IOV capability related */ + struct pci_dev *physfn; /* the PF this VF is associated with */ + }; #endif }; @@ -1212,5 +1216,18 @@ int pci_ext_cfg_avail(struct pci_dev *dev); void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar); +#ifdef CONFIG_PCI_IOV +extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); +extern void pci_disable_sriov(struct pci_dev *dev); +#else +static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) +{ + return -ENODEV; +} +static inline void pci_disable_sriov(struct pci_dev *dev) +{ +} +#endif + #endif /* __KERNEL__ */ #endif /* LINUX_PCI_H */ -- cgit v1.2.3-71-gd317 From 74bb1bcc7dbbc9ddef773bf3395d7ff92aaaad2e Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:16 +0800 Subject: PCI: handle SR-IOV Virtual Function Migration Add or remove a Virtual Function after receiving a Migrate In or Out Request. 
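For illustration only (not part of the patch): a Physical Function driver is expected to register an interrupt handler for the VF Migration Interrupt Message Number, typically via request_irq() on the MSI/MSI-X vector corresponding to PCI_SRIOV_CAP_INTR(), and simply forward the interrupt to pci_sriov_migration(), which this patch exports below. The foo_* names are hypothetical.

    #include <linux/pci.h>
    #include <linux/interrupt.h>

    /* Hypothetical PF driver: forward the migration interrupt to the core. */
    static irqreturn_t foo_vf_migration_irq(int irq, void *data)
    {
            struct pci_dev *pdev = data;

            /* Adds or removes VFs according to the VF Migration State Array. */
            return pci_sriov_migration(pdev);
    }
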
Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/iov.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 4 ++ include/linux/pci.h | 6 +++ 3 files changed, 129 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index d0ff8ad8f7ba..7227efc760db 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -179,6 +179,97 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset) pci_dev_put(dev); } +static int sriov_migration(struct pci_dev *dev) +{ + u16 status; + struct pci_sriov *iov = dev->sriov; + + if (!iov->nr_virtfn) + return 0; + + if (!(iov->cap & PCI_SRIOV_CAP_VFM)) + return 0; + + pci_read_config_word(dev, iov->pos + PCI_SRIOV_STATUS, &status); + if (!(status & PCI_SRIOV_STATUS_VFM)) + return 0; + + schedule_work(&iov->mtask); + + return 1; +} + +static void sriov_migration_task(struct work_struct *work) +{ + int i; + u8 state; + u16 status; + struct pci_sriov *iov = container_of(work, struct pci_sriov, mtask); + + for (i = iov->initial; i < iov->nr_virtfn; i++) { + state = readb(iov->mstate + i); + if (state == PCI_SRIOV_VFM_MI) { + writeb(PCI_SRIOV_VFM_AV, iov->mstate + i); + state = readb(iov->mstate + i); + if (state == PCI_SRIOV_VFM_AV) + virtfn_add(iov->self, i, 1); + } else if (state == PCI_SRIOV_VFM_MO) { + virtfn_remove(iov->self, i, 1); + writeb(PCI_SRIOV_VFM_UA, iov->mstate + i); + state = readb(iov->mstate + i); + if (state == PCI_SRIOV_VFM_AV) + virtfn_add(iov->self, i, 0); + } + } + + pci_read_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, &status); + status &= ~PCI_SRIOV_STATUS_VFM; + pci_write_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, status); +} + +static int sriov_enable_migration(struct pci_dev *dev, int nr_virtfn) +{ + int bir; + u32 table; + resource_size_t pa; + struct pci_sriov *iov = dev->sriov; + + if (nr_virtfn <= iov->initial) + return 0; + + pci_read_config_dword(dev, iov->pos + PCI_SRIOV_VFM, &table); + bir = PCI_SRIOV_VFM_BIR(table); + if (bir > PCI_STD_RESOURCE_END) + return -EIO; + + table = PCI_SRIOV_VFM_OFFSET(table); + if (table + nr_virtfn > pci_resource_len(dev, bir)) + return -EIO; + + pa = pci_resource_start(dev, bir) + table; + iov->mstate = ioremap(pa, nr_virtfn); + if (!iov->mstate) + return -ENOMEM; + + INIT_WORK(&iov->mtask, sriov_migration_task); + + iov->ctrl |= PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR; + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + + return 0; +} + +static void sriov_disable_migration(struct pci_dev *dev) +{ + struct pci_sriov *iov = dev->sriov; + + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR); + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); + + cancel_work_sync(&iov->mtask); + iounmap(iov->mstate); +} + static int sriov_enable(struct pci_dev *dev, int nr_virtfn) { int rc; @@ -261,6 +352,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) goto failed; } + if (iov->cap & PCI_SRIOV_CAP_VFM) { + rc = sriov_enable_migration(dev, nr_virtfn); + if (rc) + goto failed; + } + kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE); iov->nr_virtfn = nr_virtfn; @@ -290,6 +387,9 @@ static void sriov_disable(struct pci_dev *dev) if (!iov->nr_virtfn) return; + if (iov->cap & PCI_SRIOV_CAP_VFM) + sriov_disable_migration(dev); + for (i = 0; i < iov->nr_virtfn; i++) virtfn_remove(dev, i, 0); @@ -559,3 +659,22 @@ void pci_disable_sriov(struct pci_dev *dev) sriov_disable(dev); } EXPORT_SYMBOL_GPL(pci_disable_sriov); + +/** + * 
pci_sriov_migration - notify SR-IOV core of Virtual Function Migration + * @dev: the PCI device + * + * Returns IRQ_HANDLED if the IRQ is handled, or IRQ_NONE if not. + * + * Physical Function driver is responsible to register IRQ handler using + * VF Migration Interrupt Message Number, and call this function when the + * interrupt is generated by the hardware. + */ +irqreturn_t pci_sriov_migration(struct pci_dev *dev) +{ + if (!dev->is_physfn) + return IRQ_NONE; + + return sriov_migration(dev) ? IRQ_HANDLED : IRQ_NONE; +} +EXPORT_SYMBOL_GPL(pci_sriov_migration); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 0f1c7d103509..22dcfdb75d91 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -1,6 +1,8 @@ #ifndef DRIVERS_PCI_H #define DRIVERS_PCI_H +#include + #define PCI_CFG_SPACE_SIZE 256 #define PCI_CFG_SPACE_EXP_SIZE 4096 @@ -218,6 +220,8 @@ struct pci_sriov { struct pci_dev *dev; /* lowest numbered PF */ struct pci_dev *self; /* this PF */ struct mutex lock; /* lock for VF bus */ + struct work_struct mtask; /* VF Migration task */ + u8 __iomem *mstate; /* VF Migration State Array */ }; #ifdef CONFIG_PCI_IOV diff --git a/include/linux/pci.h b/include/linux/pci.h index c2e491e04063..1216843412da 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -52,6 +52,7 @@ #include #include #include +#include /* Include the ID list */ #include @@ -1219,6 +1220,7 @@ void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar); #ifdef CONFIG_PCI_IOV extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); extern void pci_disable_sriov(struct pci_dev *dev); +extern irqreturn_t pci_sriov_migration(struct pci_dev *dev); #else static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) { @@ -1227,6 +1229,10 @@ static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) static inline void pci_disable_sriov(struct pci_dev *dev) { } +static inline irqreturn_t pci_sriov_migration(struct pci_dev *dev) +{ + return IRQ_NONE; +} #endif #endif /* __KERNEL__ */ -- cgit v1.2.3-71-gd317 From 79af72d716cf1bb13b175429cf181a6c4d063ee8 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 20 Mar 2009 14:55:55 -0600 Subject: PCI: pci_is_root_bus helper Introduce pci_is_root_bus helper function. This will help make code more consistent, as well as prevent incorrect assumptions (such as pci_bus->self == NULL on a root bus, which is not always true). Signed-off-by: Kenji Kaneshige Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- include/linux/pci.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1216843412da..50d94388e87c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -357,6 +357,15 @@ struct pci_bus { #define pci_bus_b(n) list_entry(n, struct pci_bus, node) #define to_pci_bus(n) container_of(n, struct pci_bus, dev) +/* + * Returns true if the pci bus is root (behind host-pci bridge), + * false otherwise + */ +static inline bool pci_is_root_bus(struct pci_bus *pbus) +{ + return !(pbus->parent); +} + #ifdef CONFIG_PCI_MSI static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev) { -- cgit v1.2.3-71-gd317 From 3ed4fd96b3188406ac5357d9290bcffa08c65cf6 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:25 -0600 Subject: PCI: Introduce pci_rescan_bus() This API is used by the PCI core to rescan a bus and rediscover newly added devices. 
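A minimal, hypothetical caller might look like the sketch below; the domain and bus numbers are illustrative only and not mandated by the API.

    #include <linux/pci.h>

    /* Hypothetical: rescan the root bus of domain 0 after a hotplug event. */
    static void foo_rescan_root_bus(void)
    {
            struct pci_bus *bus = pci_find_bus(0, 0);

            if (bus)
                    pci_rescan_bus(bus);
    }
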
Over time, it is expected that the various PCI hotplug drivers will migrate to this interface and away from the old pci_do_scan_bus() interface. Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/fakephp.c | 6 +++--- drivers/pci/probe.c | 32 ++++++++++++++++++++++++++++++++ include/linux/pci.h | 3 +++ 3 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c index d8649e127298..16063745766e 100644 --- a/drivers/pci/hotplug/fakephp.c +++ b/drivers/pci/hotplug/fakephp.c @@ -245,12 +245,12 @@ static int pci_rescan_slot(struct pci_dev *temp) /** - * pci_rescan_bus - Rescan PCI bus + * pci_rescan_bus_local - fakephp version of rescan PCI bus * @bus: the PCI bus to rescan * * Call pci_rescan_slot for each possible function of the bus. */ -static void pci_rescan_bus(const struct pci_bus *bus) +static void pci_rescan_bus_local(const struct pci_bus *bus) { unsigned int devfn; struct pci_dev *dev; @@ -291,7 +291,7 @@ static void pci_rescan_buses(const struct list_head *list) const struct list_head *l; list_for_each(l,list) { const struct pci_bus *b = pci_bus_b(l); - pci_rescan_bus(b); + pci_rescan_bus_local(b); pci_rescan_buses(&b->children); } } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index f69256c63b2b..60a8e5fec6c5 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1212,6 +1212,38 @@ struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent, EXPORT_SYMBOL(pci_scan_bus_parented); #ifdef CONFIG_HOTPLUG +/** + * pci_rescan_bus - scan a PCI bus for devices. + * @bus: PCI bus to scan + * + * Scan a PCI bus and child buses for new devices, adds them, + * and enables them. + * + * Returns the max number of subordinate bus discovered. + */ +unsigned int __devinit pci_rescan_bus(struct pci_bus *bus) +{ + unsigned int max; + struct pci_dev *dev; + + max = pci_scan_child_bus(bus); + + up_read(&pci_bus_sem); + list_for_each_entry(dev, &bus->devices, bus_list) + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + if (dev->subordinate) + pci_bus_size_bridges(dev->subordinate); + down_read(&pci_bus_sem); + + pci_bus_assign_resources(bus); + pci_enable_bridges(bus); + pci_bus_add_devices(bus); + + return max; +} +EXPORT_SYMBOL_GPL(pci_rescan_bus); + EXPORT_SYMBOL(pci_add_new_bus); EXPORT_SYMBOL(pci_scan_slot); EXPORT_SYMBOL(pci_scan_bridge); diff --git a/include/linux/pci.h b/include/linux/pci.h index 50d94388e87c..6fb335b0d74f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -726,6 +726,9 @@ int pci_back_from_sleep(struct pci_dev *dev); /* Functions for PCI Hotplug drivers to use */ int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap); +#ifdef CONFIG_HOTPLUG +unsigned int pci_rescan_bus(struct pci_bus *bus); +#endif /* Vital product data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); -- cgit v1.2.3-71-gd317 From e2fc4d19292ef2eb208f76976ddc3320cc5839b6 Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Sat, 21 Mar 2009 13:31:23 -0700 Subject: dca: add missing copyright/license headers In two dca files copyright and license headers are missing. This patch adds them there. Signed-off-by: Maciej Sosnowski Signed-off-by: David S. 
Miller --- drivers/dca/dca-sysfs.c | 21 +++++++++++++++++++++ include/linux/dca.h | 20 ++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/drivers/dca/dca-sysfs.c b/drivers/dca/dca-sysfs.c index bb538b9690e0..ee916c9857ee 100644 --- a/drivers/dca/dca-sysfs.c +++ b/drivers/dca/dca-sysfs.c @@ -1,3 +1,24 @@ +/* + * Copyright(c) 2007 - 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + #include #include #include diff --git a/include/linux/dca.h b/include/linux/dca.h index b00a753eda53..9c20c7e87d0a 100644 --- a/include/linux/dca.h +++ b/include/linux/dca.h @@ -1,3 +1,23 @@ +/* + * Copyright(c) 2007 - 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ #ifndef DCA_H #define DCA_H /* DCA Provider API */ -- cgit v1.2.3-71-gd317 From 9247744e5eaa29aecee5342a0c8694187a6aadcd Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 21 Mar 2009 13:39:26 -0700 Subject: skb: expose and constify hash primitives Some minor changes to queue hashing: 1. Use const on accessor functions 2. Export skb_tx_hash for use in drivers (see ixgbe) Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 9 ++++++--- net/core/dev.c | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1fbab2ae613c..bb1981fd60f3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1969,7 +1969,7 @@ static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) skb->queue_mapping = queue_mapping; } -static inline u16 skb_get_queue_mapping(struct sk_buff *skb) +static inline u16 skb_get_queue_mapping(const struct sk_buff *skb) { return skb->queue_mapping; } @@ -1984,16 +1984,19 @@ static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) skb->queue_mapping = rx_queue + 1; } -static inline u16 skb_get_rx_queue(struct sk_buff *skb) +static inline u16 skb_get_rx_queue(const struct sk_buff *skb) { return skb->queue_mapping - 1; } -static inline bool skb_rx_queue_recorded(struct sk_buff *skb) +static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) { return (skb->queue_mapping != 0); } +extern u16 skb_tx_hash(const struct net_device *dev, + const struct sk_buff *skb); + #ifdef CONFIG_XFRM static inline struct sec_path *skb_sec_path(struct sk_buff *skb) { diff --git a/net/core/dev.c b/net/core/dev.c index ca212acd3348..fdb9973b82a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1725,7 +1725,7 @@ out_kfree_skb: static u32 skb_tx_hashrnd; -static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) +u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) { u32 hash; @@ -1740,6 +1740,7 @@ static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } +EXPORT_SYMBOL(skb_tx_hash); static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) -- cgit v1.2.3-71-gd317 From 777baa4711c6b8373f4e03a3a558d44a6b046d7a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 20 Mar 2009 19:35:54 +0000 Subject: usbnet: support net_device_ops Use net_device_ops for usbnet device, and export for use by other derived drivers. Signed-off-by: Stephen Hemminger Acked-by: David Brownell Signed-off-by: David S. 
Miller --- drivers/net/usb/usbnet.c | 31 +++++++++++++++++++++++-------- include/linux/usb/usbnet.h | 5 +++++ 2 files changed, 28 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 084141692245..659654f45880 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -223,7 +223,7 @@ EXPORT_SYMBOL_GPL(usbnet_skb_return); * *-------------------------------------------------------------------------*/ -static int usbnet_change_mtu (struct net_device *net, int new_mtu) +int usbnet_change_mtu (struct net_device *net, int new_mtu) { struct usbnet *dev = netdev_priv(net); int ll_mtu = new_mtu + net->hard_header_len; @@ -246,6 +246,7 @@ static int usbnet_change_mtu (struct net_device *net, int new_mtu) return 0; } +EXPORT_SYMBOL_GPL(usbnet_change_mtu); /*-------------------------------------------------------------------------*/ @@ -540,7 +541,7 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs); // precondition: never called in_interrupt -static int usbnet_stop (struct net_device *net) +int usbnet_stop (struct net_device *net) { struct usbnet *dev = netdev_priv(net); int temp; @@ -584,6 +585,7 @@ static int usbnet_stop (struct net_device *net) return 0; } +EXPORT_SYMBOL_GPL(usbnet_stop); /*-------------------------------------------------------------------------*/ @@ -591,7 +593,7 @@ static int usbnet_stop (struct net_device *net) // precondition: never called in_interrupt -static int usbnet_open (struct net_device *net) +int usbnet_open (struct net_device *net) { struct usbnet *dev = netdev_priv(net); int retval; @@ -666,6 +668,7 @@ done: done_nopm: return retval; } +EXPORT_SYMBOL_GPL(usbnet_open); /*-------------------------------------------------------------------------*/ @@ -900,7 +903,7 @@ static void tx_complete (struct urb *urb) /*-------------------------------------------------------------------------*/ -static void usbnet_tx_timeout (struct net_device *net) +void usbnet_tx_timeout (struct net_device *net) { struct usbnet *dev = netdev_priv(net); @@ -909,10 +912,11 @@ static void usbnet_tx_timeout (struct net_device *net) // FIXME: device recovery -- reset? 
} +EXPORT_SYMBOL_GPL(usbnet_tx_timeout); /*-------------------------------------------------------------------------*/ -static int usbnet_start_xmit (struct sk_buff *skb, struct net_device *net) +int usbnet_start_xmit (struct sk_buff *skb, struct net_device *net) { struct usbnet *dev = netdev_priv(net); int length; @@ -995,7 +999,7 @@ drop: } return retval; } - +EXPORT_SYMBOL_GPL(usbnet_start_xmit); /*-------------------------------------------------------------------------*/ @@ -1102,6 +1106,15 @@ void usbnet_disconnect (struct usb_interface *intf) } EXPORT_SYMBOL_GPL(usbnet_disconnect); +static const struct net_device_ops usbnet_netdev_ops = { + .ndo_open = usbnet_open, + .ndo_stop = usbnet_stop, + .ndo_start_xmit = usbnet_start_xmit, + .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_change_mtu = usbnet_change_mtu, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, +}; /*-------------------------------------------------------------------------*/ @@ -1171,12 +1184,14 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) net->features |= NETIF_F_HIGHDMA; #endif - net->change_mtu = usbnet_change_mtu; + net->netdev_ops = &usbnet_netdev_ops; +#ifdef CONFIG_COMPAT_NET_DEV_OPS net->hard_start_xmit = usbnet_start_xmit; net->open = usbnet_open; net->stop = usbnet_stop; - net->watchdog_timeo = TX_TIMEOUT_JIFFIES; net->tx_timeout = usbnet_tx_timeout; +#endif + net->watchdog_timeo = TX_TIMEOUT_JIFFIES; net->ethtool_ops = &usbnet_ethtool_ops; // allow device-specific bind/init procedures diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 7d3822243074..36fabb95c7d3 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -176,6 +176,11 @@ struct skb_data { /* skb->cb is one of these */ size_t length; }; +extern int usbnet_open (struct net_device *net); +extern int usbnet_stop (struct net_device *net); +extern int usbnet_start_xmit (struct sk_buff *skb, struct net_device *net); +extern void usbnet_tx_timeout (struct net_device *net); +extern int usbnet_change_mtu (struct net_device *net, int new_mtu); extern int usbnet_get_endpoints(struct usbnet *, struct usb_interface *); extern void usbnet_defer_kevent (struct usbnet *, int); -- cgit v1.2.3-71-gd317 From 2d622719f1572ef31e0616444a515eba3094d050 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:30:49 -0500 Subject: tracing: add ring_buffer_event_discard() to ring buffer This patch overloads RINGBUF_TYPE_PADDING to provide a way to discard events from the ring buffer, for the event-filtering mechanism introduced in a subsequent patch. I did the initial version but thanks to Steven Rostedt for adding the parts that actually made it work. 
;-) Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 11 +++-- kernel/trace/ring_buffer.c | 117 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 105 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 9e6052bd1a1c..e1b7b2173885 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -18,10 +18,13 @@ struct ring_buffer_event { /** * enum ring_buffer_type - internal ring buffer types * - * @RINGBUF_TYPE_PADDING: Left over page padding - * array is ignored - * size is variable depending on how much + * @RINGBUF_TYPE_PADDING: Left over page padding or discarded event + * If time_delta is 0: + * array is ignored + * size is variable depending on how much * padding is needed + * If time_delta is non zero: + * everything else same as RINGBUF_TYPE_DATA * * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta * array[0] = time delta (28 .. 59) @@ -65,6 +68,8 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } +void ring_buffer_event_discard(struct ring_buffer_event *event); + /* * size is in bytes for each per CPU buffer. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 384ca5d9d729..a09027ec1714 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -189,16 +189,65 @@ enum { RB_LEN_TIME_STAMP = 16, }; -/* inline for ring buffer fast paths */ +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; +} + +static inline int rb_discarded_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta; +} + +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + event->time_delta = 0; +} + +/** + * ring_buffer_event_discard - discard an event in the ring buffer + * @buffer: the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. 
+ */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + static unsigned -rb_event_length(struct ring_buffer_event *event) +rb_event_data_length(struct ring_buffer_event *event) { unsigned length; + if (event->len) + length = event->len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} + +/* inline for ring buffer fast paths */ +static unsigned +rb_event_length(struct ring_buffer_event *event) +{ switch (event->type) { case RINGBUF_TYPE_PADDING: - /* undefined */ - return -1; + if (rb_null_event(event)) + /* undefined */ + return -1; + return rb_event_data_length(event); case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -207,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event) return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: - if (event->len) - length = event->len * RB_ALIGNMENT; - else - length = event->array[0]; - return length + RB_EVNT_HDR_SIZE; + return rb_event_data_length(event); default: BUG(); } @@ -845,11 +890,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); -static inline int rb_null_event(struct ring_buffer_event *event) -{ - return event->type == RINGBUF_TYPE_PADDING; -} - static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { @@ -1219,7 +1259,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail < BUF_PAGE_SIZE) { /* Mark the rest of the page with padding */ event = __rb_page_index(tail_page, tail); - event->type = RINGBUF_TYPE_PADDING; + rb_event_set_padding(event); } if (tail <= BUF_PAGE_SIZE) @@ -1969,7 +2009,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA) + if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2052,9 +2092,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - RB_WARN_ON(cpu_buffer, 1); + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. 
+ */ rb_advance_reader(cpu_buffer); - return NULL; + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2115,8 +2164,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - rb_inc_iter(iter); - goto again; + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2163,10 +2216,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2185,10 +2244,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2207,6 +2272,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event = NULL; unsigned long flags; + again: /* might be called in atomic */ preempt_disable(); @@ -2228,6 +2294,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); @@ -2306,6 +2377,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); if (!event) @@ -2315,6 +2387,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); -- cgit v1.2.3-71-gd317 From dd5b6ce6fd465eab90357711c8e8124dc3a31ff0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 23 Mar 2009 13:21:06 +0100 Subject: nefilter: nfnetlink: add nfnetlink_set_err and use it in ctnetlink This patch adds nfnetlink_set_err() to propagate the error to netlink broadcast listener in case of memory allocation errors in the message building. 
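The intended calling pattern, sketched hypothetically here (foo_notify and its group are made up; the real users are the ctnetlink hunks below), is to report -ENOBUFS to the broadcast group whenever building the event message fails:

    #include <linux/netfilter/nfnetlink.h>
    #include <net/netlink.h>

    /* Hypothetical event emitter: tell listeners when an event is dropped. */
    static void foo_notify(unsigned int group)
    {
            struct sk_buff *skb;

            skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
            if (!skb) {
                    nfnetlink_set_err(0, group, -ENOBUFS);
                    return;
            }

            /* ... build the netlink message in skb ... */

            nfnetlink_send(skb, 0, group, 0);
    }
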
Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink.h | 1 + net/netfilter/nf_conntrack_netlink.c | 2 ++ net/netfilter/nfnetlink.c | 6 ++++++ net/netlink/af_netlink.c | 1 + 4 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 7d8e0455ccac..135e5cfe68a2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -76,6 +76,7 @@ extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); extern int nfnetlink_has_listeners(unsigned int group); extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo); +extern void nfnetlink_set_err(u32 pid, u32 group, int error); extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags); extern void nfnl_lock(void); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d1fe9d15ac5c..1b75c9efb0eb 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -518,6 +518,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, nla_put_failure: rcu_read_unlock(); nlmsg_failure: + nfnetlink_set_err(0, group, -ENOBUFS); kfree_skb(skb); return NOTIFY_DONE; } @@ -1514,6 +1515,7 @@ static int ctnetlink_expect_event(struct notifier_block *this, nla_put_failure: rcu_read_unlock(); nlmsg_failure: + nfnetlink_set_err(0, 0, -ENOBUFS); kfree_skb(skb); return NOTIFY_DONE; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 9c0ba17a1ddb..2785d66a7e38 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -113,6 +113,12 @@ int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) } EXPORT_SYMBOL_GPL(nfnetlink_send); +void nfnetlink_set_err(u32 pid, u32 group, int error) +{ + netlink_set_err(nfnl, pid, group, error); +} +EXPORT_SYMBOL_GPL(nfnetlink_set_err); + int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) { return netlink_unicast(nfnl, skb, pid, flags); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6ee69c27f806..5b33879c6422 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1106,6 +1106,7 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) read_unlock(&nl_table_lock); } +EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, -- cgit v1.2.3-71-gd317 From c0f92ba99bdeaf35f9c580291b4e1a657c67fbd4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:44 +0100 Subject: debugfs: function to know if debugfs is initialized Impact: add new debugfs API With ftrace, some tracers are registered in early initcalls and attempt to create files on the debugfs filesystem. Depending on when they are activated, they can try to create their file at any time. Some checks can be done on the tracing area but providing a helper to know if debugfs is registered make it really more easy. 
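A hedged sketch of the intended use from such an early-registered tracer follows; the tracer and directory names are hypothetical and not part of this patch.

    #include <linux/debugfs.h>
    #include <linux/errno.h>
    #include <linux/init.h>

    static struct dentry *foo_trace_dir;

    /* Hypothetical early initcall that wants a debugfs directory. */
    static int __init foo_tracer_init(void)
    {
            /* debugfs may not have been registered yet this early in boot. */
            if (!debugfs_initialized())
                    return -ENODEV;

            foo_trace_dir = debugfs_create_dir("foo_tracer", NULL);
            if (!foo_trace_dir)
                    return -ENOMEM;

            return 0;
    }
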
Signed-off-by: Frederic Weisbecker Acked-by: Greg Kroah-Hartman Cc: Steven Rostedt LKML-Reference: <1237759847-21025-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/debugfs/inode.c | 16 ++++++++++++++++ include/linux/debugfs.h | 8 ++++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 81ae9ea3c6e1..0662ba6de85a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -30,6 +30,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; +static bool debugfs_registered; static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) { @@ -496,6 +497,16 @@ exit: } EXPORT_SYMBOL_GPL(debugfs_rename); +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + + static struct kobject *debug_kobj; static int __init debugfs_init(void) @@ -509,11 +520,16 @@ static int __init debugfs_init(void) retval = register_filesystem(&debug_fs_type); if (retval) kobject_put(debug_kobj); + else + debugfs_registered = true; + return retval; } static void __exit debugfs_exit(void) { + debugfs_registered = false; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); unregister_filesystem(&debug_fs_type); kobject_put(debug_kobj); diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index af0e01d4c663..eb5c2ba2f81a 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -71,6 +71,9 @@ struct dentry *debugfs_create_bool(const char *name, mode_t mode, struct dentry *debugfs_create_blob(const char *name, mode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob); + +bool debugfs_initialized(void); + #else #include @@ -183,6 +186,11 @@ static inline struct dentry *debugfs_create_blob(const char *name, mode_t mode, return ERR_PTR(-ENODEV); } +static inline bool debugfs_initialized(void) +{ + return false; +} + #endif #endif -- cgit v1.2.3-71-gd317 From d0bfb940ecabf0b44fb1fd80d8d60594e569e5ec Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: New guest debug interface This rips out the support for KVM_DEBUG_GUEST and introduces a new IOCTL instead: KVM_SET_GUEST_DEBUG. The IOCTL payload consists of a generic part, controlling the "main switch" and the single-step feature. The arch specific part adds an x86 interface for intercepting both types of debug exceptions separately and re-injecting them when the host was not interested. Moveover, the foundation for guest debugging via debug registers is layed. To signal breakpoint events properly back to userland, an arch-specific data block is now returned along KVM_EXIT_DEBUG. For x86, the arch block contains the PC, the debug exception, and relevant debug registers to tell debug events properly apart. The availability of this new interface is signaled by KVM_CAP_SET_GUEST_DEBUG. Empty stubs for not yet supported archs are provided. Note that both SVM and VTX are supported, but only the latter was tested yet. Based on the experience with all those VTX corner case, I would be fairly surprised if SVM will work out of the box. 
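A rough userspace illustration of the new ioctl (vcpu_fd is assumed to be a vcpu file descriptor obtained via KVM_CREATE_VCPU; error handling is minimal):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <string.h>

    /* Hypothetical helper: single-step one vcpu using the new interface. */
    static int enable_singlestep(int vcpu_fd)
    {
            struct kvm_guest_debug dbg;

            memset(&dbg, 0, sizeof(dbg));
            dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;

            return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
    }

On the next KVM_RUN, a resulting debug exit is reported as exit_reason == KVM_EXIT_DEBUG, with the architecture-specific details (on x86: PC, exception number, debug registers) available in run->debug.arch.
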
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm.h | 7 ++++ arch/ia64/kvm/kvm-ia64.c | 4 +- arch/powerpc/include/asm/kvm.h | 7 ++++ arch/powerpc/kvm/powerpc.c | 4 +- arch/s390/include/asm/kvm.h | 7 ++++ arch/s390/kvm/kvm-s390.c | 4 +- arch/x86/include/asm/kvm.h | 18 ++++++++ arch/x86/include/asm/kvm_host.h | 9 +--- arch/x86/kvm/svm.c | 50 +++++++++++++++++++++- arch/x86/kvm/vmx.c | 93 ++++++++++++++++------------------------- arch/x86/kvm/x86.c | 14 ++++--- include/linux/kvm.h | 51 +++++++++++++++------- include/linux/kvm_host.h | 6 +-- virt/kvm/kvm_main.c | 6 +-- 14 files changed, 179 insertions(+), 101 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index bfa86b6af7cd..be3fdb891214 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -214,4 +214,11 @@ struct kvm_sregs { struct kvm_fpu { }; +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + #endif diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 28f982045f29..de47467a0e61 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1303,8 +1303,8 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return -EINVAL; } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { return -EINVAL; } diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index f993e4198d5c..755f1b1948c5 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -52,4 +52,11 @@ struct kvm_fpu { __u64 fpr[32]; }; +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 5f81256287f5..7c2ad4017d6a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -240,8 +240,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvmppc_core_vcpu_put(vcpu); } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { int i; diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index e1f54654e3ae..0b2f829f6d50 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -42,4 +42,11 @@ struct kvm_fpu { __u64 fprs[16]; }; +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + #endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0d33893e1e89..cbfe91e10120 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -422,8 +422,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return -EINVAL; /* not implemented yet */ } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { return -EINVAL; /* not implemented yet */ } diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 886c9402ec45..32eb96c7ca27 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -212,6 +212,24 @@ struct kvm_pit_channel_state { __s64 count_load_time; }; +struct kvm_debug_exit_arch { + __u32 exception; + __u32 
pad; + __u64 pc; + __u64 dr6; + __u64 dr7; +}; + +#define KVM_GUESTDBG_USE_SW_BP 0x00010000 +#define KVM_GUESTDBG_USE_HW_BP 0x00020000 +#define KVM_GUESTDBG_INJECT_DB 0x00040000 +#define KVM_GUESTDBG_INJECT_BP 0x00080000 + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { + __u64 debugreg[8]; +}; + struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 53779309514a..c430cd580ee2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -135,12 +135,6 @@ enum { #define KVM_NR_MEM_OBJS 40 -struct kvm_guest_debug { - int enabled; - unsigned long bp[4]; - int singlestep; -}; - /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -448,8 +442,7 @@ struct kvm_x86_ops { void (*vcpu_put)(struct kvm_vcpu *vcpu); int (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg); - void (*guest_debug_pre)(struct kvm_vcpu *vcpu); + struct kvm_guest_debug *dbg); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0fbbde54ecae..88d9062f4545 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -968,9 +968,32 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - return -EOPNOTSUPP; + int old_debug = vcpu->guest_debug; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->guest_debug = dbg->control; + + svm->vmcb->control.intercept_exceptions &= + ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + svm->vmcb->control.intercept_exceptions |= + 1 << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + svm->vmcb->control.intercept_exceptions |= + 1 << BP_VECTOR; + } else + vcpu->guest_debug = 0; + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + else if (old_debug & KVM_GUESTDBG_SINGLESTEP) + svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + + return 0; } static int svm_get_irq(struct kvm_vcpu *vcpu) @@ -1094,6 +1117,27 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } +static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (!(svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + kvm_queue_exception(&svm->vcpu, DB_VECTOR); + return 1; + } + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = DB_VECTOR; + return 0; +} + +static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = BP_VECTOR; + return 0; +} + static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { int er; @@ -2050,6 +2094,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 
[SVM_EXIT_WRITE_DR5] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, + [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, + [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d974c1eaa7d..f55690ddb3ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -480,8 +480,13 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; - if (vcpu->guest_debug.enabled) - eb |= 1u << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + eb |= 1u << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + eb |= 1u << BP_VECTOR; + } if (vcpu->arch.rmode.active) eb = ~0; if (vm_need_ept()) @@ -1003,40 +1008,23 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - unsigned long dr7 = 0x400; - int old_singlestep; - - old_singlestep = vcpu->guest_debug.singlestep; - - vcpu->guest_debug.enabled = dbg->enabled; - if (vcpu->guest_debug.enabled) { - int i; - - dr7 |= 0x200; /* exact */ - for (i = 0; i < 4; ++i) { - if (!dbg->breakpoints[i].enabled) - continue; - vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; - dr7 |= 2 << (i*2); /* global enable */ - dr7 |= 0 << (i*4+16); /* execution breakpoint */ - } - - vcpu->guest_debug.singlestep = dbg->singlestep; - } else - vcpu->guest_debug.singlestep = 0; + int old_debug = vcpu->guest_debug; + unsigned long flags; - if (old_singlestep && !vcpu->guest_debug.singlestep) { - unsigned long flags; + vcpu->guest_debug = dbg->control; + if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) + vcpu->guest_debug = 0; - flags = vmcs_readl(GUEST_RFLAGS); + flags = vmcs_readl(GUEST_RFLAGS); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + else if (old_debug & KVM_GUESTDBG_SINGLESTEP) flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - vmcs_writel(GUEST_RFLAGS, flags); - } + vmcs_writel(GUEST_RFLAGS, flags); update_exception_bitmap(vcpu); - vmcs_writel(GUEST_DR7, dr7); return 0; } @@ -2540,24 +2528,6 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) -{ - struct kvm_guest_debug *dbg = &vcpu->guest_debug; - - set_debugreg(dbg->bp[0], 0); - set_debugreg(dbg->bp[1], 1); - set_debugreg(dbg->bp[2], 2); - set_debugreg(dbg->bp[3], 3); - - if (dbg->singlestep) { - unsigned long flags; - - flags = vmcs_readl(GUEST_RFLAGS); - flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - vmcs_writel(GUEST_RFLAGS, flags); - } -} - static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { @@ -2574,9 +2544,17 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * the required debugging infrastructure rework. 
*/ switch (vec) { - case DE_VECTOR: case DB_VECTOR: + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return 0; + kvm_queue_exception(vcpu, vec); + return 1; case BP_VECTOR: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return 0; + /* fall through */ + case DE_VECTOR: case OF_VECTOR: case BR_VECTOR: case UD_VECTOR: @@ -2593,7 +2571,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info, error_code; + u32 intr_info, ex_no, error_code; unsigned long cr2, rip; u32 vect_info; enum emulation_result er; @@ -2653,14 +2631,16 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | 1)) { + ex_no = intr_info & INTR_INFO_VECTOR_MASK; + if (ex_no == DB_VECTOR || ex_no == BP_VECTOR) { kvm_run->exit_reason = KVM_EXIT_DEBUG; - return 0; + kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.exception = ex_no; + } else { + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = ex_no; + kvm_run->ex.error_code = error_code; } - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; - kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; - kvm_run->ex.error_code = error_code; return 0; } @@ -3600,7 +3580,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_put = vmx_vcpu_put, .set_guest_debug = set_guest_debug, - .guest_debug_pre = kvm_guest_debug_pre, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b5e9932e0f62..e990d164b56d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3005,9 +3005,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - vcpu->guest_mode = 1; /* * Make sure that guest_mode assignment won't happen after @@ -3218,7 +3215,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) /* * Don't leak debug flags in case they were set for guest debugging */ - if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); vcpu_put(vcpu); @@ -3837,8 +3834,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { int r; @@ -3846,6 +3843,11 @@ int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, r = kvm_x86_ops->set_guest_debug(vcpu, dbg); + if (dbg->control & KVM_GUESTDBG_INJECT_DB) + kvm_queue_exception(vcpu, DB_VECTOR); + else if (dbg->control & KVM_GUESTDBG_INJECT_BP) + kvm_queue_exception(vcpu, BP_VECTOR); + vcpu_put(vcpu); return r; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 0424326f1679..429a2ce202f9 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -126,6 +126,7 @@ struct kvm_run { __u64 data_offset; /* relative to kvm_run start */ } io; struct { + struct kvm_debug_exit_arch arch; } debug; /* KVM_EXIT_MMIO */ struct { @@ -217,21 +218,6 @@ struct kvm_interrupt { __u32 irq; }; -struct kvm_breakpoint { - __u32 enabled; - __u32 padding; - __u64 address; -}; - -/* for 
KVM_DEBUG_GUEST */ -struct kvm_debug_guest { - /* int */ - __u32 enabled; - __u32 pad; - struct kvm_breakpoint breakpoints[4]; - __u32 singlestep; -}; - /* for KVM_GET_DIRTY_LOG */ struct kvm_dirty_log { __u32 slot; @@ -292,6 +278,17 @@ struct kvm_s390_interrupt { __u64 parm64; }; +/* for KVM_SET_GUEST_DEBUG */ + +#define KVM_GUESTDBG_ENABLE 0x00000001 +#define KVM_GUESTDBG_SINGLESTEP 0x00000002 + +struct kvm_guest_debug { + __u32 control; + __u32 pad; + struct kvm_guest_debug_arch arch; +}; + #define KVM_TRC_SHIFT 16 /* * kvm trace categories @@ -396,6 +393,7 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif +#define KVM_CAP_SET_GUEST_DEBUG 23 /* * ioctls for VM fds @@ -440,7 +438,8 @@ struct kvm_trace_rec { #define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) #define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) #define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) -#define KVM_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest) +/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */ +#define KVM_DEBUG_GUEST __KVM_DEPRECATED_DEBUG_GUEST #define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) #define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) #define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) @@ -469,6 +468,26 @@ struct kvm_trace_rec { #define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) /* Available with KVM_CAP_NMI */ #define KVM_NMI _IO(KVMIO, 0x9a) +/* Available with KVM_CAP_SET_GUEST_DEBUG */ +#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) + +/* + * Deprecated interfaces + */ +struct kvm_breakpoint { + __u32 enabled; + __u32 padding; + __u64 address; +}; + +struct kvm_debug_guest { + __u32 enabled; + __u32 pad; + struct kvm_breakpoint breakpoints[4]; + __u32 singlestep; +}; + +#define __KVM_DEPRECATED_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest) #define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) #define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bf6f703642fc..e92212f970db 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -73,7 +73,7 @@ struct kvm_vcpu { struct kvm_run *run; int guest_mode; unsigned long requests; - struct kvm_guest_debug guest_debug; + unsigned long guest_debug; int fpu_active; int guest_fpu_loaded; wait_queue_head_t wq; @@ -255,8 +255,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg); +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); int kvm_arch_init(void *opaque); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 29a667ce35b0..f83ef9c7e89b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1755,13 +1755,13 @@ out_free2: r = 0; break; } - case KVM_DEBUG_GUEST: { - struct kvm_debug_guest dbg; + case KVM_SET_GUEST_DEBUG: { + struct kvm_guest_debug dbg; r = -EFAULT; if (copy_from_user(&dbg, argp, sizeof dbg)) goto out; - r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg); + r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); if (r) goto out; r = 0; -- cgit v1.2.3-71-gd317 From e9a999fe1feaddb71bffbacbbd68e0da8ca8b50b Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: 
Thu, 18 Dec 2008 12:17:51 +0100 Subject: KVM: ia64: stack get/restore patch Implement KVM_IA64_VCPU_[GS]ET_STACK ioctl calls. This is required for live migrations. Patch is based on previous implementation that was part of old GET/SET_REGS ioctl calls. Signed-off-by: Jes Sorensen Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm.h | 7 +++ arch/ia64/include/asm/kvm_host.h | 6 ++- arch/ia64/kvm/kvm-ia64.c | 92 ++++++++++++++++++++++++++++++++++++++-- include/linux/kvm.h | 3 ++ 4 files changed, 104 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index be3fdb891214..b5145784233e 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -214,6 +214,13 @@ struct kvm_sregs { struct kvm_fpu { }; +#define KVM_IA64_VCPU_STACK_SHIFT 16 +#define KVM_IA64_VCPU_STACK_SIZE (1UL << KVM_IA64_VCPU_STACK_SHIFT) + +struct kvm_ia64_vcpu_stack { + unsigned char stack[KVM_IA64_VCPU_STACK_SIZE]; +}; + struct kvm_debug_exit_arch { }; diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 348663661659..7da0c0963226 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -112,7 +112,11 @@ #define VCPU_STRUCT_SHIFT 16 #define VCPU_STRUCT_SIZE (__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT) -#define KVM_STK_OFFSET VCPU_STRUCT_SIZE +/* + * This must match KVM_IA64_VCPU_STACK_{SHIFT,SIZE} arch/ia64/include/asm/kvm.h + */ +#define KVM_STK_SHIFT 16 +#define KVM_STK_OFFSET (__IA64_UL_CONST(1)<< KVM_STK_SHIFT) #define KVM_VM_STRUCT_SHIFT 19 #define KVM_VM_STRUCT_SIZE (__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index de47467a0e61..1477f91617a5 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1421,6 +1421,23 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } +int kvm_arch_vcpu_ioctl_get_stack(struct kvm_vcpu *vcpu, + struct kvm_ia64_vcpu_stack *stack) +{ + memcpy(stack, vcpu, sizeof(struct kvm_ia64_vcpu_stack)); + return 0; +} + +int kvm_arch_vcpu_ioctl_set_stack(struct kvm_vcpu *vcpu, + struct kvm_ia64_vcpu_stack *stack) +{ + memcpy(vcpu + 1, &stack->stack[0] + sizeof(struct kvm_vcpu), + sizeof(struct kvm_ia64_vcpu_stack) - sizeof(struct kvm_vcpu)); + + vcpu->arch.exit_data = ((struct kvm_vcpu *)stack)->arch.exit_data; + return 0; +} + void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { @@ -1430,9 +1447,78 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) + unsigned int ioctl, unsigned long arg) { - return -EINVAL; + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + struct kvm_ia64_vcpu_stack *stack = NULL; + long r; + + switch (ioctl) { + case KVM_IA64_VCPU_GET_STACK: { + struct kvm_ia64_vcpu_stack __user *user_stack; + void __user *first_p = argp; + + r = -EFAULT; + if (copy_from_user(&user_stack, first_p, sizeof(void *))) + goto out; + + if (!access_ok(VERIFY_WRITE, user_stack, + sizeof(struct kvm_ia64_vcpu_stack))) { + printk(KERN_INFO "KVM_IA64_VCPU_GET_STACK: " + "Illegal user destination address for stack\n"); + goto out; + } + stack = kzalloc(sizeof(struct kvm_ia64_vcpu_stack), GFP_KERNEL); + if (!stack) { + r = -ENOMEM; + goto out; + } + + r = kvm_arch_vcpu_ioctl_get_stack(vcpu, stack); + if (r) + goto out; + + if (copy_to_user(user_stack, stack, + sizeof(struct kvm_ia64_vcpu_stack))) + goto out; + + 
break; + } + case KVM_IA64_VCPU_SET_STACK: { + struct kvm_ia64_vcpu_stack __user *user_stack; + void __user *first_p = argp; + + r = -EFAULT; + if (copy_from_user(&user_stack, first_p, sizeof(void *))) + goto out; + + if (!access_ok(VERIFY_READ, user_stack, + sizeof(struct kvm_ia64_vcpu_stack))) { + printk(KERN_INFO "KVM_IA64_VCPU_SET_STACK: " + "Illegal user address for stack\n"); + goto out; + } + stack = kmalloc(sizeof(struct kvm_ia64_vcpu_stack), GFP_KERNEL); + if (!stack) { + r = -ENOMEM; + goto out; + } + if (copy_from_user(stack, user_stack, + sizeof(struct kvm_ia64_vcpu_stack))) + goto out; + + r = kvm_arch_vcpu_ioctl_set_stack(vcpu, stack); + break; + } + + default: + r = -EINVAL; + } + +out: + kfree(stack); + return r; } int kvm_arch_set_memory_region(struct kvm *kvm, @@ -1472,7 +1558,7 @@ void kvm_arch_flush_shadow(struct kvm *kvm) } long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) + unsigned int ioctl, unsigned long arg) { return -EINVAL; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 429a2ce202f9..28582fcd3f79 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -489,6 +489,9 @@ struct kvm_debug_guest { #define __KVM_DEPRECATED_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest) +#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) +#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) + #define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) #define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) #define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) -- cgit v1.2.3-71-gd317 From 971cc3dcbc0e020b82f568e61a47b72be03307dd Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 19 Dec 2008 18:13:54 +0100 Subject: KVM: Advertise guest debug capability per-arch Limit KVM_CAP_SET_GUEST_DEBUG only to those archs (currently x86) that support it. This simplifies user space stub implementations. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- include/linux/kvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 28582fcd3f79..11e3e6197c83 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -393,7 +393,9 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif +#if defined(CONFIG_X86) #define KVM_CAP_SET_GUEST_DEBUG 23 +#endif /* * ioctls for VM fds -- cgit v1.2.3-71-gd317 From 0f346074403bc109f9569f14b45cb09e83729032 Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Mon, 29 Dec 2008 01:42:20 +0200 Subject: KVM: remove the vmap usage vmap() on guest pages hides those pages from the Linux mm for an extended (userspace determined) amount of time. Get rid of it. 
Signed-off-by: Izik Eidus Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 62 ++++++++++------------------------------------- include/linux/kvm_types.h | 3 +-- 2 files changed, 14 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67f91764e99b..2a4f3a697343 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2371,40 +2371,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) - if (vcpu->arch.pio.guest_pages[i]) { - kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); - vcpu->arch.pio.guest_pages[i] = NULL; - } -} - static int pio_copy_data(struct kvm_vcpu *vcpu) { void *p = vcpu->arch.pio_data; - void *q; + gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; - int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; + int ret; - q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, - PAGE_KERNEL); - if (!q) { - free_pio_guest_pages(vcpu); - return -ENOMEM; - } - q += vcpu->arch.pio.guest_page_offset; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - memcpy(q, p, bytes); + ret = kvm_write_guest_virt(q, p, bytes, vcpu); else - memcpy(p, q, bytes); - q -= vcpu->arch.pio.guest_page_offset; - vunmap(q); - free_pio_guest_pages(vcpu); - return 0; + ret = kvm_read_guest_virt(q, p, bytes, vcpu); + return ret; } int complete_pio(struct kvm_vcpu *vcpu) @@ -2515,7 +2494,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 0; vcpu->arch.pio.down = 0; - vcpu->arch.pio.guest_page_offset = 0; vcpu->arch.pio.rep = 0; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2543,9 +2521,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, gva_t address, int rep, unsigned port) { unsigned now, in_page; - int i, ret = 0; - int nr_pages = 1; - struct page *page; + int ret = 0; struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; @@ -2557,7 +2533,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 1; vcpu->arch.pio.down = down; - vcpu->arch.pio.guest_page_offset = offset_in_page(address); vcpu->arch.pio.rep = rep; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2577,15 +2552,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, else in_page = offset_in_page(address) + size; now = min(count, (unsigned long)in_page / size); - if (!now) { - /* - * String I/O straddles page boundary. Pin two guest pages - * so that we satisfy atomicity constraints. Do just one - * transaction to avoid complexity. - */ - nr_pages = 2; + if (!now) now = 1; - } if (down) { /* * String I/O in reverse. Yuck. Kill the guest, fix later. 
@@ -2600,15 +2568,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) kvm_x86_ops->skip_emulated_instruction(vcpu); - for (i = 0; i < nr_pages; ++i) { - page = gva_to_page(vcpu, address + i * PAGE_SIZE); - vcpu->arch.pio.guest_pages[i] = page; - if (!page) { - kvm_inject_gp(vcpu, 0); - free_pio_guest_pages(vcpu); - return 1; - } - } + vcpu->arch.pio.guest_gva = address; pio_dev = vcpu_find_pio_dev(vcpu, port, vcpu->arch.pio.cur_count, @@ -2616,7 +2576,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret >= 0 && pio_dev) { + if (ret == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (ret == 0 && pio_dev) { pio_string_write(pio_dev, vcpu); complete_pio(vcpu); if (vcpu->arch.pio.count == 0) diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 9b6f395c9625..5f4a18cae26b 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -43,8 +43,7 @@ typedef hfn_t pfn_t; struct kvm_pio_request { unsigned long count; int cur_count; - struct page *guest_pages[2]; - unsigned guest_page_offset; + gva_t guest_gva; int in; int port; int size; -- cgit v1.2.3-71-gd317 From 52d939a0bf44081bc9f69b4fbdc9e7f416df27c7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 30 Dec 2008 15:55:06 -0200 Subject: KVM: PIT: provide an option to disable interrupt reinjection Certain clocks (such as TSC) in older 2.6 guests overaccount for lost ticks, causing severe time drift. Interrupt reinjection magnifies the problem. Provide an option to disable it. [avi: allow room for expansion in case we want to disable reinjection of other timers] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 5 +++++ arch/x86/kvm/i8254.c | 4 ++++ arch/x86/kvm/i8254.h | 1 + arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ include/linux/kvm.h | 4 ++++ 5 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 32eb96c7ca27..54bcf2281526 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -233,4 +233,9 @@ struct kvm_guest_debug_arch { struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; + +struct kvm_reinject_control { + __u8 pit_reinject; + __u8 reserved[31]; +}; #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 72bd275a9b5c..528daadeba49 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -201,6 +201,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) if (!atomic_inc_and_test(&pt->pending)) set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); + if (!pt->reinject) + atomic_set(&pt->pending, 1); + if (vcpu0 && waitqueue_active(&vcpu0->wq)) wake_up_interruptible(&vcpu0->wq); @@ -580,6 +583,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + pit_state->pit_timer.reinject = true; mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 4178022b97aa..76959c4b500e 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -9,6 +9,7 @@ struct kvm_kpit_timer { s64 period; /* unit: ns */ s64 scheduled; atomic_t pending; + bool reinject; }; struct 
kvm_kpit_channel_state { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c3fbe8c55c13..a1f14611f4b9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -993,6 +993,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: + case KVM_CAP_REINJECT_CONTROL: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1728,6 +1729,15 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) return r; } +static int kvm_vm_ioctl_reinject(struct kvm *kvm, + struct kvm_reinject_control *control) +{ + if (!kvm->arch.vpit) + return -ENXIO; + kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + return 0; +} + /* * Get (and clear) the dirty memory log for a memory slot. */ @@ -1925,6 +1935,17 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_REINJECT_CONTROL: { + struct kvm_reinject_control control; + r = -EFAULT; + if (copy_from_user(&control, argp, sizeof(control))) + goto out; + r = kvm_vm_ioctl_reinject(kvm, &control); + if (r) + goto out; + r = 0; + break; + } default: ; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 11e3e6197c83..ae7a12c77427 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -396,6 +396,9 @@ struct kvm_trace_rec { #if defined(CONFIG_X86) #define KVM_CAP_SET_GUEST_DEBUG 23 #endif +#if defined(CONFIG_X86) +#define KVM_CAP_REINJECT_CONTROL 24 +#endif /* * ioctls for VM fds @@ -429,6 +432,7 @@ struct kvm_trace_rec { struct kvm_assigned_pci_dev) #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) +#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) /* * ioctls for vcpu fds -- cgit v1.2.3-71-gd317 From 1c08364c3565242f1e1bd585bc2ce458967941af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 12:39:07 +0200 Subject: KVM: Move struct kvm_pio_request into x86 kvm_host.h This is an x86 specific stucture and has no business living in common code. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 12 ++++++++++++ include/linux/kvm_types.h | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b74576aec19a..863ea73431ad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -227,6 +227,18 @@ struct kvm_pv_mmu_op_buffer { char buf[512] __aligned(sizeof(long)); }; +struct kvm_pio_request { + unsigned long count; + int cur_count; + gva_t guest_gva; + int in; + int port; + int size; + int string; + int down; + int rep; +}; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). 
The kvm_mmu structure abstracts the details of the current mmu diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 5f4a18cae26b..2b8318c83e53 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -40,16 +40,4 @@ typedef unsigned long hfn_t; typedef hfn_t pfn_t; -struct kvm_pio_request { - unsigned long count; - int cur_count; - gva_t guest_gva; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - #endif /* __KVM_TYPES_H__ */ -- cgit v1.2.3-71-gd317 From 67346440e83d2a2f2e9801f370b6240317c7d9bd Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 6 Jan 2009 10:03:01 +0800 Subject: KVM: Remove duplicated prototype of kvm_arch_destroy_vm Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e92212f970db..3cf0ede3fd73 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -237,7 +237,6 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, int user_alloc); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); -void kvm_arch_destroy_vm(struct kvm *kvm); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); -- cgit v1.2.3-71-gd317 From 17071fe74fe0fbfdb03cd9b82f2490447cf1f986 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 6 Jan 2009 16:25:11 +0800 Subject: KVM: Add support to disable MSI for assigned device MSI is always enabled by default for msi2intx=1. But if msi2intx=0, we have to disable MSI if guest require to do so. The patch also discard unnecessary msi2intx judgment if guest want to update MSI state. Notice KVM_DEV_IRQ_ASSIGN_MSI_ACTION is a mask which should cover all MSI related operations, though we only got one for now. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm.h | 1 + virt/kvm/kvm_main.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ae7a12c77427..73f348066866 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -550,6 +550,7 @@ struct kvm_assigned_irq { #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION KVM_DEV_IRQ_ASSIGN_ENABLE_MSI #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 << 0) #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f83ef9c7e89b..04401e17c758 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -343,6 +343,14 @@ static int assigned_device_update_msi(struct kvm *kvm, adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI; adev->guest_irq = airq->guest_irq; adev->ack_notifier.gsi = airq->guest_irq; + } else { + /* + * Guest require to disable device MSI, we disable MSI and + * re-enable INTx by default again. Notice it's only for + * non-msi2intx. 
+ */ + assigned_device_update_intx(kvm, adev, airq); + return 0; } if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) @@ -379,6 +387,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, { int r = 0; struct kvm_assigned_dev_kernel *match; + u32 current_flags = 0, changed_flags; mutex_lock(&kvm->lock); @@ -416,8 +425,13 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, } } - if ((!msi2intx && - (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) || + if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) && + (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI)) + current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI; + + changed_flags = assigned_irq->flags ^ current_flags; + + if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) || (msi2intx && match->dev->msi_enabled)) { #ifdef CONFIG_X86 r = assigned_device_update_msi(kvm, match, assigned_irq); -- cgit v1.2.3-71-gd317 From 75858a84a6207f5e60196f6bbd18fde4250e5759 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 17:10:50 +0200 Subject: KVM: Interrupt mask notifiers for ioapic Allow clients to request notifications when the guest masks or unmasks a particular irq line. This complements irq ack notifications, as the guest will not ack an irq line that is masked. Currently implemented for the ioapic only. Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 17 +++++++++++++++++ virt/kvm/ioapic.c | 6 ++++++ virt/kvm/irq_comm.c | 24 ++++++++++++++++++++++++ virt/kvm/kvm_main.c | 3 +++ 4 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3cf0ede3fd73..99963f36a6db 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -127,6 +127,10 @@ struct kvm { struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; #endif +#ifdef CONFIG_HAVE_KVM_IRQCHIP + struct hlist_head mask_notifier_list; +#endif + #ifdef KVM_ARCH_WANT_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; @@ -320,6 +324,19 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; }; + +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; +}; + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); + void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); void kvm_register_irq_ack_notifier(struct kvm *kvm, diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 23b81cf242af..e85a2bcd2db1 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -101,6 +101,7 @@ static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; + bool mask_before, mask_after; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -120,6 +121,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) ioapic_debug("change redir index %x val %x\n", index, val); if (index >= IOAPIC_NUM_PINS) return; + mask_before = ioapic->redirtbl[index].fields.mask; if (ioapic->ioregsel & 1) { ioapic->redirtbl[index].bits &= 0xffffffff; ioapic->redirtbl[index].bits |= (u64) val << 32; @@ -128,6 +130,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) 
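For illustration only (not part of this patch), an in-kernel user of the new hooks might embed and register a notifier as sketched below; the example_* names are invented, while struct kvm_irq_mask_notifier, kvm_register_irq_mask_notifier() and kvm_unregister_irq_mask_notifier() are the interfaces added here:

  #include <linux/kvm_host.h>

  /* Hypothetical in-kernel device that wants to know when its line is masked. */
  struct example_timer {
          struct kvm *kvm;
          struct kvm_irq_mask_notifier mask_notifier;
          bool irq_masked;
  };

  static void example_mask_notify(struct kvm_irq_mask_notifier *kimn, bool masked)
  {
          struct example_timer *t =
                  container_of(kimn, struct example_timer, mask_notifier);

          /* Remember the line state; a real device might gate its timer here. */
          t->irq_masked = masked;
  }

  static void example_timer_init(struct example_timer *t, struct kvm *kvm, int irq)
  {
          t->kvm = kvm;
          t->mask_notifier.func = example_mask_notify;
          /* kvm_register_irq_mask_notifier() records the irq in the notifier. */
          kvm_register_irq_mask_notifier(kvm, irq, &t->mask_notifier);
  }

Teardown would call kvm_unregister_irq_mask_notifier() with the same irq and notifier, mirroring how ack notifiers are used.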
ioapic->redirtbl[index].bits |= (u32) val; ioapic->redirtbl[index].fields.remote_irr = 0; } + mask_after = ioapic->redirtbl[index].fields.mask; + if (mask_before != mask_after) + kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); if (ioapic->irr & (1 << index)) ioapic_service(ioapic, index); break; @@ -426,3 +431,4 @@ int kvm_ioapic_init(struct kvm *kvm) kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); return 0; } + diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index aa5d1e5c497e..5162a411e4d2 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -99,3 +99,27 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) clear_bit(irq_source_id, &kvm->arch.irq_states[i]); clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); } + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + kimn->irq = irq; + hlist_add_head(&kimn->link, &kvm->mask_notifier_list); +} + +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + hlist_del(&kimn->link); +} + +void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) +{ + struct kvm_irq_mask_notifier *kimn; + struct hlist_node *n; + + hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link) + if (kimn->irq == irq) + kimn->func(kimn, mask); +} + diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 04401e17c758..786a3ae373b0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -842,6 +842,9 @@ static struct kvm *kvm_create_vm(void) if (IS_ERR(kvm)) goto out; +#ifdef CONFIG_HAVE_KVM_IRQCHIP + INIT_HLIST_HEAD(&kvm->mask_notifier_list); +#endif #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET page = alloc_page(GFP_KERNEL | __GFP_ZERO); -- cgit v1.2.3-71-gd317 From 399ec807ddc38ecccf8c06dbde04531cbdc63e11 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 19 Nov 2008 13:58:46 +0200 Subject: KVM: Userspace controlled irq routing Currently KVM has a static routing from GSI numbers to interrupts (namely, 0-15 are mapped 1:1 to both PIC and IOAPIC, and 16:23 are mapped 1:1 to the IOAPIC). This is insufficient for several reasons: - HPET requires non 1:1 mapping for the timer interrupt - MSIs need a new method to assign interrupt numbers and dispatch them - ACPI APIC mode needs to be able to reassign the PCI LINK interrupts to the ioapics This patch implements an interrupt routing table (as a linked list, but this can be easily changed) and a userspace interface to replace the table. The routing table is initialized according to the current hardwired mapping. 
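For illustration only (not part of this patch), a userspace VMM would build the complete table and hand it to the new KVM_SET_GSI_ROUTING ioctl on the VM fd; the GSI and pin numbers below are arbitrary placeholders:

  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /*
   * Route GSI 30 to IOAPIC pin 2. The ioctl replaces the whole table, so a
   * real caller would also pass every default 1:1 route it wants to keep.
   */
  static int set_example_route(int vm_fd)
  {
          struct kvm_irq_routing *table;
          int r;

          table = calloc(1, sizeof(*table) + sizeof(struct kvm_irq_routing_entry));
          if (!table)
                  return -1;

          table->nr = 1;
          table->entries[0].gsi = 30;
          table->entries[0].type = KVM_IRQ_ROUTING_IRQCHIP;
          table->entries[0].u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC;
          table->entries[0].u.irqchip.pin = 2;

          r = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
          free(table);
          return r;
  }

Availability of the ioctl is signalled by KVM_CAP_IRQ_ROUTING.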
Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 5 ++ arch/x86/kvm/x86.c | 6 ++ include/linux/kvm.h | 33 ++++++++++ include/linux/kvm_host.h | 31 +++++++++ virt/kvm/irq_comm.c | 168 +++++++++++++++++++++++++++++++++++++++++++++-- virt/kvm/kvm_main.c | 36 ++++++++++ 6 files changed, 275 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 1477f91617a5..dbf527a57341 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -919,6 +919,11 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) goto out; + r = kvm_setup_default_irq_routing(kvm); + if (r) { + kfree(kvm->arch.vioapic); + goto out; + } break; case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 141a0166e51c..32e3a7ec6ad2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1835,6 +1835,12 @@ long kvm_arch_vm_ioctl(struct file *filp, } } else goto out; + r = kvm_setup_default_irq_routing(kvm); + if (r) { + kfree(kvm->arch.vpic); + kfree(kvm->arch.vioapic); + goto out; + } break; case KVM_CREATE_PIT: mutex_lock(&kvm->lock); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 73f348066866..7a5d73a8d4fa 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -398,6 +398,38 @@ struct kvm_trace_rec { #endif #if defined(CONFIG_X86) #define KVM_CAP_REINJECT_CONTROL 24 +#endif +#if defined(CONFIG_X86)||defined(CONFIG_IA64) +#define KVM_CAP_IRQ_ROUTING 25 +#endif + +#ifdef KVM_CAP_IRQ_ROUTING + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +/* gsi routing entry types */ +#define KVM_IRQ_ROUTING_IRQCHIP 1 + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + __u32 pad[8]; + } u; +}; + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + #endif /* @@ -430,6 +462,7 @@ struct kvm_trace_rec { _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) #define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ struct kvm_assigned_pci_dev) +#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 99963f36a6db..ce285e01bd57 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -107,6 +107,19 @@ struct kvm_memory_slot { int user_alloc; }; +struct kvm_kernel_irq_routing_entry { + u32 gsi; + void (*set)(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level); + union { + struct { + unsigned irqchip; + unsigned pin; + } irqchip; + }; + struct list_head link; +}; + struct kvm { struct mutex lock; /* protects the vcpus array and APIC accesses */ spinlock_t mmu_lock; @@ -128,6 +141,7 @@ struct kvm { #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP + struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */ struct hlist_head mask_notifier_list; #endif @@ -480,4 +494,21 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se } #endif +#ifdef CONFIG_HAVE_KVM_IRQCHIP + +#define KVM_MAX_IRQ_ROUTES 1024 + +int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *entries, + unsigned nr, + unsigned flags); +void kvm_free_irq_routing(struct kvm *kvm); + +#else + +static inline void 
kvm_free_irq_routing(struct kvm *kvm) {} + +#endif + #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 5162a411e4d2..a797fa5e6420 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -24,9 +24,24 @@ #include "ioapic.h" +static void kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) +{ +#ifdef CONFIG_X86 + kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level); +#endif +} + +static void kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) +{ + kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); +} + /* This should be called with the kvm->lock mutex held */ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) { + struct kvm_kernel_irq_routing_entry *e; unsigned long *irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; /* Logical OR for level trig interrupt */ @@ -39,10 +54,9 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ - kvm_ioapic_set_irq(kvm->arch.vioapic, irq, !!(*irq_state)); -#ifdef CONFIG_X86 - kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state)); -#endif + list_for_each_entry(e, &kvm->irq_routing, link) + if (e->gsi == irq) + e->set(e, kvm, !!(*irq_state)); } void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) @@ -123,3 +137,149 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) kimn->func(kimn, mask); } +static void __kvm_free_irq_routing(struct list_head *irq_routing) +{ + struct kvm_kernel_irq_routing_entry *e, *n; + + list_for_each_entry_safe(e, n, irq_routing, link) + kfree(e); +} + +void kvm_free_irq_routing(struct kvm *kvm) +{ + __kvm_free_irq_routing(&kvm->irq_routing); +} + +int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + int delta; + + e->gsi = ue->gsi; + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + delta = 0; + switch (ue->u.irqchip.irqchip) { + case KVM_IRQCHIP_PIC_MASTER: + e->set = kvm_set_pic_irq; + break; + case KVM_IRQCHIP_PIC_SLAVE: + e->set = kvm_set_pic_irq; + delta = 8; + break; + case KVM_IRQCHIP_IOAPIC: + e->set = kvm_set_ioapic_irq; + break; + default: + goto out; + } + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin + delta; + break; + default: + goto out; + } + r = 0; +out: + return r; +} + + +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *ue, + unsigned nr, + unsigned flags) +{ + struct list_head irq_list = LIST_HEAD_INIT(irq_list); + struct list_head tmp = LIST_HEAD_INIT(tmp); + struct kvm_kernel_irq_routing_entry *e = NULL; + unsigned i; + int r; + + for (i = 0; i < nr; ++i) { + r = -EINVAL; + if (ue->gsi >= KVM_MAX_IRQ_ROUTES) + goto out; + if (ue->flags) + goto out; + r = -ENOMEM; + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) + goto out; + r = setup_routing_entry(e, ue); + if (r) + goto out; + ++ue; + list_add(&e->link, &irq_list); + e = NULL; + } + + mutex_lock(&kvm->lock); + list_splice(&kvm->irq_routing, &tmp); + INIT_LIST_HEAD(&kvm->irq_routing); + list_splice(&irq_list, &kvm->irq_routing); + INIT_LIST_HEAD(&irq_list); + list_splice(&tmp, &irq_list); + mutex_unlock(&kvm->lock); + + r = 0; + +out: + kfree(e); + __kvm_free_irq_routing(&irq_list); + return r; +} + +#define IOAPIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) } 
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) + +#ifdef CONFIG_X86 +#define SELECT_PIC(irq) \ + ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) +# define PIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 } +# define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) +#else +# define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq) +#endif + +static const struct kvm_irq_routing_entry default_routing[] = { + ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), + ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), + ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), + ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), + ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), + ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), + ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), + ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), + ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), + ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), + ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), + ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), +#ifdef CONFIG_IA64 + ROUTING_ENTRY1(24), ROUTING_ENTRY1(25), + ROUTING_ENTRY1(26), ROUTING_ENTRY1(27), + ROUTING_ENTRY1(28), ROUTING_ENTRY1(29), + ROUTING_ENTRY1(30), ROUTING_ENTRY1(31), + ROUTING_ENTRY1(32), ROUTING_ENTRY1(33), + ROUTING_ENTRY1(34), ROUTING_ENTRY1(35), + ROUTING_ENTRY1(36), ROUTING_ENTRY1(37), + ROUTING_ENTRY1(38), ROUTING_ENTRY1(39), + ROUTING_ENTRY1(40), ROUTING_ENTRY1(41), + ROUTING_ENTRY1(42), ROUTING_ENTRY1(43), + ROUTING_ENTRY1(44), ROUTING_ENTRY1(45), + ROUTING_ENTRY1(46), ROUTING_ENTRY1(47), +#endif +}; + +int kvm_setup_default_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, default_routing, + ARRAY_SIZE(default_routing), 0); +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 786a3ae373b0..c65484b471c6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -843,6 +843,7 @@ static struct kvm *kvm_create_vm(void) if (IS_ERR(kvm)) goto out; #ifdef CONFIG_HAVE_KVM_IRQCHIP + INIT_LIST_HEAD(&kvm->irq_routing); INIT_HLIST_HEAD(&kvm->mask_notifier_list); #endif @@ -926,6 +927,7 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); + kvm_free_irq_routing(kvm); kvm_io_bus_destroy(&kvm->pio_bus); kvm_io_bus_destroy(&kvm->mmio_bus); #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET @@ -1945,6 +1947,36 @@ static long kvm_vm_ioctl(struct file *filp, goto out; break; } +#endif +#ifdef KVM_CAP_IRQ_ROUTING + case KVM_SET_GSI_ROUTING: { + struct kvm_irq_routing routing; + struct kvm_irq_routing __user *urouting; + struct kvm_irq_routing_entry *entries; + + r = -EFAULT; + if (copy_from_user(&routing, argp, sizeof(routing))) + goto out; + r = -EINVAL; + if (routing.nr >= KVM_MAX_IRQ_ROUTES) + goto out; + if (routing.flags) + goto out; + r = -ENOMEM; + entries = vmalloc(routing.nr * sizeof(*entries)); + if (!entries) + goto out; + r = -EFAULT; + urouting = argp; + if (copy_from_user(entries, urouting->entries, + routing.nr * sizeof(*entries))) + goto out_free_irq_routing; + r = kvm_set_irq_routing(kvm, entries, routing.nr, + routing.flags); + out_free_irq_routing: + vfree(entries); + break; + } #endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); @@ -2012,6 +2044,10 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_USER_MEMORY: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: return 1; +#ifdef CONFIG_HAVE_KVM_IRQCHIP + case KVM_CAP_IRQ_ROUTING: + return 1; +#endif default: break; } -- cgit v1.2.3-71-gd317 From 91b2ae773d3b168b763237fac33f75b13d891f20 
Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 19 Jan 2009 14:57:52 +0200 Subject: KVM: Avoid using CONFIG_ in userspace visible headers Kconfig symbols are not available in userspace, and are not stripped by headers-install. Avoid their use by adding #defines in to suit each architecture. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + include/linux/kvm.h | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 54bcf2281526..dc3f6cf11704 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -15,6 +15,7 @@ #define __KVM_HAVE_DEVICE_ASSIGNMENT #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI +#define __KVM_HAVE_GUEST_DEBUG /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 7a5d73a8d4fa..869462ca7625 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -393,13 +393,13 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_GUEST_DEBUG #define KVM_CAP_SET_GUEST_DEBUG 23 #endif -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_PIT #define KVM_CAP_REINJECT_CONTROL 24 #endif -#if defined(CONFIG_X86)||defined(CONFIG_IA64) +#ifdef __KVM_HAVE_IOAPIC #define KVM_CAP_IRQ_ROUTING 25 #endif -- cgit v1.2.3-71-gd317 From 44882eed2ebe7f75f8cdae5671ab1d6e0fa40dbc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Jan 2009 15:12:38 -0200 Subject: KVM: make irq ack notifications aware of routing table IRQ ack notifications assume an identity mapping between pin->gsi, which might not be the case with, for example, HPET. Translate before acking. Signed-off-by: Marcelo Tosatti Acked-by: Gleb Natapov --- arch/x86/kvm/i8259.c | 5 +++-- arch/x86/kvm/irq.h | 2 ++ include/linux/kvm_host.h | 2 +- virt/kvm/ioapic.c | 10 +++++----- virt/kvm/irq_comm.c | 13 ++++++++++--- 5 files changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 179dcb0103fd..93160375c841 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -49,7 +49,8 @@ static void pic_unlock(struct kvm_pic *s) spin_unlock(&s->lock); while (acks) { - kvm_notify_acked_irq(kvm, __ffs(acks)); + kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), + __ffs(acks)); acks &= acks - 1; } @@ -232,7 +233,7 @@ int kvm_pic_read_irq(struct kvm *kvm) } pic_update_irq(s); pic_unlock(s); - kvm_notify_acked_irq(kvm, irq); + kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq); return intno; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 82579ee538d0..9f593188129e 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -32,6 +32,8 @@ #include "lapic.h" #define PIC_NUM_PINS 16 +#define SELECT_PIC(irq) \ + ((irq) < 8 ? 
KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) struct kvm; struct kvm_vcpu; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ce285e01bd57..c03a0a9a8584 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -352,7 +352,7 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); -void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index e85a2bcd2db1..1c986ac59ad6 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -293,20 +293,20 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) } } -static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi, +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin, int trigger_mode) { union ioapic_redir_entry *ent; - ent = &ioapic->redirtbl[gsi]; + ent = &ioapic->redirtbl[pin]; - kvm_notify_acked_irq(ioapic->kvm, gsi); + kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin); if (trigger_mode == IOAPIC_LEVEL_TRIG) { ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); ent->fields.remote_irr = 0; - if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) - ioapic_service(ioapic, gsi); + if (!ent->fields.mask && (ioapic->irr & (1 << pin))) + ioapic_service(ioapic, pin); } } diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a797fa5e6420..7aa5086c8622 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -59,10 +59,19 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) e->set(e, kvm, !!(*irq_state)); } -void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) { + struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_ack_notifier *kian; struct hlist_node *n; + unsigned gsi = pin; + + list_for_each_entry(e, &kvm->irq_routing, link) + if (e->irqchip.irqchip == irqchip && + e->irqchip.pin == pin) { + gsi = e->gsi; + break; + } hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) if (kian->gsi == gsi) @@ -237,8 +246,6 @@ out: #define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) #ifdef CONFIG_X86 -#define SELECT_PIC(irq) \ - ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) # define PIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 } -- cgit v1.2.3-71-gd317 From 79950e1073150909619b7c0f9a39a2fea83a42d8 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 10 Feb 2009 13:57:06 +0800 Subject: KVM: Use irq routing API for MSI Merge MSI userspace interface with IRQ routing table. Notice the API have been changed, and using IRQ routing table would be the only interface kvm-userspace supported. 
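For illustration only (not part of this patch), userspace now describes a guest MSI message as one more KVM_SET_GSI_ROUTING entry; the gsi and the address/data values are whatever the VMM picked up from the guest, shown here as parameters:

  #include <string.h>
  #include <linux/kvm.h>

  /*
   * Fill one MSI route: 'gsi' is the pseudo-GSI userspace assigns to this
   * interrupt; addr_lo/addr_hi/data are the MSI message programmed by the guest.
   */
  static void fill_msi_route(struct kvm_irq_routing_entry *e, __u32 gsi,
                             __u32 addr_lo, __u32 addr_hi, __u32 data)
  {
          memset(e, 0, sizeof(*e));
          e->gsi = gsi;
          e->type = KVM_IRQ_ROUTING_MSI;
          e->u.msi.address_lo = addr_lo;
          e->u.msi.address_hi = addr_hi;
          e->u.msi.data = data;
  }

Once such an entry is installed, raising that GSI through kvm_set_irq() (as the assigned-device path now does) is delivered by kvm_set_msi() in the diff below.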
Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm.h | 9 ++++++ include/linux/kvm_host.h | 2 +- virt/kvm/irq_comm.c | 78 +++++++++++++++++++++++++++++++++++++++++++----- virt/kvm/kvm_main.c | 70 ++++--------------------------------------- 4 files changed, 86 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 869462ca7625..2163b3dd36e7 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -410,8 +410,16 @@ struct kvm_irq_routing_irqchip { __u32 pin; }; +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 pad; +}; + /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 +#define KVM_IRQ_ROUTING_MSI 2 struct kvm_irq_routing_entry { __u32 gsi; @@ -420,6 +428,7 @@ struct kvm_irq_routing_entry { __u32 pad; union { struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; __u32 pad[8]; } u; }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c03a0a9a8584..339eda3ca6ee 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -116,6 +116,7 @@ struct kvm_kernel_irq_routing_entry { unsigned irqchip; unsigned pin; } irqchip; + struct msi_msg msi; }; struct list_head link; }; @@ -327,7 +328,6 @@ struct kvm_assigned_dev_kernel { int host_irq; bool host_irq_disabled; int guest_irq; - struct msi_msg guest_msi; #define KVM_ASSIGNED_DEV_GUEST_INTX (1 << 0) #define KVM_ASSIGNED_DEV_GUEST_MSI (1 << 1) #define KVM_ASSIGNED_DEV_HOST_INTX (1 << 8) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 7aa5086c8622..6bc7439eff6e 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -20,6 +20,11 @@ */ #include + +#ifdef CONFIG_X86 +#include +#endif + #include "irq.h" #include "ioapic.h" @@ -38,17 +43,70 @@ static void kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); } +static void kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) +{ + int vcpu_id; + struct kvm_vcpu *vcpu; + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + int dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) + >> MSI_ADDR_DEST_ID_SHIFT; + int vector = (e->msi.data & MSI_DATA_VECTOR_MASK) + >> MSI_DATA_VECTOR_SHIFT; + int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, + (unsigned long *)&e->msi.address_lo); + int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, + (unsigned long *)&e->msi.data); + int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, + (unsigned long *)&e->msi.data); + u32 deliver_bitmask; + + BUG_ON(!ioapic); + + deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, + dest_id, dest_mode); + /* IOAPIC delivery mode value is the same as MSI here */ + switch (delivery_mode) { + case IOAPIC_LOWEST_PRIORITY: + vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, + deliver_bitmask); + if (vcpu != NULL) + kvm_apic_set_irq(vcpu, vector, trig_mode); + else + printk(KERN_INFO "kvm: null lowest priority vcpu!\n"); + break; + case IOAPIC_FIXED: + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask & (1 << vcpu_id))) + continue; + deliver_bitmask &= ~(1 << vcpu_id); + vcpu = ioapic->kvm->vcpus[vcpu_id]; + if (vcpu) + kvm_apic_set_irq(vcpu, vector, trig_mode); + } + break; + default: + break; + } +} + /* This should be called with the kvm->lock mutex held */ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) { struct kvm_kernel_irq_routing_entry *e; - unsigned long 
*irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; + unsigned long *irq_state, sig_level; + + if (irq < KVM_IOAPIC_NUM_PINS) { + irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; - /* Logical OR for level trig interrupt */ - if (level) - set_bit(irq_source_id, irq_state); - else - clear_bit(irq_source_id, irq_state); + /* Logical OR for level trig interrupt */ + if (level) + set_bit(irq_source_id, irq_state); + else + clear_bit(irq_source_id, irq_state); + sig_level = !!(*irq_state); + } else /* Deal with MSI/MSI-X */ + sig_level = 1; /* Not possible to detect if the guest uses the PIC or the * IOAPIC. So set the bit in both. The guest will ignore @@ -56,7 +114,7 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) */ list_for_each_entry(e, &kvm->irq_routing, link) if (e->gsi == irq) - e->set(e, kvm, !!(*irq_state)); + e->set(e, kvm, sig_level); } void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) @@ -186,6 +244,12 @@ int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->irqchip.irqchip = ue->u.irqchip.irqchip; e->irqchip.pin = ue->u.irqchip.pin + delta; break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; default: goto out; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c65484b471c6..266bdaf0ce44 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -47,10 +47,6 @@ #include #include -#ifdef CONFIG_X86 -#include -#endif - #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET #include "coalesced_mmio.h" #endif @@ -85,57 +81,6 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, static bool kvm_rebooting; #ifdef KVM_CAP_DEVICE_ASSIGNMENT - -#ifdef CONFIG_X86 -static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) -{ - int vcpu_id; - struct kvm_vcpu *vcpu; - struct kvm_ioapic *ioapic = ioapic_irqchip(dev->kvm); - int dest_id = (dev->guest_msi.address_lo & MSI_ADDR_DEST_ID_MASK) - >> MSI_ADDR_DEST_ID_SHIFT; - int vector = (dev->guest_msi.data & MSI_DATA_VECTOR_MASK) - >> MSI_DATA_VECTOR_SHIFT; - int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, - (unsigned long *)&dev->guest_msi.address_lo); - int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, - (unsigned long *)&dev->guest_msi.data); - int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, - (unsigned long *)&dev->guest_msi.data); - u32 deliver_bitmask; - - BUG_ON(!ioapic); - - deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, - dest_id, dest_mode); - /* IOAPIC delivery mode value is the same as MSI here */ - switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, - deliver_bitmask); - if (vcpu != NULL) - kvm_apic_set_irq(vcpu, vector, trig_mode); - else - printk(KERN_INFO "kvm: null lowest priority vcpu!\n"); - break; - case IOAPIC_FIXED: - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask & (1 << vcpu_id))) - continue; - deliver_bitmask &= ~(1 << vcpu_id); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) - kvm_apic_set_irq(vcpu, vector, trig_mode); - } - break; - default: - printk(KERN_INFO "kvm: unsupported MSI delivery mode\n"); - } -} -#else -static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) {} -#endif - static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) { @@ -162,13 +107,10 @@ static void 
kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) * finer-grained lock, update this */ mutex_lock(&assigned_dev->kvm->lock); - if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - else if (assigned_dev->irq_requested_type & - KVM_ASSIGNED_DEV_GUEST_MSI) { - assigned_device_msi_dispatch(assigned_dev); + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + + if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) { enable_irq(assigned_dev->host_irq); assigned_dev->host_irq_disabled = false; } @@ -331,17 +273,15 @@ static int assigned_device_update_msi(struct kvm *kvm, { int r; + adev->guest_irq = airq->guest_irq; if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) { /* x86 don't care upper address of guest msi message addr */ adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI; adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX; - adev->guest_msi.address_lo = airq->guest_msi.addr_lo; - adev->guest_msi.data = airq->guest_msi.data; adev->ack_notifier.gsi = -1; } else if (msi2intx) { adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX; adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI; - adev->guest_irq = airq->guest_irq; adev->ack_notifier.gsi = airq->guest_irq; } else { /* -- cgit v1.2.3-71-gd317 From c807660407a695f390034e402edfe544a1d2e40c Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 4 Feb 2009 17:52:04 +0100 Subject: KVM: Fix kvmclock on !constant_tsc boxes kvmclock currently falls apart on machines without constant tsc. This patch fixes it. Changes: * keep tsc frequency in a per-cpu variable. * handle kvmclock update using a new request flag, thus checking whenever we need an update each time we enter guest context. * use a cpufreq notifier to track frequency changes and force kvmclock updates. * send ipis to kick cpu out of guest context if needed to make sure the guest doesn't see stale values. 
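Condensed for illustration (the real logic, including the pre-/post-change ordering and the IPI kick, is in the diff below), the notifier and request wiring looks roughly like the sketch that follows; cpu_tsc_khz, tsc_khz_ref, ref_freq and KVM_REQ_KVMCLOCK_UPDATE are the names introduced by this patch, everything prefixed example_ is made up:

  #include <linux/cpufreq.h>

  /* Simplified sketch: refresh the per-cpu TSC rate on a frequency change.
   * The affected vcpus then get KVM_REQ_KVMCLOCK_UPDATE set, which
   * vcpu_enter_guest() turns into a kvm_write_guest_time() call. */
  static int example_cpufreq_notifier(struct notifier_block *nb,
                                      unsigned long val, void *data)
  {
          struct cpufreq_freqs *freq = data;

          if (val != CPUFREQ_POSTCHANGE)
                  return 0;

          per_cpu(cpu_tsc_khz, freq->cpu) =
                  cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
          return 0;
  }

  static struct notifier_block example_nb = {
          .notifier_call = example_cpufreq_notifier,
  };

  static void example_register(void)
  {
          /* only needed when the TSC does not tick at a constant rate */
          if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                  cpufreq_register_notifier(&example_nb,
                                            CPUFREQ_TRANSITION_NOTIFIER);
  }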
Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 103 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/kvm_host.h | 1 + 2 files changed, 95 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f83590b47dd..05d7be89b5eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -617,6 +618,8 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul); } +static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -627,9 +630,9 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) if ((!vcpu->time_page)) return; - if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { - kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = tsc_khz; + if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { + kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); } /* Keep irq disabled to prevent changes to the clock */ @@ -660,6 +663,16 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); } +static int kvm_request_guest_time_update(struct kvm_vcpu *v) +{ + struct kvm_vcpu_arch *vcpu = &v->arch; + + if (!vcpu->time_page) + return 0; + set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + return 1; +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -790,7 +803,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu->arch.time_page = NULL; } - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); break; } default: @@ -1000,6 +1013,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: @@ -1025,9 +1039,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; - case KVM_CAP_CLOCKSOURCE: - r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC); - break; default: r = 0; break; @@ -1098,7 +1109,7 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2642,9 +2653,72 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); +static void bounce_off(void *info) +{ + /* nothing */ +} + +static unsigned int ref_freq; +static unsigned long tsc_khz_ref; + +static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i, send_ipi = 0; + + if (!ref_freq) + ref_freq = freq->old; + + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) + return 0; + if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) + return 0; + per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (vcpu->cpu != freq->cpu) + continue; + if (!kvm_request_guest_time_update(vcpu)) + continue; + if (vcpu->cpu != 
smp_processor_id()) + send_ipi++; + } + } + spin_unlock(&kvm_lock); + + if (freq->old < freq->new && send_ipi) { + /* + * We upscale the frequency. Must make the guest + * doesn't see old kvmclock values while running with + * the new frequency, otherwise we risk the guest sees + * time go backwards. + * + * In case we update the frequency for another cpu + * (which might be in guest context) send an interrupt + * to kick the cpu out of guest context. Next time + * guest context is entered kvmclock will be updated, + * so the guest will not see stale values. + */ + smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + } + return 0; +} + +static struct notifier_block kvmclock_cpufreq_notifier_block = { + .notifier_call = kvmclock_cpufreq_notifier +}; + int kvm_arch_init(void *opaque) { - int r; + int r, cpu; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; if (kvm_x86_ops) { @@ -2675,6 +2749,15 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); + + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + tsc_khz_ref = tsc_khz; + cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + return 0; out: @@ -3010,6 +3093,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + kvm_write_guest_time(vcpu); if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) kvm_mmu_sync_roots(vcpu); if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 339eda3ca6ee..18b4df8264cf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -37,6 +37,7 @@ #define KVM_REQ_PENDING_TIMER 5 #define KVM_REQ_UNHALT 6 #define KVM_REQ_MMU_SYNC 7 +#define KVM_REQ_KVMCLOCK_UPDATE 8 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 -- cgit v1.2.3-71-gd317 From 4925663a079c77d95d8685228ad6675fc5639c8e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 4 Feb 2009 17:28:14 +0200 Subject: KVM: Report IRQ injection status to userspace. IRQ injection status is either -1 (if there was no CPU found that should except the interrupt because IRQ was masked or ioapic was misconfigured or ...) or >= 0 in that case the number indicates to how many CPUs interrupt was injected. If the value is 0 it means that the interrupt was coalesced and probably should be reinjected. 
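For a userspace VMM this convention means KVM_IRQ_LINE_STATUS can be used to detect coalescing and schedule a reinjection. The sketch below shows one way the returned status might be consumed; it assumes headers from a kernel carrying this patch, and reinject_later() is a hypothetical VMM-side helper, not part of the KVM ABI.

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Hypothetical VMM helper: remember to reinject this line later. */
	static void reinject_later(int gsi) { (void)gsi; }

	/* Assert a level-triggered line and act on the injection status. */
	static int assert_irq_line(int vm_fd, int gsi)
	{
		struct kvm_irq_level irq = { .level = 1 };

		irq.irq = gsi;
		if (ioctl(vm_fd, KVM_IRQ_LINE_STATUS, &irq) < 0)
			return -1;

		if (irq.status == 0)            /* coalesced: previous irq still pending */
			reinject_later(gsi);
		/* irq.status < 0: masked/ignored; > 0: number of CPUs it reached */
		return 0;
	}
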
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 12 ++++++++++-- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/i8259.c | 18 +++++++++++++----- arch/x86/kvm/x86.c | 13 +++++++++++-- include/linux/kvm.h | 7 ++++++- include/linux/kvm_host.h | 4 ++-- virt/kvm/ioapic.c | 23 ++++++++++++++++------- virt/kvm/ioapic.h | 2 +- virt/kvm/irq_comm.c | 41 ++++++++++++++++++++++++++++------------- 9 files changed, 88 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 9c77e3939e97..076b00d1dbff 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -182,7 +182,7 @@ int kvm_dev_ioctl_check_extension(long ext) switch (ext) { case KVM_CAP_IRQCHIP: case KVM_CAP_MP_STATE: - + case KVM_CAP_IRQ_INJECT_STATUS: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -927,6 +927,7 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; } break; + case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; @@ -934,10 +935,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; if (irqchip_in_kernel(kvm)) { + __s32 status; mutex_lock(&kvm->lock); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); + if (ioctl == KVM_IRQ_LINE_STATUS) { + irq_event.status = status; + if (copy_to_user(argp, &irq_event, + sizeof irq_event)) + goto out; + } r = 0; } break; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55fd4c5fd388..f0faf58044ff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -616,7 +616,7 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); -void kvm_pic_set_irq(void *opaque, int irq, int level); +int kvm_pic_set_irq(void *opaque, int irq, int level); void kvm_inject_nmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 93160375c841..b4e662e94ddc 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -77,12 +77,13 @@ void kvm_pic_clear_isr_ack(struct kvm *kvm) /* * set irq level. If an edge is detected, then the IRR is set to 1 */ -static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) +static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) { - int mask; + int mask, ret = 1; mask = 1 << irq; if (s->elcr & mask) /* level triggered */ if (level) { + ret = !(s->irr & mask); s->irr |= mask; s->last_irr |= mask; } else { @@ -91,11 +92,15 @@ static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) } else /* edge triggered */ if (level) { - if ((s->last_irr & mask) == 0) + if ((s->last_irr & mask) == 0) { + ret = !(s->irr & mask); s->irr |= mask; + } s->last_irr |= mask; } else s->last_irr &= ~mask; + + return (s->imr & mask) ? 
-1 : ret; } /* @@ -172,16 +177,19 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -void kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; + int ret = -1; pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { - pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); } pic_unlock(s); + + return ret; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05d7be89b5eb..e4db5be7c953 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1019,6 +1019,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: case KVM_CAP_REINJECT_CONTROL: + case KVM_CAP_IRQ_INJECT_STATUS: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1877,6 +1878,7 @@ long kvm_arch_vm_ioctl(struct file *filp, create_pit_unlock: mutex_unlock(&kvm->lock); break; + case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; @@ -1884,10 +1886,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; if (irqchip_in_kernel(kvm)) { + __s32 status; mutex_lock(&kvm->lock); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); + status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); + if (ioctl == KVM_IRQ_LINE_STATUS) { + irq_event.status = status; + if (copy_to_user(argp, &irq_event, + sizeof irq_event)) + goto out; + } r = 0; } break; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2163b3dd36e7..dd48225d1824 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -48,7 +48,10 @@ struct kvm_irq_level { * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. 
*/ - __u32 irq; + union { + __u32 irq; + __s32 status; + }; __u32 level; }; @@ -402,6 +405,7 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_IOAPIC #define KVM_CAP_IRQ_ROUTING 25 #endif +#define KVM_CAP_IRQ_INJECT_STATUS 26 #ifdef KVM_CAP_IRQ_ROUTING @@ -465,6 +469,7 @@ struct kvm_irq_routing { #define KVM_CREATE_PIT _IO(KVMIO, 0x64) #define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state) #define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state) +#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level) #define KVM_REGISTER_COALESCED_MMIO \ _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) #define KVM_UNREGISTER_COALESCED_MMIO \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 18b4df8264cf..894a56e365e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -110,7 +110,7 @@ struct kvm_memory_slot { struct kvm_kernel_irq_routing_entry { u32 gsi; - void (*set)(struct kvm_kernel_irq_routing_entry *e, + int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level); union { struct { @@ -352,7 +352,7 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn); void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); -void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); +int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 1c986ac59ad6..c3b99def9cbc 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -83,19 +83,22 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, return result; } -static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) +static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) { union ioapic_redir_entry *pent; + int injected = -1; pent = &ioapic->redirtbl[idx]; if (!pent->fields.mask) { - int injected = ioapic_deliver(ioapic, idx); + injected = ioapic_deliver(ioapic, idx); if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) pent->fields.remote_irr = 1; } if (!pent->fields.trig_mode) ioapic->irr &= ~(1 << idx); + + return injected; } static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) @@ -207,7 +210,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; u32 deliver_bitmask; struct kvm_vcpu *vcpu; - int vcpu_id, r = 0; + int vcpu_id, r = -1; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", @@ -247,7 +250,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) deliver_bitmask &= ~(1 << vcpu_id); vcpu = ioapic->kvm->vcpus[vcpu_id]; if (vcpu) { - r = ioapic_inj_irq(ioapic, vcpu, vector, + if (r < 0) + r = 0; + r += ioapic_inj_irq(ioapic, vcpu, vector, trig_mode, delivery_mode); } } @@ -258,8 +263,10 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) continue; deliver_bitmask &= ~(1 << vcpu_id); vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) + if (vcpu) { ioapic_inj_nmi(vcpu); + r = 1; + } else ioapic_debug("NMI to vcpu %d failed\n", vcpu->vcpu_id); @@ -273,11 +280,12 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) return r; } -void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) { u32 old_irr = 
ioapic->irr; u32 mask = 1 << irq; union ioapic_redir_entry entry; + int ret = 1; if (irq >= 0 && irq < IOAPIC_NUM_PINS) { entry = ioapic->redirtbl[irq]; @@ -288,9 +296,10 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) ioapic->irr |= mask; if ((!entry.fields.trig_mode && old_irr != ioapic->irr) || !entry.fields.remote_irr) - ioapic_service(ioapic, irq); + ret = ioapic_service(ioapic, irq); } } + return ret; } static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin, diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 49c9581d2586..a34bd5e6436b 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -83,7 +83,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, unsigned long bitmap); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); -void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, u8 dest_mode); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 6bc7439eff6e..be8aba791554 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -29,22 +29,24 @@ #include "ioapic.h" -static void kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) +static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) { #ifdef CONFIG_X86 - kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level); + return kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level); +#else + return -1; #endif } -static void kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) +static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) { - kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); + return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); } -static void kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) +static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) { int vcpu_id; struct kvm_vcpu *vcpu; @@ -88,13 +90,20 @@ static void kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, default: break; } + return 1; } -/* This should be called with the kvm->lock mutex held */ -void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) +/* This should be called with the kvm->lock mutex held + * Return value: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) { struct kvm_kernel_irq_routing_entry *e; unsigned long *irq_state, sig_level; + int ret = -1; if (irq < KVM_IOAPIC_NUM_PINS) { irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; @@ -113,8 +122,14 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) * writes to the unused one. */ list_for_each_entry(e, &kvm->irq_routing, link) - if (e->gsi == irq) - e->set(e, kvm, sig_level); + if (e->gsi == irq) { + int r = e->set(e, kvm, sig_level); + if (r < 0) + continue; + + ret = r + ((ret < 0) ? 
0 : ret); + } + return ret; } void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) @@ -232,7 +247,7 @@ int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->set = kvm_set_pic_irq; break; case KVM_IRQCHIP_PIC_SLAVE: - e->set = kvm_set_pic_irq; + e->set = kvm_set_pic_irq; delta = 8; break; case KVM_IRQCHIP_IOAPIC: -- cgit v1.2.3-71-gd317 From 2df8a40bccf5999261d0d3a82eac5a77678e61bd Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 13 Feb 2009 10:50:56 +0800 Subject: KVM: define KVM_CAP_DEVICE_DEASSIGNMENT define KVM_CAP_DEVICE_DEASSIGNMENT and KVM_DEASSIGN_PCI_DEVICE for device deassignment. the ioctl has been already implemented in the commit: 0a920356748df4fb06e86c21c23d2ed6d31d37ad Acked-by: Mark McLoughlin Signed-off-by: Weidong Han Signed-off-by: Avi Kivity --- include/linux/kvm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index dd48225d1824..0d94b274c3ae 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -406,6 +406,9 @@ struct kvm_trace_rec { #define KVM_CAP_IRQ_ROUTING 25 #endif #define KVM_CAP_IRQ_INJECT_STATUS 26 +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT +#define KVM_CAP_DEVICE_DEASSIGNMENT 27 +#endif #ifdef KVM_CAP_IRQ_ROUTING @@ -480,6 +483,8 @@ struct kvm_irq_routing { #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) +#define KVM_DEASSIGN_PCI_DEVICE _IOR(KVMIO, 0x72, \ + struct kvm_assigned_pci_dev) /* * ioctls for vcpu fds -- cgit v1.2.3-71-gd317 From bc7a8660df62da3fb5cad025322eda75fbee8731 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 17 Mar 2009 19:27:19 +0800 Subject: KVM: Correct deassign device ioctl to IOW It's IOR by mistake, so fix it before release. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 0d94b274c3ae..311a073afe8a 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -483,7 +483,7 @@ struct kvm_irq_routing { #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) -#define KVM_DEASSIGN_PCI_DEVICE _IOR(KVMIO, 0x72, \ +#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ struct kvm_assigned_pci_dev) /* -- cgit v1.2.3-71-gd317 From 58c610bd1a3f50820e45a7c09ec0e44d2cda15dd Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Mar 2009 15:33:05 +0800 Subject: intel-iommu: Snooping control support Snooping control enabled IOMMU to guarantee DMA cache coherency and thus reduce software effort (VMM) in maintaining effective memory type. 
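Together with the domain_has_cap operation and the IOMMU_CACHE protection bit added in the two patches that follow, the intended consumer-side usage looks roughly like the sketch below. iommu_map_range() is assumed to be the generic mapping helper of this API (it is not shown in this excerpt), so treat this as an illustration of the pattern rather than code taken from these patches.

	#include <linux/iommu.h>

	/* Map a range, asking for snooped (coherent) PTEs only when supported. */
	static int map_coherent(struct iommu_domain *domain, unsigned long iova,
				phys_addr_t paddr, size_t size)
	{
		int prot = IOMMU_READ | IOMMU_WRITE;

		if (iommu_domain_has_cap(domain, IOMMU_CAP_CACHE_COHERENCY))
			prot |= IOMMU_CACHE;

		return iommu_map_range(domain, iova, paddr, size, prot);
	}
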
Signed-off-by: Sheng Yang Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 38 +++++++++++++++++++++++++++++++++----- include/linux/intel-iommu.h | 2 +- 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f3f686581a90..be999ff025af 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -231,6 +231,7 @@ struct dmar_domain { int flags; /* flags to find out type of domain */ int iommu_coherency;/* indicate coherency of iommu access */ + int iommu_snooping; /* indicate snooping control feature*/ int iommu_count; /* reference count of iommu */ spinlock_t iommu_lock; /* protect iommu set in domain */ u64 max_addr; /* maximum mapped address */ @@ -421,7 +422,6 @@ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) return g_iommus[iommu_id]; } -/* "Coherency" capability may be different across iommus */ static void domain_update_iommu_coherency(struct dmar_domain *domain) { int i; @@ -438,6 +438,29 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) } } +static void domain_update_iommu_snooping(struct dmar_domain *domain) +{ + int i; + + domain->iommu_snooping = 1; + + i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus); + for (; i < g_num_of_iommus; ) { + if (!ecap_sc_support(g_iommus[i]->ecap)) { + domain->iommu_snooping = 0; + break; + } + i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1); + } +} + +/* Some capabilities may be different across iommus */ +static void domain_update_iommu_cap(struct dmar_domain *domain) +{ + domain_update_iommu_coherency(domain); + domain_update_iommu_snooping(domain); +} + static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn) { struct dmar_drhd_unit *drhd = NULL; @@ -1429,6 +1452,11 @@ static int domain_init(struct dmar_domain *domain, int guest_width) else domain->iommu_coherency = 0; + if (ecap_sc_support(iommu->ecap)) + domain->iommu_snooping = 1; + else + domain->iommu_snooping = 0; + domain->iommu_count = 1; /* always allocate the top pgd */ @@ -1557,7 +1585,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, spin_lock_irqsave(&domain->iommu_lock, flags); if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) { domain->iommu_count++; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); } spin_unlock_irqrestore(&domain->iommu_lock, flags); return 0; @@ -2820,7 +2848,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, spin_lock_irqsave(&domain->iommu_lock, tmp_flags); clear_bit(iommu->seq_id, &domain->iommu_bmp); domain->iommu_count--; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags); } @@ -2848,13 +2876,13 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) iommu_detach_dev(iommu, info->bus, info->devfn); /* clear this iommu in iommu_bmp, update iommu count - * and coherency + * and capabilities */ spin_lock_irqsave(&domain->iommu_lock, flags2); if (test_and_clear_bit(iommu->seq_id, &domain->iommu_bmp)) { domain->iommu_count--; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); } spin_unlock_irqrestore(&domain->iommu_lock, flags2); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d2e3cbfba14f..3ad894004938 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -123,7 +123,7 @@ static inline void dmar_writeq(void __iomem *addr, 
u64 val) #define ecap_eim_support(e) ((e >> 4) & 0x1) #define ecap_ir_support(e) ((e >> 3) & 0x1) #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) - +#define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 -- cgit v1.2.3-71-gd317 From dbb9fd8630e95b6155aff658a2b5f80e95ca2bc6 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Mar 2009 15:33:06 +0800 Subject: iommu: Add domain_has_cap iommu_ops This iommu_op can tell if domain have a specific capability, like snooping control for Intel IOMMU, which can be used by other components of kernel to adjust the behaviour. Signed-off-by: Sheng Yang Signed-off-by: David Woodhouse --- arch/x86/kernel/amd_iommu.c | 7 +++++++ drivers/base/iommu.c | 7 +++++++ drivers/pci/intel-iommu.c | 12 ++++++++++++ include/linux/iommu.h | 12 ++++++++++++ 4 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 5113c080f0c4..65c9b58655ff 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1924,6 +1924,12 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, return paddr; } +static int amd_iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return 0; +} + static struct iommu_ops amd_iommu_ops = { .domain_init = amd_iommu_domain_init, .domain_destroy = amd_iommu_domain_destroy, @@ -1932,5 +1938,6 @@ static struct iommu_ops amd_iommu_ops = { .map = amd_iommu_map_range, .unmap = amd_iommu_unmap_range, .iova_to_phys = amd_iommu_iova_to_phys, + .domain_has_cap = amd_iommu_domain_has_cap, }; diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c index 5e039d4f877c..c314f144825f 100644 --- a/drivers/base/iommu.c +++ b/drivers/base/iommu.c @@ -98,3 +98,10 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, return iommu_ops->iova_to_phys(domain, iova); } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); + +int iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return iommu_ops->domain_has_cap(domain, cap); +} +EXPORT_SYMBOL_GPL(iommu_domain_has_cap); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index be999ff025af..3778ab149baf 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3158,6 +3158,17 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, return phys; } +static int intel_iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + struct dmar_domain *dmar_domain = domain->priv; + + if (cap == IOMMU_CAP_CACHE_COHERENCY) + return dmar_domain->iommu_snooping; + + return 0; +} + static struct iommu_ops intel_iommu_ops = { .domain_init = intel_iommu_domain_init, .domain_destroy = intel_iommu_domain_destroy, @@ -3166,6 +3177,7 @@ static struct iommu_ops intel_iommu_ops = { .map = intel_iommu_map_range, .unmap = intel_iommu_unmap_range, .iova_to_phys = intel_iommu_iova_to_phys, + .domain_has_cap = intel_iommu_domain_has_cap, }; static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8a7bfb1b6ca0..0cf3a4e43f23 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -28,6 +28,8 @@ struct iommu_domain { void *priv; }; +#define IOMMU_CAP_CACHE_COHERENCY 0x1 + struct iommu_ops { int (*domain_init)(struct iommu_domain *domain); void (*domain_destroy)(struct iommu_domain *domain); @@ -39,6 +41,8 @@ struct iommu_ops { size_t size); phys_addr_t (*iova_to_phys)(struct iommu_domain 
*domain, unsigned long iova); + int (*domain_has_cap)(struct iommu_domain *domain, + unsigned long cap); }; #ifdef CONFIG_IOMMU_API @@ -57,6 +61,8 @@ extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova, size_t size); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, unsigned long iova); +extern int iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap); #else /* CONFIG_IOMMU_API */ @@ -107,6 +113,12 @@ static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, return 0; } +static inline int domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return 0; +} + #endif /* CONFIG_IOMMU_API */ #endif /* __LINUX_IOMMU_H */ -- cgit v1.2.3-71-gd317 From 9cf0669746be19a4906a6c48920060bcf54c708b Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Mar 2009 15:33:07 +0800 Subject: intel-iommu: VT-d page table to support snooping control bit The user can request to enable snooping control through VT-d page table. Signed-off-by: Sheng Yang Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 12 +++++++++++- include/linux/dma_remapping.h | 1 + include/linux/iommu.h | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 3778ab149baf..a0ba568b831c 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -164,7 +164,8 @@ static inline void context_clear_entry(struct context_entry *context) * 1: writable * 2-6: reserved * 7: super page - * 8-11: available + * 8-10: available + * 11: snoop behavior * 12-63: Host physcial address */ struct dma_pte { @@ -186,6 +187,11 @@ static inline void dma_set_pte_writable(struct dma_pte *pte) pte->val |= DMA_PTE_WRITE; } +static inline void dma_set_pte_snp(struct dma_pte *pte) +{ + pte->val |= DMA_PTE_SNP; +} + static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot) { pte->val = (pte->val & ~3) | (prot & 3); @@ -1685,6 +1691,8 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, BUG_ON(dma_pte_addr(pte)); dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT); dma_set_pte_prot(pte, prot); + if (prot & DMA_PTE_SNP) + dma_set_pte_snp(pte); domain_flush_cache(domain, pte, sizeof(*pte)); start_pfn++; index++; @@ -3105,6 +3113,8 @@ static int intel_iommu_map_range(struct iommu_domain *domain, prot |= DMA_PTE_READ; if (iommu_prot & IOMMU_WRITE) prot |= DMA_PTE_WRITE; + if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) + prot |= DMA_PTE_SNP; max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size); if (dmar_domain->max_addr < max_addr) { diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index af1dab41674b..1a455f1f86d7 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -11,6 +11,7 @@ #define DMA_PTE_READ (1) #define DMA_PTE_WRITE (2) +#define DMA_PTE_SNP (1 << 11) struct intel_iommu; struct dmar_domain; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0cf3a4e43f23..3af4ffd591b9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -21,6 +21,7 @@ #define IOMMU_READ (1) #define IOMMU_WRITE (2) +#define IOMMU_CACHE (4) /* DMA cache coherency */ struct device; -- cgit v1.2.3-71-gd317 From f057f6cdf64175db1151b1f5d110e29904f119a1 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 12 Jan 2009 10:43:39 +0000 Subject: GFS2: Merge lock_dlm module into GFS2 This is the big patch that I've been working on for some time now. 
There are many reasons for wanting to make this change such as: o Reducing overhead by eliminating duplicated fields between structures o Simplifcation of the code (reduces the code size by a fair bit) o The locking interface is now the DLM interface itself as proposed some time ago. o Fewer lookups of glocks when processing replies from the DLM o Fewer memory allocations/deallocations for each glock o Scope to do further optimisations in the future (but this patch is more than big enough for now!) Please note that (a) this patch relates to the lock_dlm module and not the DLM itself, that is still a separate module; and (b) that we retain the ability to build GFS2 as a standalone single node filesystem with out requiring the DLM. This patch needs a lot of testing, hence my keeping it I restarted my -git tree after the last merge window. That way, this has the maximum exposure before its merged. This is (modulo a few minor bug fixes) the same patch that I've been posting on and off the the last three months and its passed a number of different tests so far. Signed-off-by: Steven Whitehouse --- fs/gfs2/Kconfig | 17 +- fs/gfs2/Makefile | 4 +- fs/gfs2/acl.c | 1 - fs/gfs2/bmap.c | 1 - fs/gfs2/dir.c | 1 - fs/gfs2/eaops.c | 1 - fs/gfs2/eattr.c | 1 - fs/gfs2/glock.c | 249 +++++---------- fs/gfs2/glock.h | 127 +++++++- fs/gfs2/glops.c | 14 - fs/gfs2/incore.h | 59 +++- fs/gfs2/inode.c | 13 +- fs/gfs2/inode.h | 22 +- fs/gfs2/lock_dlm.c | 240 ++++++++++++++ fs/gfs2/locking.c | 314 ------------------ fs/gfs2/locking/dlm/Makefile | 3 - fs/gfs2/locking/dlm/lock.c | 708 ----------------------------------------- fs/gfs2/locking/dlm/lock_dlm.h | 166 ---------- fs/gfs2/locking/dlm/main.c | 48 --- fs/gfs2/locking/dlm/mount.c | 276 ---------------- fs/gfs2/locking/dlm/sysfs.c | 226 ------------- fs/gfs2/locking/dlm/thread.c | 68 ---- fs/gfs2/log.c | 1 - fs/gfs2/lops.c | 1 - fs/gfs2/main.c | 3 - fs/gfs2/meta_io.c | 1 - fs/gfs2/mount.c | 1 - fs/gfs2/ops_address.c | 1 - fs/gfs2/ops_dentry.c | 1 - fs/gfs2/ops_export.c | 1 - fs/gfs2/ops_file.c | 74 ++--- fs/gfs2/ops_fstype.c | 134 +++++--- fs/gfs2/ops_inode.c | 1 - fs/gfs2/ops_super.c | 1 - fs/gfs2/quota.c | 12 +- fs/gfs2/recovery.c | 28 +- fs/gfs2/rgrp.c | 1 - fs/gfs2/super.c | 1 - fs/gfs2/sys.c | 154 ++++++++- fs/gfs2/trans.c | 3 +- fs/gfs2/util.c | 11 +- include/linux/lm_interface.h | 277 ---------------- 42 files changed, 819 insertions(+), 2447 deletions(-) create mode 100644 fs/gfs2/lock_dlm.c delete mode 100644 fs/gfs2/locking.c delete mode 100644 fs/gfs2/locking/dlm/Makefile delete mode 100644 fs/gfs2/locking/dlm/lock.c delete mode 100644 fs/gfs2/locking/dlm/lock_dlm.h delete mode 100644 fs/gfs2/locking/dlm/main.c delete mode 100644 fs/gfs2/locking/dlm/mount.c delete mode 100644 fs/gfs2/locking/dlm/sysfs.c delete mode 100644 fs/gfs2/locking/dlm/thread.c delete mode 100644 include/linux/lm_interface.h (limited to 'include/linux') diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index e563a6449811..3a981b7f64ca 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,10 @@ config GFS2_FS tristate "GFS2 file system support" depends on EXPERIMENTAL && (64BIT || LBD) + select DLM if GFS2_FS_LOCKING_DLM + select CONFIGFS_FS if GFS2_FS_LOCKING_DLM + select SYSFS if GFS2_FS_LOCKING_DLM + select IP_SCTP if DLM_SCTP select FS_POSIX_ACL select CRC32 help @@ -18,17 +22,16 @@ config GFS2_FS the locking module below. Documentation and utilities for GFS2 can be found here: http://sources.redhat.com/cluster - The "nolock" lock module is now built in to GFS2 by default. 
+ The "nolock" lock module is now built in to GFS2 by default. If + you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 + networking. config GFS2_FS_LOCKING_DLM - tristate "GFS2 DLM locking module" - depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n) - select IP_SCTP if DLM_SCTP - select CONFIGFS_FS - select DLM + bool "GFS2 DLM locking" + depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG help Multiple node locking module for GFS2 - Most users of GFS2 will require this module. It provides the locking + Most users of GFS2 will require this. It provides the locking interface between GFS2 and the DLM, which is required to use GFS2 in a cluster environment. diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index c1b4ec6a9650..a851ea4bdf70 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,9 +1,9 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ - glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ + glops.o inode.o log.o lops.o main.o meta_io.o \ mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o -obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ +gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index e335dceb6a4f..43764f4fa763 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 11ffc56f1f81..3a5d3f883e10 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index b7c8e5c70791..aef4d0c06748 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -60,7 +60,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c index f114ba2b3557..dee9b03e5b37 100644 --- a/fs/gfs2/eaops.c +++ b/fs/gfs2/eaops.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index 0d1c76d906ae..899763aed217 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6b983aef785d..cd200a564c79 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -18,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -155,13 +153,10 @@ static void glock_free(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct inode *aspace = gl->gl_aspace; - if (sdp->sd_lockstruct.ls_ops->lm_put_lock) - sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); - if (aspace) gfs2_aspace_put(aspace); - kmem_cache_free(gfs2_glock_cachep, gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); } /** @@ -211,7 +206,6 @@ int gfs2_glock_put(struct gfs2_glock *gl) atomic_dec(&lru_count); } spin_unlock(&lru_lock); - GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru)); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); glock_free(gl); @@ -255,27 +249,6 @@ static struct gfs2_glock *search_bucket(unsigned int hash, return NULL; } -/** - * gfs2_glock_find() - Find glock by lock number - * @sdp: The GFS2 superblock - * @name: The lock name - 
* - * Returns: NULL, or the struct gfs2_glock with the requested number - */ - -static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp, - const struct lm_lockname *name) -{ - unsigned int hash = gl_hash(sdp, name); - struct gfs2_glock *gl; - - read_lock(gl_lock_addr(hash)); - gl = search_bucket(hash, sdp, name); - read_unlock(gl_lock_addr(hash)); - - return gl; -} - /** * may_grant - check if its ok to grant a new lock * @gl: The glock @@ -523,7 +496,7 @@ out_locked: } static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state, unsigned int req_state, + unsigned int req_state, unsigned int flags) { int ret = LM_OUT_ERROR; @@ -532,7 +505,7 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, return req_state == LM_ST_UNLOCKED ? 0 : req_state; if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, + ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, req_state, flags); return ret; } @@ -575,7 +548,7 @@ __acquires(&gl->gl_spin) gl->gl_state == LM_ST_DEFERRED) && !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) lck_flags |= LM_FLAG_TRY_1CB; - ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags); + ret = gfs2_lm_lock(sdp, gl, target, lck_flags); if (!(ret & LM_OUT_ASYNC)) { finish_xmote(gl, ret); @@ -681,18 +654,6 @@ static void glock_work_func(struct work_struct *work) gfs2_glock_put(gl); } -static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, - void **lockp) -{ - int error = -EIO; - if (!sdp->sd_lockstruct.ls_ops->lm_get_lock) - return 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_get_lock( - sdp->sd_lockstruct.ls_lockspace, name, lockp); - return error; -} - /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock @@ -736,6 +697,9 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_hash = hash; gl->gl_ops = glops; + snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number); + memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + gl->gl_lksb.sb_lvbptr = gl->gl_lvb; gl->gl_stamp = jiffies; gl->gl_tchange = jiffies; gl->gl_object = NULL; @@ -753,10 +717,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, } } - error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock); - if (error) - goto fail_aspace; - write_lock(gl_lock_addr(hash)); tmp = search_bucket(hash, sdp, &name); if (tmp) { @@ -772,9 +732,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, return 0; -fail_aspace: - if (gl->gl_aspace) - gfs2_aspace_put(gl->gl_aspace); fail: kmem_cache_free(gfs2_glock_cachep, gl); return error; @@ -966,7 +923,7 @@ do_cancel: if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { spin_unlock(&gl->gl_spin); if (sdp->sd_lockstruct.ls_ops->lm_cancel) - sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); + sdp->sd_lockstruct.ls_ops->lm_cancel(gl); spin_lock(&gl->gl_spin); } return; @@ -1240,70 +1197,13 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) gfs2_glock_dq_uninit(&ghs[x]); } -static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { - int error = -EIO; - if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb) - return 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); - return error; -} - -/** - * 
gfs2_lvb_hold - attach a LVB from a glock - * @gl: The glock in question - * - */ - -int gfs2_lvb_hold(struct gfs2_glock *gl) -{ - int error; - - if (!atomic_read(&gl->gl_lvb_count)) { - error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); - if (error) - return error; - gfs2_glock_hold(gl); - } - atomic_inc(&gl->gl_lvb_count); - - return 0; -} - -/** - * gfs2_lvb_unhold - detach a LVB from a glock - * @gl: The glock in question - * - */ - -void gfs2_lvb_unhold(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - - gfs2_glock_hold(gl); - gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); - if (atomic_dec_and_test(&gl->gl_lvb_count)) { - if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb) - sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); - gl->gl_lvb = NULL; - gfs2_glock_put(gl); - } - gfs2_glock_put(gl); -} - -static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, - unsigned int state) -{ - struct gfs2_glock *gl; unsigned long delay = 0; unsigned long holdtime; unsigned long now = jiffies; - gl = gfs2_glock_find(sdp, name); - if (!gl) - return; - + gfs2_glock_hold(gl); holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; @@ -1317,74 +1217,37 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, gfs2_glock_put(gl); } -static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) -{ - struct gfs2_jdesc *jd; - - spin_lock(&sdp->sd_jindex_spin); - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (jd->jd_jid != jid) - continue; - jd->jd_dirty = 1; - break; - } - spin_unlock(&sdp->sd_jindex_spin); -} - /** - * gfs2_glock_cb - Callback used by locking module - * @sdp: Pointer to the superblock - * @type: Type of callback - * @data: Type dependent data pointer + * gfs2_glock_complete - Callback used by locking + * @gl: Pointer to the glock + * @ret: The return value from the dlm * - * Called by the locking module when it wants to tell us something. - * Either we need to drop a lock, one of our ASYNC requests completed, or - * a journal from another client needs to be recovered. 
*/ -void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) +void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { - struct gfs2_sbd *sdp = cb_data; - - switch (type) { - case LM_CB_NEED_E: - blocking_cb(sdp, data, LM_ST_UNLOCKED); - return; - - case LM_CB_NEED_D: - blocking_cb(sdp, data, LM_ST_DEFERRED); - return; - - case LM_CB_NEED_S: - blocking_cb(sdp, data, LM_ST_SHARED); - return; - - case LM_CB_ASYNC: { - struct lm_async_cb *async = data; - struct gfs2_glock *gl; - - down_read(&gfs2_umount_flush_sem); - gl = gfs2_glock_find(sdp, &async->lc_name); - if (gfs2_assert_warn(sdp, gl)) + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + down_read(&gfs2_umount_flush_sem); + gl->gl_reply = ret; + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { + struct gfs2_holder *gh; + spin_lock(&gl->gl_spin); + gh = find_first_waiter(gl); + if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) && + (gl->gl_target != LM_ST_UNLOCKED)) || + ((ret & ~LM_OUT_ST_MASK) != 0)) + set_bit(GLF_FROZEN, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + if (test_bit(GLF_FROZEN, &gl->gl_flags)) { + up_read(&gfs2_umount_flush_sem); return; - gl->gl_reply = async->lc_ret; - set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); - up_read(&gfs2_umount_flush_sem); - return; - } - - case LM_CB_NEED_RECOVERY: - gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data); - if (sdp->sd_recoverd_process) - wake_up_process(sdp->sd_recoverd_process); - return; - - default: - gfs2_assert_warn(sdp, 0); - return; + } } + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + up_read(&gfs2_umount_flush_sem); } /** @@ -1515,6 +1378,27 @@ out: return has_entries; } + +/** + * thaw_glock - thaw out a glock which has an unprocessed reply waiting + * @gl: The glock to thaw + * + * N.B. When we freeze a glock, we leave a ref to the glock outstanding, + * so this has to result in the ref count being dropped by one. 
+ */ + +static void thaw_glock(struct gfs2_glock *gl) +{ + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + return; + down_read(&gfs2_umount_flush_sem); + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + up_read(&gfs2_umount_flush_sem); +} + /** * clear_glock - look at a glock and see if we can free it from glock cache * @gl: the glock to look at @@ -1539,6 +1423,20 @@ static void clear_glock(struct gfs2_glock *gl) gfs2_glock_put(gl); } +/** + * gfs2_glock_thaw - Thaw any frozen glocks + * @sdp: The super block + * + */ + +void gfs2_glock_thaw(struct gfs2_sbd *sdp) +{ + unsigned x; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(thaw_glock, sdp, x); +} + /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem @@ -1619,7 +1517,7 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) if (flags & LM_FLAG_NOEXP) *p++ = 'e'; if (flags & LM_FLAG_ANY) - *p++ = 'a'; + *p++ = 'A'; if (flags & LM_FLAG_PRIORITY) *p++ = 'p'; if (flags & GL_ASYNC) @@ -1683,6 +1581,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) *p++ = 'i'; if (test_bit(GLF_REPLY_PENDING, gflags)) *p++ = 'r'; + if (test_bit(GLF_INITIAL, gflags)) + *p++ = 'i'; + if (test_bit(GLF_FROZEN, gflags)) + *p++ = 'F'; *p = 0; return buf; } @@ -1717,14 +1619,13 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) dtime *= 1000000/HZ; /* demote time in uSec */ if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; - gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n", + gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n", state2str(gl->gl_state), gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, gflags2str(gflags_buf, &gl->gl_flags), state2str(gl->gl_target), state2str(gl->gl_demote_state), dtime, - atomic_read(&gl->gl_lvb_count), atomic_read(&gl->gl_ail_count), atomic_read(&gl->gl_ref)); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 543ec7ecfbda..a602a28f6f08 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -11,15 +11,130 @@ #define __GLOCK_DOT_H__ #include +#include #include "incore.h" -/* Flags for lock requests; used in gfs2_holder gh_flag field. - From lm_interface.h: +/* Options for hostdata parser */ + +enum { + Opt_jid, + Opt_id, + Opt_first, + Opt_nodir, + Opt_err, +}; + +/* + * lm_lockname types + */ + +#define LM_TYPE_RESERVED 0x00 +#define LM_TYPE_NONDISK 0x01 +#define LM_TYPE_INODE 0x02 +#define LM_TYPE_RGRP 0x03 +#define LM_TYPE_META 0x04 +#define LM_TYPE_IOPEN 0x05 +#define LM_TYPE_FLOCK 0x06 +#define LM_TYPE_PLOCK 0x07 +#define LM_TYPE_QUOTA 0x08 +#define LM_TYPE_JOURNAL 0x09 + +/* + * lm_lock() states + * + * SHARED is compatible with SHARED, not with DEFERRED or EX. + * DEFERRED is compatible with DEFERRED, not with SHARED or EX. + */ + +#define LM_ST_UNLOCKED 0 +#define LM_ST_EXCLUSIVE 1 +#define LM_ST_DEFERRED 2 +#define LM_ST_SHARED 3 + +/* + * lm_lock() flags + * + * LM_FLAG_TRY + * Don't wait to acquire the lock if it can't be granted immediately. + * + * LM_FLAG_TRY_1CB + * Send one blocking callback if TRY is set and the lock is not granted. + * + * LM_FLAG_NOEXP + * GFS sets this flag on lock requests it makes while doing journal recovery. + * These special requests should not be blocked due to the recovery like + * ordinary locks would be. 
+ * + * LM_FLAG_ANY + * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may + * also be granted in SHARED. The preferred state is whichever is compatible + * with other granted locks, or the specified state if no other locks exist. + * + * LM_FLAG_PRIORITY + * Override fairness considerations. Suppose a lock is held in a shared state + * and there is a pending request for the deferred state. A shared lock + * request with the priority flag would be allowed to bypass the deferred + * request and directly join the other shared lock. A shared lock request + * without the priority flag might be forced to wait until the deferred + * requested had acquired and released the lock. + */ + #define LM_FLAG_TRY 0x00000001 #define LM_FLAG_TRY_1CB 0x00000002 #define LM_FLAG_NOEXP 0x00000004 #define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 */ +#define LM_FLAG_PRIORITY 0x00000010 +#define GL_ASYNC 0x00000040 +#define GL_EXACT 0x00000080 +#define GL_SKIP 0x00000100 +#define GL_ATIME 0x00000200 +#define GL_NOCACHE 0x00000400 + +/* + * lm_lock() and lm_async_cb return flags + * + * LM_OUT_ST_MASK + * Masks the lower two bits of lock state in the returned value. + * + * LM_OUT_CANCELED + * The lock request was canceled. + * + * LM_OUT_ASYNC + * The result of the request will be returned in an LM_CB_ASYNC callback. + * + */ + +#define LM_OUT_ST_MASK 0x00000003 +#define LM_OUT_CANCELED 0x00000008 +#define LM_OUT_ASYNC 0x00000080 +#define LM_OUT_ERROR 0x00000100 + +/* + * lm_recovery_done() messages + */ + +#define LM_RD_GAVEUP 308 +#define LM_RD_SUCCESS 309 + +#define GLR_TRYFAILED 13 + +struct lm_lockops { + const char *lm_proto_name; + int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); + void (*lm_unmount) (struct gfs2_sbd *sdp); + void (*lm_withdraw) (struct gfs2_sbd *sdp); + void (*lm_put_lock) (struct kmem_cache *cachep, void *gl); + unsigned int (*lm_lock) (struct gfs2_glock *gl, + unsigned int req_state, unsigned int flags); + void (*lm_cancel) (struct gfs2_glock *gl); + const match_table_t *lm_tokens; +}; + +#define LM_FLAG_TRY 0x00000001 +#define LM_FLAG_TRY_1CB 0x00000002 +#define LM_FLAG_NOEXP 0x00000004 +#define LM_FLAG_ANY 0x00000008 +#define LM_FLAG_PRIORITY 0x00000010 #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 @@ -128,10 +243,12 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, int gfs2_lvb_hold(struct gfs2_glock *gl); void gfs2_lvb_unhold(struct gfs2_glock *gl); -void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +void gfs2_glock_complete(struct gfs2_glock *gl, int ret); void gfs2_reclaim_glock(struct gfs2_sbd *sdp); void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); void gfs2_glock_finish_truncate(struct gfs2_inode *ip); +void gfs2_glock_thaw(struct gfs2_sbd *sdp); int __init gfs2_glock_init(void); void gfs2_glock_exit(void); @@ -141,4 +258,6 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); int gfs2_register_debugfs(void); void gfs2_unregister_debugfs(void); +extern const struct lm_lockops gfs2_dlm_ops; + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 8522d3aa64fc..f07ede8cb9ba 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include "gfs2.h" @@ -390,18 +389,6 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) return 0; } -/** - * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock - * @gl: the glock - * - * 
Returns: 1 if it's ok - */ - -static int quota_go_demote_ok(const struct gfs2_glock *gl) -{ - return !atomic_read(&gl->gl_lvb_count); -} - const struct gfs2_glock_operations gfs2_meta_glops = { .go_xmote_th = meta_go_sync, .go_type = LM_TYPE_META, @@ -448,7 +435,6 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = { }; const struct gfs2_glock_operations gfs2_quota_glops = { - .go_demote_ok = quota_go_demote_ok, .go_type = LM_TYPE_QUOTA, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a0117d6eb145..0af7c24de6a1 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -12,6 +12,8 @@ #include #include +#include +#include #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -26,6 +28,7 @@ struct gfs2_trans; struct gfs2_ail; struct gfs2_jdesc; struct gfs2_sbd; +struct lm_lockops; typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); @@ -121,6 +124,28 @@ struct gfs2_bufdata { struct list_head bd_ail_gl_list; }; +/* + * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a + * prefix of lock_dlm_ gets awkward. + */ + +#define GDLM_STRNAME_BYTES 25 +#define GDLM_LVB_SIZE 32 + +enum { + DFL_BLOCK_LOCKS = 0, +}; + +struct lm_lockname { + u64 ln_number; + unsigned int ln_type; +}; + +#define lm_name_equal(name1, name2) \ + (((name1)->ln_number == (name2)->ln_number) && \ + ((name1)->ln_type == (name2)->ln_type)) + + struct gfs2_glock_operations { void (*go_xmote_th) (struct gfs2_glock *gl); int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); @@ -162,6 +187,8 @@ enum { GLF_LFLUSH = 7, GLF_INVALIDATE_IN_PROGRESS = 8, GLF_REPLY_PENDING = 9, + GLF_INITIAL = 10, + GLF_FROZEN = 11, }; struct gfs2_glock { @@ -181,10 +208,9 @@ struct gfs2_glock { struct list_head gl_holders; const struct gfs2_glock_operations *gl_ops; - void *gl_lock; - char *gl_lvb; - atomic_t gl_lvb_count; - + char gl_strname[GDLM_STRNAME_BYTES]; + struct dlm_lksb gl_lksb; + char gl_lvb[32]; unsigned long gl_stamp; unsigned long gl_tchange; void *gl_object; @@ -447,6 +473,30 @@ struct gfs2_sb_host { char sb_locktable[GFS2_LOCKNAME_LEN]; }; +/* + * lm_mount() return values + * + * ls_jid - the journal ID this node should use + * ls_first - this node is the first to mount the file system + * ls_lockspace - lock module's context for this file system + * ls_ops - lock module's functions + */ + +struct lm_lockstruct { + u32 ls_id; + unsigned int ls_jid; + unsigned int ls_first; + unsigned int ls_first_done; + unsigned int ls_nodir; + const struct lm_lockops *ls_ops; + unsigned long ls_flags; + dlm_lockspace_t *ls_dlm; + + int ls_recover_jid; + int ls_recover_jid_done; + int ls_recover_jid_status; +}; + struct gfs2_sbd { struct super_block *sd_vfs; struct kobject sd_kobj; @@ -520,7 +570,6 @@ struct gfs2_sbd { spinlock_t sd_jindex_spin; struct mutex sd_jindex_mutex; unsigned int sd_journals; - unsigned long sd_jindex_refresh_time; struct gfs2_jdesc *sd_jdesc; struct gfs2_holder sd_journal_gh; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3b87c188da41..7b277d449155 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -137,16 +136,16 @@ void gfs2_set_iop(struct inode *inode) if (S_ISREG(mode)) { inode->i_op = &gfs2_file_iops; - if (sdp->sd_args.ar_localflocks) - inode->i_fop = &gfs2_file_fops_nolock; + if (gfs2_localflocks(sdp)) + inode->i_fop = gfs2_file_fops_nolock; else - inode->i_fop = &gfs2_file_fops; + inode->i_fop = gfs2_file_fops; } else if (S_ISDIR(mode)) { inode->i_op = 
&gfs2_dir_iops; - if (sdp->sd_args.ar_localflocks) - inode->i_fop = &gfs2_dir_fops_nolock; + if (gfs2_localflocks(sdp)) + inode->i_fop = gfs2_dir_fops_nolock; else - inode->i_fop = &gfs2_dir_fops; + inode->i_fop = gfs2_dir_fops; } else if (S_ISLNK(mode)) { inode->i_op = &gfs2_symlink_iops; } else { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index d5329364cdff..dca4fee3078b 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -101,12 +101,26 @@ void gfs2_dinode_print(const struct gfs2_inode *ip); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; extern const struct inode_operations gfs2_symlink_iops; -extern const struct file_operations gfs2_file_fops; -extern const struct file_operations gfs2_dir_fops; -extern const struct file_operations gfs2_file_fops_nolock; -extern const struct file_operations gfs2_dir_fops_nolock; +extern const struct file_operations *gfs2_file_fops_nolock; +extern const struct file_operations *gfs2_dir_fops_nolock; extern void gfs2_set_inode_flags(struct inode *inode); + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM +extern const struct file_operations *gfs2_file_fops; +extern const struct file_operations *gfs2_dir_fops; +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return sdp->sd_args.ar_localflocks; +} +#else /* Single node only */ +#define gfs2_file_fops NULL +#define gfs2_dir_fops NULL +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return 1; +} +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ #endif /* __INODE_DOT_H__ */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c new file mode 100644 index 000000000000..a0bb7d2251a0 --- /dev/null +++ b/fs/gfs2/lock_dlm.c @@ -0,0 +1,240 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include + +#include "incore.h" +#include "glock.h" +#include "util.h" + + +static void gdlm_ast(void *arg) +{ + struct gfs2_glock *gl = arg; + unsigned ret = gl->gl_state; + + BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); + + if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) + memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); + + switch (gl->gl_lksb.sb_status) { + case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ + kmem_cache_free(gfs2_glock_cachep, gl); + return; + case -DLM_ECANCEL: /* Cancel while getting lock */ + ret |= LM_OUT_CANCELED; + goto out; + case -EAGAIN: /* Try lock fails */ + goto out; + case -EINVAL: /* Invalid */ + case -ENOMEM: /* Out of memory */ + ret |= LM_OUT_ERROR; + goto out; + case 0: /* Success */ + break; + default: /* Something unexpected */ + BUG(); + } + + ret = gl->gl_target; + if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { + if (gl->gl_target == LM_ST_SHARED) + ret = LM_ST_DEFERRED; + else if (gl->gl_target == LM_ST_DEFERRED) + ret = LM_ST_SHARED; + else + BUG(); + } + + set_bit(GLF_INITIAL, &gl->gl_flags); + gfs2_glock_complete(gl, ret); + return; +out: + if (!test_bit(GLF_INITIAL, &gl->gl_flags)) + gl->gl_lksb.sb_lkid = 0; + gfs2_glock_complete(gl, ret); +} + +static void gdlm_bast(void *arg, int mode) +{ + struct gfs2_glock *gl = arg; + + switch (mode) { + case DLM_LOCK_EX: + gfs2_glock_cb(gl, LM_ST_UNLOCKED); + break; + case DLM_LOCK_CW: + gfs2_glock_cb(gl, LM_ST_DEFERRED); + break; + case DLM_LOCK_PR: + gfs2_glock_cb(gl, LM_ST_SHARED); + break; + default: + printk(KERN_ERR "unknown bast mode %d", mode); + BUG(); + } +} + +/* convert gfs lock-state to dlm lock-mode */ + +static int make_mode(const unsigned int lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + } + printk(KERN_ERR "unknown LM state %d", lmstate); + BUG(); + return -1; +} + +static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, + const int req) +{ + u32 lkf = 0; + + if (gfs_flags & LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags & LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if (gfs_flags & LM_FLAG_PRIORITY) { + lkf |= DLM_LKF_NOORDER; + lkf |= DLM_LKF_HEADQUE; + } + + if (gfs_flags & LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf |= DLM_LKF_ALTCW; + else if (req == DLM_LOCK_CW) + lkf |= DLM_LKF_ALTPR; + else + BUG(); + } + + if (lkid != 0) + lkf |= DLM_LKF_CONVERT; + + lkf |= DLM_LKF_VALBLK; + + return lkf; +} + +static unsigned int gdlm_lock(struct gfs2_glock *gl, + unsigned int req_state, unsigned int flags) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + int error; + int req; + u32 lkf; + + req = make_mode(req_state); + lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); + + /* + * Submit the actual lock request. 
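/*
 * Illustrative worked example of make_mode() and make_flags() (values
 * follow directly from the two helpers above): converting an already
 * held lock (sb_lkid != 0) to the shared state with LM_FLAG_TRY gives
 *
 *	req = make_mode(LM_ST_SHARED);             == DLM_LOCK_PR
 *	lkf = make_flags(lkid, LM_FLAG_TRY, req);  == DLM_LKF_NOQUEUE |
 *	                                              DLM_LKF_CONVERT |
 *	                                              DLM_LKF_VALBLK
 *
 * An LM_FLAG_ANY request for the shared state adds DLM_LKF_ALTCW, so
 * the DLM may grant DLM_LOCK_CW instead; gdlm_ast() detects that via
 * DLM_SBF_ALTMODE and reports LM_ST_DEFERRED back to the glock core.
 */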
+ */ + + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + if (error == -EAGAIN) + return 0; + if (error) + return LM_OUT_ERROR; + return LM_OUT_ASYNC; +} + +static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) +{ + struct gfs2_glock *gl = ptr; + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + int error; + + if (gl->gl_lksb.sb_lkid == 0) { + kmem_cache_free(cachep, gl); + return; + } + + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, + NULL, gl); + if (error) { + printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, error); + return; + } +} + +static void gdlm_cancel(struct gfs2_glock *gl) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); +} + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (fsname == NULL) { + fs_info(sdp, "no fsname found\n"); + return -EINVAL; + } + + error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm, + DLM_LSFL_FS | DLM_LSFL_NEWEXCL | + (ls->ls_nodir ? DLM_LSFL_NODIR : 0), + GDLM_LVB_SIZE); + if (error) + printk(KERN_ERR "dlm_new_lockspace error %d", error); + + return error; +} + +static void gdlm_unmount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (ls->ls_dlm) { + dlm_release_lockspace(ls->ls_dlm, 2); + ls->ls_dlm = NULL; + } +} + +static const match_table_t dlm_tokens = { + { Opt_jid, "jid=%d"}, + { Opt_id, "id=%d"}, + { Opt_first, "first=%d"}, + { Opt_nodir, "nodir=%d"}, + { Opt_err, NULL }, +}; + +const struct lm_lockops gfs2_dlm_ops = { + .lm_proto_name = "lock_dlm", + .lm_mount = gdlm_mount, + .lm_unmount = gdlm_unmount, + .lm_put_lock = gdlm_put_lock, + .lm_lock = gdlm_lock, + .lm_cancel = gdlm_cancel, + .lm_tokens = &dlm_tokens, +}; + diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c deleted file mode 100644 index d3657bc7938a..000000000000 --- a/fs/gfs2/locking.c +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
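/*
 * Illustrative only: how the glock core is expected to drive the
 * gfs2_dlm_ops table above.  The wrapper name is made up; the real
 * caller lives in glock.c and handles the asynchronous reply via
 * gfs2_glock_complete().
 */
static int example_submit(struct gfs2_glock *gl, unsigned int state,
			  unsigned int flags)
{
	const struct lm_lockops *ops = gl->gl_sbd->sd_lockstruct.ls_ops;
	unsigned int ret;

	if (!ops->lm_lock)		/* lock_nolock: nothing to ask for */
		return 0;
	ret = ops->lm_lock(gl, state, flags);	/* gdlm_lock() for lock_dlm */
	if (ret == LM_OUT_ASYNC)	/* reply arrives in gdlm_ast() */
		return 0;
	return (ret & LM_OUT_ERROR) ? -EIO : 0;
}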
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct lmh_wrapper { - struct list_head lw_list; - const struct lm_lockops *lw_ops; -}; - -struct nolock_lockspace { - unsigned int nl_lvb_size; -}; - -/** - * nolock_get_lock - get a lm_lock_t given a descripton of the lock - * @lockspace: the lockspace the lock lives in - * @name: the name of the lock - * @lockp: return the lm_lock_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_get_lock(void *lockspace, struct lm_lockname *name, - void **lockp) -{ - *lockp = lockspace; - return 0; -} - -/** - * nolock_put_lock - get rid of a lock structure - * @lock: the lock to throw away - * - */ - -static void nolock_put_lock(void *lock) -{ -} - -/** - * nolock_hold_lvb - hold on to a lock value block - * @lock: the lock the LVB is associated with - * @lvbp: return the lm_lvb_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_hold_lvb(void *lock, char **lvbp) -{ - struct nolock_lockspace *nl = lock; - int error = 0; - - *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL); - if (!*lvbp) - error = -ENOMEM; - - return error; -} - -/** - * nolock_unhold_lvb - release a LVB - * @lock: the lock the LVB is associated with - * @lvb: the lock value block - * - */ - -static void nolock_unhold_lvb(void *lock, char *lvb) -{ - kfree(lvb); -} - -static int nolock_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj); -static void nolock_unmount(void *lockspace); - -/* List of registered low-level locking protocols. A file system selects one - of them by name at mount time, e.g. lock_nolock, lock_dlm. 
*/ - -static const struct lm_lockops nolock_ops = { - .lm_proto_name = "lock_nolock", - .lm_mount = nolock_mount, - .lm_unmount = nolock_unmount, - .lm_get_lock = nolock_get_lock, - .lm_put_lock = nolock_put_lock, - .lm_hold_lvb = nolock_hold_lvb, - .lm_unhold_lvb = nolock_unhold_lvb, -}; - -static struct lmh_wrapper nolock_proto = { - .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list), - .lw_ops = &nolock_ops, -}; - -static LIST_HEAD(lmh_list); -static DEFINE_MUTEX(lmh_lock); - -static int nolock_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - char *c; - unsigned int jid; - struct nolock_lockspace *nl; - - c = strstr(host_data, "jid="); - if (!c) - jid = 0; - else { - c += 4; - sscanf(c, "%u", &jid); - } - - nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); - if (!nl) - return -ENOMEM; - - nl->nl_lvb_size = min_lvb_size; - - lockstruct->ls_jid = jid; - lockstruct->ls_first = 1; - lockstruct->ls_lvb_size = min_lvb_size; - lockstruct->ls_lockspace = nl; - lockstruct->ls_ops = &nolock_ops; - lockstruct->ls_flags = LM_LSFLAG_LOCAL; - - return 0; -} - -static void nolock_unmount(void *lockspace) -{ - struct nolock_lockspace *nl = lockspace; - kfree(nl); -} - -/** - * gfs2_register_lockproto - Register a low-level locking protocol - * @proto: the protocol definition - * - * Returns: 0 on success, -EXXX on failure - */ - -int gfs2_register_lockproto(const struct lm_lockops *proto) -{ - struct lmh_wrapper *lw; - - mutex_lock(&lmh_lock); - - list_for_each_entry(lw, &lmh_list, lw_list) { - if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { - mutex_unlock(&lmh_lock); - printk(KERN_INFO "GFS2: protocol %s already exists\n", - proto->lm_proto_name); - return -EEXIST; - } - } - - lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL); - if (!lw) { - mutex_unlock(&lmh_lock); - return -ENOMEM; - } - - lw->lw_ops = proto; - list_add(&lw->lw_list, &lmh_list); - - mutex_unlock(&lmh_lock); - - return 0; -} - -/** - * gfs2_unregister_lockproto - Unregister a low-level locking protocol - * @proto: the protocol definition - * - */ - -void gfs2_unregister_lockproto(const struct lm_lockops *proto) -{ - struct lmh_wrapper *lw; - - mutex_lock(&lmh_lock); - - list_for_each_entry(lw, &lmh_list, lw_list) { - if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { - list_del(&lw->lw_list); - mutex_unlock(&lmh_lock); - kfree(lw); - return; - } - } - - mutex_unlock(&lmh_lock); - - printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n", - proto->lm_proto_name); -} - -/** - * gfs2_mount_lockproto - Mount a lock protocol - * @proto_name - the name of the protocol - * @table_name - the name of the lock space - * @host_data - data specific to this host - * @cb - the callback to the code using the lock module - * @sdp - The GFS2 superblock - * @min_lvb_size - the mininum LVB size that the caller can deal with - * @flags - LM_MFLAG_* - * @lockstruct - a structure returned describing the mount - * - * Returns: 0 on success, -EXXX on failure - */ - -int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - struct lmh_wrapper *lw = NULL; - int try = 0; - int error, found; - - -retry: - mutex_lock(&lmh_lock); - - if (list_empty(&nolock_proto.lw_list)) - list_add(&nolock_proto.lw_list, &lmh_list); - - found 
= 0; - list_for_each_entry(lw, &lmh_list, lw_list) { - if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { - found = 1; - break; - } - } - - if (!found) { - if (!try && capable(CAP_SYS_MODULE)) { - try = 1; - mutex_unlock(&lmh_lock); - request_module(proto_name); - goto retry; - } - printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name); - error = -ENOENT; - goto out; - } - - if (lw->lw_ops->lm_owner && - !try_module_get(lw->lw_ops->lm_owner)) { - try = 0; - mutex_unlock(&lmh_lock); - msleep(1000); - goto retry; - } - - error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data, - min_lvb_size, flags, lockstruct, fskobj); - if (error) - module_put(lw->lw_ops->lm_owner); -out: - mutex_unlock(&lmh_lock); - return error; -} - -void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) -{ - mutex_lock(&lmh_lock); - if (lockstruct->ls_ops->lm_unmount) - lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); - if (lockstruct->ls_ops->lm_owner) - module_put(lockstruct->ls_ops->lm_owner); - mutex_unlock(&lmh_lock); -} - -/** - * gfs2_withdraw_lockproto - abnormally unmount a lock module - * @lockstruct: the lockstruct passed into mount - * - */ - -void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct) -{ - mutex_lock(&lmh_lock); - lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace); - if (lockstruct->ls_ops->lm_owner) - module_put(lockstruct->ls_ops->lm_owner); - mutex_unlock(&lmh_lock); -} - -EXPORT_SYMBOL_GPL(gfs2_register_lockproto); -EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto); - diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile deleted file mode 100644 index 2609bb6cd013..000000000000 --- a/fs/gfs2/locking/dlm/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o -lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o - diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c deleted file mode 100644 index 2482c9047505..000000000000 --- a/fs/gfs2/locking/dlm/lock.c +++ /dev/null @@ -1,708 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include "lock_dlm.h" - -static char junk_lvb[GDLM_LVB_SIZE]; - - -/* convert dlm lock-mode to gfs lock-state */ - -static s16 gdlm_make_lmstate(s16 dlmmode) -{ - switch (dlmmode) { - case DLM_LOCK_IV: - case DLM_LOCK_NL: - return LM_ST_UNLOCKED; - case DLM_LOCK_EX: - return LM_ST_EXCLUSIVE; - case DLM_LOCK_CW: - return LM_ST_DEFERRED; - case DLM_LOCK_PR: - return LM_ST_SHARED; - } - gdlm_assert(0, "unknown DLM mode %d", dlmmode); - return -1; -} - -/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm - thread gets to it. 
*/ - -static void queue_submit(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - list_add_tail(&lp->delay_list, &ls->submit); - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - -static void wake_up_ast(struct gdlm_lock *lp) -{ - clear_bit(LFL_AST_WAIT, &lp->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&lp->flags, LFL_AST_WAIT); -} - -static void gdlm_delete_lp(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - if (!list_empty(&lp->delay_list)) - list_del_init(&lp->delay_list); - ls->all_locks_count--; - spin_unlock(&ls->async_lock); - - kfree(lp); -} - -static void gdlm_queue_delayed(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - list_add_tail(&lp->delay_list, &ls->delayed); - spin_unlock(&ls->async_lock); -} - -static void process_complete(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - struct lm_async_cb acb; - - memset(&acb, 0, sizeof(acb)); - - if (lp->lksb.sb_status == -DLM_ECANCEL) { - log_info("complete dlm cancel %x,%llx flags %lx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { - if (lp->lksb.sb_status != -DLM_EUNLOCK) { - log_info("unlock sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - return; - } - - lp->cur = DLM_LOCK_IV; - lp->req = DLM_LOCK_IV; - lp->lksb.sb_lkid = 0; - - if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { - gdlm_delete_lp(lp); - return; - } - goto out; - } - - if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) - memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); - - if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { - if (lp->req == DLM_LOCK_PR) - lp->req = DLM_LOCK_CW; - else if (lp->req == DLM_LOCK_CW) - lp->req = DLM_LOCK_PR; - } - - /* - * A canceled lock request. The lock was just taken off the delayed - * list and was never even submitted to dlm. - */ - - if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { - log_info("complete internal cancel %x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - goto out; - } - - /* - * An error occured. - */ - - if (lp->lksb.sb_status) { - /* a "normal" error */ - if ((lp->lksb.sb_status == -EAGAIN) && - (lp->lkf & DLM_LKF_NOQUEUE)) { - lp->req = lp->cur; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - /* this could only happen with cancels I think */ - log_info("ast sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - return; - } - - /* - * This is an AST for an EX->EX conversion for sync_lvb from GFS. - */ - - if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { - wake_up_ast(lp); - return; - } - - /* - * A lock has been demoted to NL because it initially completed during - * BLOCK_LOCKS. Now it must be requested in the originally requested - * mode. 
- */ - - if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { - gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - - lp->cur = DLM_LOCK_NL; - lp->req = lp->prev_req; - lp->prev_req = DLM_LOCK_IV; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - set_bit(LFL_NOCACHE, &lp->flags); - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags)) - gdlm_queue_delayed(lp); - else - queue_submit(lp); - return; - } - - /* - * A request is granted during dlm recovery. It may be granted - * because the locks of a failed node were cleared. In that case, - * there may be inconsistent data beneath this lock and we must wait - * for recovery to complete to use it. When gfs recovery is done this - * granted lock will be converted to NL and then reacquired in this - * granted state. - */ - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags) && - lp->req != DLM_LOCK_NL) { - - lp->cur = lp->req; - lp->prev_req = lp->req; - lp->req = DLM_LOCK_NL; - lp->lkf |= DLM_LKF_CONVERT; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - log_debug("rereq %x,%llx id %x %d,%d", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->lksb.sb_lkid, lp->cur, lp->req); - - set_bit(LFL_REREQUEST, &lp->flags); - queue_submit(lp); - return; - } - - /* - * DLM demoted the lock to NL before it was granted so GFS must be - * told it cannot cache data for this lock. - */ - - if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) - set_bit(LFL_NOCACHE, &lp->flags); - -out: - /* - * This is an internal lock_dlm lock - */ - - if (test_bit(LFL_INLOCK, &lp->flags)) { - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - wake_up_ast(lp); - return; - } - - /* - * Normal completion of a lock request. Tell GFS it now has the lock. - */ - - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - - acb.lc_name = lp->lockname; - acb.lc_ret |= gdlm_make_lmstate(lp->cur); - - ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); -} - -static void gdlm_ast(void *astarg) -{ - struct gdlm_lock *lp = astarg; - clear_bit(LFL_ACTIVE, &lp->flags); - process_complete(lp); -} - -static void process_blocking(struct gdlm_lock *lp, int bast_mode) -{ - struct gdlm_ls *ls = lp->ls; - unsigned int cb = 0; - - switch (gdlm_make_lmstate(bast_mode)) { - case LM_ST_EXCLUSIVE: - cb = LM_CB_NEED_E; - break; - case LM_ST_DEFERRED: - cb = LM_CB_NEED_D; - break; - case LM_ST_SHARED: - cb = LM_CB_NEED_S; - break; - default: - gdlm_assert(0, "unknown bast mode %u", bast_mode); - } - - ls->fscb(ls->sdp, cb, &lp->lockname); -} - - -static void gdlm_bast(void *astarg, int mode) -{ - struct gdlm_lock *lp = astarg; - - if (!mode) { - printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - return; - } - - process_blocking(lp, mode); -} - -/* convert gfs lock-state to dlm lock-mode */ - -static s16 make_mode(s16 lmstate) -{ - switch (lmstate) { - case LM_ST_UNLOCKED: - return DLM_LOCK_NL; - case LM_ST_EXCLUSIVE: - return DLM_LOCK_EX; - case LM_ST_DEFERRED: - return DLM_LOCK_CW; - case LM_ST_SHARED: - return DLM_LOCK_PR; - } - gdlm_assert(0, "unknown LM state %d", lmstate); - return -1; -} - - -/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and - DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. 
*/ - -static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state) -{ - s16 cur = make_mode(cur_state); - if (lp->cur != DLM_LOCK_IV) - gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur); -} - -static inline unsigned int make_flags(struct gdlm_lock *lp, - unsigned int gfs_flags, - s16 cur, s16 req) -{ - unsigned int lkf = 0; - - if (gfs_flags & LM_FLAG_TRY) - lkf |= DLM_LKF_NOQUEUE; - - if (gfs_flags & LM_FLAG_TRY_1CB) { - lkf |= DLM_LKF_NOQUEUE; - lkf |= DLM_LKF_NOQUEUEBAST; - } - - if (gfs_flags & LM_FLAG_PRIORITY) { - lkf |= DLM_LKF_NOORDER; - lkf |= DLM_LKF_HEADQUE; - } - - if (gfs_flags & LM_FLAG_ANY) { - if (req == DLM_LOCK_PR) - lkf |= DLM_LKF_ALTCW; - else if (req == DLM_LOCK_CW) - lkf |= DLM_LKF_ALTPR; - } - - if (lp->lksb.sb_lkid != 0) { - lkf |= DLM_LKF_CONVERT; - } - - if (lp->lvb) - lkf |= DLM_LKF_VALBLK; - - return lkf; -} - -/* make_strname - convert GFS lock numbers to a string */ - -static inline void make_strname(const struct lm_lockname *lockname, - struct gdlm_strname *str) -{ - sprintf(str->name, "%8x%16llx", lockname->ln_type, - (unsigned long long)lockname->ln_number); - str->namelen = GDLM_STRNAME_BYTES; -} - -static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, - struct gdlm_lock **lpp) -{ - struct gdlm_lock *lp; - - lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS); - if (!lp) - return -ENOMEM; - - lp->lockname = *name; - make_strname(name, &lp->strname); - lp->ls = ls; - lp->cur = DLM_LOCK_IV; - INIT_LIST_HEAD(&lp->delay_list); - - spin_lock(&ls->async_lock); - ls->all_locks_count++; - spin_unlock(&ls->async_lock); - - *lpp = lp; - return 0; -} - -int gdlm_get_lock(void *lockspace, struct lm_lockname *name, - void **lockp) -{ - struct gdlm_lock *lp; - int error; - - error = gdlm_create_lp(lockspace, name, &lp); - - *lockp = lp; - return error; -} - -void gdlm_put_lock(void *lock) -{ - gdlm_delete_lp(lock); -} - -unsigned int gdlm_do_lock(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - int error, bast = 1; - - /* - * When recovery is in progress, delay lock requests for submission - * once recovery is done. Requests for recovery (NOEXP) and unlocks - * can pass. - */ - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) { - gdlm_queue_delayed(lp); - return LM_OUT_ASYNC; - } - - /* - * Submit the actual lock request. - */ - - if (test_bit(LFL_NOBAST, &lp->flags)) - bast = 0; - - set_bit(LFL_ACTIVE, &lp->flags); - - log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid, - lp->cur, lp->req, lp->lkf); - - error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf, - lp->strname.name, lp->strname.namelen, 0, gdlm_ast, - lp, bast ? 
gdlm_bast : NULL); - - if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { - lp->lksb.sb_status = -EAGAIN; - gdlm_ast(lp); - error = 0; - } - - if (error) { - log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x " - "flags=%lx", ls->fsname, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, error, - lp->cur, lp->req, lp->lkf, lp->flags); - return LM_OUT_ERROR; - } - return LM_OUT_ASYNC; -} - -static unsigned int gdlm_do_unlock(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - unsigned int lkf = 0; - int error; - - set_bit(LFL_DLM_UNLOCK, &lp->flags); - set_bit(LFL_ACTIVE, &lp->flags); - - if (lp->lvb) - lkf = DLM_LKF_VALBLK; - - log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->lksb.sb_lkid, lp->cur, lkf); - - error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp); - - if (error) { - log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x " - "flags=%lx", ls->fsname, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, error, - lp->cur, lp->req, lp->lkf, lp->flags); - return LM_OUT_ERROR; - } - return LM_OUT_ASYNC; -} - -unsigned int gdlm_lock(void *lock, unsigned int cur_state, - unsigned int req_state, unsigned int flags) -{ - struct gdlm_lock *lp = lock; - - if (req_state == LM_ST_UNLOCKED) - return gdlm_unlock(lock, cur_state); - - if (req_state == LM_ST_UNLOCKED) - return gdlm_unlock(lock, cur_state); - - clear_bit(LFL_DLM_CANCEL, &lp->flags); - if (flags & LM_FLAG_NOEXP) - set_bit(LFL_NOBLOCK, &lp->flags); - - check_cur_state(lp, cur_state); - lp->req = make_mode(req_state); - lp->lkf = make_flags(lp, flags, lp->cur, lp->req); - - return gdlm_do_lock(lp); -} - -unsigned int gdlm_unlock(void *lock, unsigned int cur_state) -{ - struct gdlm_lock *lp = lock; - - clear_bit(LFL_DLM_CANCEL, &lp->flags); - if (lp->cur == DLM_LOCK_IV) - return 0; - return gdlm_do_unlock(lp); -} - -void gdlm_cancel(void *lock) -{ - struct gdlm_lock *lp = lock; - struct gdlm_ls *ls = lp->ls; - int error, delay_list = 0; - - if (test_bit(LFL_DLM_CANCEL, &lp->flags)) - return; - - log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->flags); - - spin_lock(&ls->async_lock); - if (!list_empty(&lp->delay_list)) { - list_del_init(&lp->delay_list); - delay_list = 1; - } - spin_unlock(&ls->async_lock); - - if (delay_list) { - set_bit(LFL_CANCEL, &lp->flags); - set_bit(LFL_ACTIVE, &lp->flags); - gdlm_ast(lp); - return; - } - - if (!test_bit(LFL_ACTIVE, &lp->flags) || - test_bit(LFL_DLM_UNLOCK, &lp->flags)) { - log_info("gdlm_cancel skip %x,%llx flags %lx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->flags); - return; - } - - /* the lock is blocked in the dlm */ - - set_bit(LFL_DLM_CANCEL, &lp->flags); - set_bit(LFL_ACTIVE, &lp->flags); - - error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL, - NULL, lp); - - log_info("gdlm_cancel rv %d %x,%llx flags %lx", error, - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->flags); - - if (error == -EBUSY) - clear_bit(LFL_DLM_CANCEL, &lp->flags); -} - -static int gdlm_add_lvb(struct gdlm_lock *lp) -{ - char *lvb; - - lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); - if (!lvb) - return -ENOMEM; - - lp->lksb.sb_lvbptr = lvb; - lp->lvb = lvb; - return 0; -} - -static void gdlm_del_lvb(struct gdlm_lock *lp) -{ - kfree(lp->lvb); - lp->lvb = NULL; - lp->lksb.sb_lvbptr = NULL; -} - -static int gdlm_ast_wait(void *word) -{ - schedule(); 
- return 0; -} - -/* This can do a synchronous dlm request (requiring a lock_dlm thread to get - the completion) because gfs won't call hold_lvb() during a callback (from - the context of a lock_dlm thread). */ - -static int hold_null_lock(struct gdlm_lock *lp) -{ - struct gdlm_lock *lpn = NULL; - int error; - - if (lp->hold_null) { - printk(KERN_INFO "lock_dlm: lvb already held\n"); - return 0; - } - - error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn); - if (error) - goto out; - - lpn->lksb.sb_lvbptr = junk_lvb; - lpn->lvb = junk_lvb; - - lpn->req = DLM_LOCK_NL; - lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE; - set_bit(LFL_NOBAST, &lpn->flags); - set_bit(LFL_INLOCK, &lpn->flags); - set_bit(LFL_AST_WAIT, &lpn->flags); - - gdlm_do_lock(lpn); - wait_on_bit(&lpn->flags, LFL_AST_WAIT, gdlm_ast_wait, TASK_UNINTERRUPTIBLE); - error = lpn->lksb.sb_status; - if (error) { - printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n", - error); - gdlm_delete_lp(lpn); - lpn = NULL; - } -out: - lp->hold_null = lpn; - return error; -} - -/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get - the completion) because gfs may call unhold_lvb() during a callback (from - the context of a lock_dlm thread) which could cause a deadlock since the - other lock_dlm thread could be engaged in recovery. */ - -static void unhold_null_lock(struct gdlm_lock *lp) -{ - struct gdlm_lock *lpn = lp->hold_null; - - gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - lpn->lksb.sb_lvbptr = NULL; - lpn->lvb = NULL; - set_bit(LFL_UNLOCK_DELETE, &lpn->flags); - gdlm_do_unlock(lpn); - lp->hold_null = NULL; -} - -/* Acquire a NL lock because gfs requires the value block to remain - intact on the resource while the lvb is "held" even if it's holding no locks - on the resource. */ - -int gdlm_hold_lvb(void *lock, char **lvbp) -{ - struct gdlm_lock *lp = lock; - int error; - - error = gdlm_add_lvb(lp); - if (error) - return error; - - *lvbp = lp->lvb; - - error = hold_null_lock(lp); - if (error) - gdlm_del_lvb(lp); - - return error; -} - -void gdlm_unhold_lvb(void *lock, char *lvb) -{ - struct gdlm_lock *lp = lock; - - unhold_null_lock(lp); - gdlm_del_lvb(lp); -} - -void gdlm_submit_delayed(struct gdlm_ls *ls) -{ - struct gdlm_lock *lp, *safe; - - spin_lock(&ls->async_lock); - list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) { - list_del_init(&lp->delay_list); - list_add_tail(&lp->delay_list, &ls->submit); - } - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h deleted file mode 100644 index 3c98e7c6f93b..000000000000 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef LOCK_DLM_DOT_H -#define LOCK_DLM_DOT_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a - * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module - * as "lock_dlm". 
- */ - -#define GDLM_STRNAME_BYTES 24 -#define GDLM_LVB_SIZE 32 -#define GDLM_DROP_COUNT 0 -#define GDLM_DROP_PERIOD 60 -#define GDLM_NAME_LEN 128 - -/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number). - We sprintf these numbers into a 24 byte string of hex values to make them - human-readable (to make debugging simpler.) */ - -struct gdlm_strname { - unsigned char name[GDLM_STRNAME_BYTES]; - unsigned short namelen; -}; - -enum { - DFL_BLOCK_LOCKS = 0, - DFL_SPECTATOR = 1, - DFL_WITHDRAW = 2, -}; - -struct gdlm_ls { - u32 id; - int jid; - int first; - int first_done; - unsigned long flags; - struct kobject kobj; - char clustername[GDLM_NAME_LEN]; - char fsname[GDLM_NAME_LEN]; - int fsflags; - dlm_lockspace_t *dlm_lockspace; - lm_callback_t fscb; - struct gfs2_sbd *sdp; - int recover_jid; - int recover_jid_done; - int recover_jid_status; - spinlock_t async_lock; - struct list_head delayed; - struct list_head submit; - u32 all_locks_count; - wait_queue_head_t wait_control; - struct task_struct *thread; - wait_queue_head_t thread_wait; -}; - -enum { - LFL_NOBLOCK = 0, - LFL_NOCACHE = 1, - LFL_DLM_UNLOCK = 2, - LFL_DLM_CANCEL = 3, - LFL_SYNC_LVB = 4, - LFL_FORCE_PROMOTE = 5, - LFL_REREQUEST = 6, - LFL_ACTIVE = 7, - LFL_INLOCK = 8, - LFL_CANCEL = 9, - LFL_NOBAST = 10, - LFL_HEADQUE = 11, - LFL_UNLOCK_DELETE = 12, - LFL_AST_WAIT = 13, -}; - -struct gdlm_lock { - struct gdlm_ls *ls; - struct lm_lockname lockname; - struct gdlm_strname strname; - char *lvb; - struct dlm_lksb lksb; - - s16 cur; - s16 req; - s16 prev_req; - u32 lkf; /* dlm flags DLM_LKF_ */ - unsigned long flags; /* lock_dlm flags LFL_ */ - - struct list_head delay_list; /* delayed */ - struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ -}; - -#define gdlm_assert(assertion, fmt, args...) \ -do { \ - if (unlikely(!(assertion))) { \ - printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \ - "lock_dlm: " fmt "\n", \ - #assertion, ##args); \ - BUG(); \ - } \ -} while (0) - -#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg) -#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg) -#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg) -#ifdef LOCK_DLM_LOG_DEBUG -#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg) -#else -#define log_debug(fmt, arg...) -#endif - -/* sysfs.c */ - -int gdlm_sysfs_init(void); -void gdlm_sysfs_exit(void); -int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *); -void gdlm_kobject_release(struct gdlm_ls *); - -/* thread.c */ - -int gdlm_init_threads(struct gdlm_ls *); -void gdlm_release_threads(struct gdlm_ls *); - -/* lock.c */ - -void gdlm_submit_delayed(struct gdlm_ls *); -unsigned int gdlm_do_lock(struct gdlm_lock *); - -int gdlm_get_lock(void *, struct lm_lockname *, void **); -void gdlm_put_lock(void *); -unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int); -unsigned int gdlm_unlock(void *, unsigned int); -void gdlm_cancel(void *); -int gdlm_hold_lvb(void *, char **); -void gdlm_unhold_lvb(void *, char *); - -/* mount.c */ - -extern const struct lm_lockops gdlm_ops; - -#endif - diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c deleted file mode 100644 index b9a03a7ff801..000000000000 --- a/fs/gfs2/locking/dlm/main.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 
- * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include - -#include "lock_dlm.h" - -static int __init init_lock_dlm(void) -{ - int error; - - error = gfs2_register_lockproto(&gdlm_ops); - if (error) { - printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n", - error); - return error; - } - - error = gdlm_sysfs_init(); - if (error) { - gfs2_unregister_lockproto(&gdlm_ops); - return error; - } - - printk(KERN_INFO - "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); - return 0; -} - -static void __exit exit_lock_dlm(void) -{ - gdlm_sysfs_exit(); - gfs2_unregister_lockproto(&gdlm_ops); -} - -module_init(init_lock_dlm); -module_exit(exit_lock_dlm); - -MODULE_DESCRIPTION("GFS DLM Locking Module"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c deleted file mode 100644 index 1aa7eb6a0226..000000000000 --- a/fs/gfs2/locking/dlm/mount.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include "lock_dlm.h" - -const struct lm_lockops gdlm_ops; - - -static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, - int flags, char *table_name) -{ - struct gdlm_ls *ls; - char buf[256], *p; - - ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL); - if (!ls) - return NULL; - - ls->fscb = cb; - ls->sdp = sdp; - ls->fsflags = flags; - spin_lock_init(&ls->async_lock); - INIT_LIST_HEAD(&ls->delayed); - INIT_LIST_HEAD(&ls->submit); - init_waitqueue_head(&ls->thread_wait); - init_waitqueue_head(&ls->wait_control); - ls->jid = -1; - - strncpy(buf, table_name, 256); - buf[255] = '\0'; - - p = strchr(buf, ':'); - if (!p) { - log_info("invalid table_name \"%s\"", table_name); - kfree(ls); - return NULL; - } - *p = '\0'; - p++; - - strncpy(ls->clustername, buf, GDLM_NAME_LEN); - strncpy(ls->fsname, p, GDLM_NAME_LEN); - - return ls; -} - -static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir) -{ - char data[256]; - char *options, *x, *y; - int error = 0; - - memset(data, 0, 256); - strncpy(data, data_arg, 255); - - if (!strlen(data)) { - log_error("no mount options, (u)mount helpers not installed"); - return -EINVAL; - } - - for (options = data; (x = strsep(&options, ":")); ) { - if (!*x) - continue; - - y = strchr(x, '='); - if (y) - *y++ = 0; - - if (!strcmp(x, "jid")) { - if (!y) { - log_error("need argument to jid"); - error = -EINVAL; - break; - } - sscanf(y, "%u", &ls->jid); - - } else if (!strcmp(x, "first")) { - if (!y) { - log_error("need argument to first"); - error = -EINVAL; - break; - } - sscanf(y, "%u", &ls->first); - - } else if (!strcmp(x, "id")) { - if (!y) { - log_error("need argument to id"); - error = -EINVAL; - break; - } - sscanf(y, "%u", &ls->id); - - } else if (!strcmp(x, "nodir")) { - if (!y) { - log_error("need argument to nodir"); - error = -EINVAL; - break; - } - sscanf(y, "%u", nodir); - - } else { - log_error("unkonwn option: %s", x); - error = -EINVAL; - break; - } - } - - return error; -} - -static int gdlm_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - 
unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - struct gdlm_ls *ls; - int error = -ENOMEM, nodir = 0; - - if (min_lvb_size > GDLM_LVB_SIZE) - goto out; - - ls = init_gdlm(cb, cb_data, flags, table_name); - if (!ls) - goto out; - - error = make_args(ls, host_data, &nodir); - if (error) - goto out; - - error = gdlm_init_threads(ls); - if (error) - goto out_free; - - error = gdlm_kobject_setup(ls, fskobj); - if (error) - goto out_thread; - - error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), - &ls->dlm_lockspace, - DLM_LSFL_FS | DLM_LSFL_NEWEXCL | - (nodir ? DLM_LSFL_NODIR : 0), - GDLM_LVB_SIZE); - if (error) { - log_error("dlm_new_lockspace error %d", error); - goto out_kobj; - } - - lockstruct->ls_jid = ls->jid; - lockstruct->ls_first = ls->first; - lockstruct->ls_lockspace = ls; - lockstruct->ls_ops = &gdlm_ops; - lockstruct->ls_flags = 0; - lockstruct->ls_lvb_size = GDLM_LVB_SIZE; - return 0; - -out_kobj: - gdlm_kobject_release(ls); -out_thread: - gdlm_release_threads(ls); -out_free: - kfree(ls); -out: - return error; -} - -static void gdlm_unmount(void *lockspace) -{ - struct gdlm_ls *ls = lockspace; - - log_debug("unmount flags %lx", ls->flags); - - /* FIXME: serialize unmount and withdraw in case they - happen at once. Also, if unmount follows withdraw, - wait for withdraw to finish. */ - - if (test_bit(DFL_WITHDRAW, &ls->flags)) - goto out; - - gdlm_kobject_release(ls); - dlm_release_lockspace(ls->dlm_lockspace, 2); - gdlm_release_threads(ls); - BUG_ON(ls->all_locks_count); -out: - kfree(ls); -} - -static void gdlm_recovery_done(void *lockspace, unsigned int jid, - unsigned int message) -{ - char env_jid[20]; - char env_status[20]; - char *envp[] = { env_jid, env_status, NULL }; - struct gdlm_ls *ls = lockspace; - ls->recover_jid_done = jid; - ls->recover_jid_status = message; - sprintf(env_jid, "JID=%d", jid); - sprintf(env_status, "RECOVERY=%s", - message == LM_RD_SUCCESS ? "Done" : "Failed"); - kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); -} - -static void gdlm_others_may_mount(void *lockspace) -{ - char *message = "FIRSTMOUNT=Done"; - char *envp[] = { message, NULL }; - struct gdlm_ls *ls = lockspace; - ls->first_done = 1; - kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); -} - -/* Userspace gets the offline uevent, blocks new gfs locks on - other mounters, and lets us know (sets WITHDRAW flag). Then, - userspace leaves the mount group while we leave the lockspace. 
*/ - -static void gdlm_withdraw(void *lockspace) -{ - struct gdlm_ls *ls = lockspace; - - kobject_uevent(&ls->kobj, KOBJ_OFFLINE); - - wait_event_interruptible(ls->wait_control, - test_bit(DFL_WITHDRAW, &ls->flags)); - - dlm_release_lockspace(ls->dlm_lockspace, 2); - gdlm_release_threads(ls); - gdlm_kobject_release(ls); -} - -static int gdlm_plock(void *lockspace, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - struct gdlm_ls *ls = lockspace; - return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl); -} - -static int gdlm_punlock(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - struct gdlm_ls *ls = lockspace; - return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl); -} - -static int gdlm_plock_get(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - struct gdlm_ls *ls = lockspace; - return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl); -} - -const struct lm_lockops gdlm_ops = { - .lm_proto_name = "lock_dlm", - .lm_mount = gdlm_mount, - .lm_others_may_mount = gdlm_others_may_mount, - .lm_unmount = gdlm_unmount, - .lm_withdraw = gdlm_withdraw, - .lm_get_lock = gdlm_get_lock, - .lm_put_lock = gdlm_put_lock, - .lm_lock = gdlm_lock, - .lm_unlock = gdlm_unlock, - .lm_plock = gdlm_plock, - .lm_punlock = gdlm_punlock, - .lm_plock_get = gdlm_plock_get, - .lm_cancel = gdlm_cancel, - .lm_hold_lvb = gdlm_hold_lvb, - .lm_unhold_lvb = gdlm_unhold_lvb, - .lm_recovery_done = gdlm_recovery_done, - .lm_owner = THIS_MODULE, -}; - diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c deleted file mode 100644 index 9b7edcf7bd49..000000000000 --- a/fs/gfs2/locking/dlm/sysfs.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
- */ - -#include -#include - -#include "lock_dlm.h" - -static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name); -} - -static ssize_t block_show(struct gdlm_ls *ls, char *buf) -{ - ssize_t ret; - int val = 0; - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags)) - val = 1; - ret = sprintf(buf, "%d\n", val); - return ret; -} - -static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ssize_t ret = len; - int val; - - val = simple_strtol(buf, NULL, 0); - - if (val == 1) - set_bit(DFL_BLOCK_LOCKS, &ls->flags); - else if (val == 0) { - clear_bit(DFL_BLOCK_LOCKS, &ls->flags); - gdlm_submit_delayed(ls); - } else { - ret = -EINVAL; - } - return ret; -} - -static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf) -{ - ssize_t ret; - int val = 0; - - if (test_bit(DFL_WITHDRAW, &ls->flags)) - val = 1; - ret = sprintf(buf, "%d\n", val); - return ret; -} - -static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ssize_t ret = len; - int val; - - val = simple_strtol(buf, NULL, 0); - - if (val == 1) - set_bit(DFL_WITHDRAW, &ls->flags); - else - ret = -EINVAL; - wake_up(&ls->wait_control); - return ret; -} - -static ssize_t id_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%u\n", ls->id); -} - -static ssize_t jid_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->jid); -} - -static ssize_t first_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->first); -} - -static ssize_t first_done_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->first_done); -} - -static ssize_t recover_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->recover_jid); -} - -static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ls->recover_jid = simple_strtol(buf, NULL, 0); - ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid); - return len; -} - -static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->recover_jid_done); -} - -static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->recover_jid_status); -} - -struct gdlm_attr { - struct attribute attr; - ssize_t (*show)(struct gdlm_ls *, char *); - ssize_t (*store)(struct gdlm_ls *, const char *, size_t); -}; - -#define GDLM_ATTR(_name,_mode,_show,_store) \ -static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) - -GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); -GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, id_show, NULL); -GDLM_ATTR(jid, 0444, jid_show, NULL); -GDLM_ATTR(first, 0444, first_show, NULL); -GDLM_ATTR(first_done, 0444, first_done_show, NULL); -GDLM_ATTR(recover, 0644, recover_show, recover_store); -GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); -GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); - -static struct attribute *gdlm_attrs[] = { - &gdlm_attr_proto_name.attr, - &gdlm_attr_block.attr, - &gdlm_attr_withdraw.attr, - &gdlm_attr_id.attr, - &gdlm_attr_jid.attr, - &gdlm_attr_first.attr, - &gdlm_attr_first_done.attr, - &gdlm_attr_recover.attr, - &gdlm_attr_recover_done.attr, - &gdlm_attr_recover_status.attr, - NULL, -}; - -static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); - struct gdlm_attr *a = 
container_of(attr, struct gdlm_attr, attr); - return a->show ? a->show(ls, buf) : 0; -} - -static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); - struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr); - return a->store ? a->store(ls, buf, len) : len; -} - -static struct sysfs_ops gdlm_attr_ops = { - .show = gdlm_attr_show, - .store = gdlm_attr_store, -}; - -static struct kobj_type gdlm_ktype = { - .default_attrs = gdlm_attrs, - .sysfs_ops = &gdlm_attr_ops, -}; - -static struct kset *gdlm_kset; - -int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj) -{ - int error; - - ls->kobj.kset = gdlm_kset; - error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj, - "lock_module"); - if (error) - log_error("can't register kobj %d", error); - kobject_uevent(&ls->kobj, KOBJ_ADD); - - return error; -} - -void gdlm_kobject_release(struct gdlm_ls *ls) -{ - kobject_put(&ls->kobj); -} - -static int gdlm_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) -{ - struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); - add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname); - add_uevent_var(env, "LOCKPROTO=lock_dlm"); - return 0; -} - -static struct kset_uevent_ops gdlm_uevent_ops = { - .uevent = gdlm_uevent, -}; - - -int gdlm_sysfs_init(void) -{ - gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj); - if (!gdlm_kset) { - printk(KERN_WARNING "%s: can not create kset\n", __func__); - return -ENOMEM; - } - return 0; -} - -void gdlm_sysfs_exit(void) -{ - kset_unregister(gdlm_kset); -} - diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c deleted file mode 100644 index 38823efd698c..000000000000 --- a/fs/gfs2/locking/dlm/thread.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
- */ - -#include "lock_dlm.h" - -static inline int no_work(struct gdlm_ls *ls) -{ - int ret; - - spin_lock(&ls->async_lock); - ret = list_empty(&ls->submit); - spin_unlock(&ls->async_lock); - - return ret; -} - -static int gdlm_thread(void *data) -{ - struct gdlm_ls *ls = (struct gdlm_ls *) data; - struct gdlm_lock *lp = NULL; - - while (!kthread_should_stop()) { - wait_event_interruptible(ls->thread_wait, - !no_work(ls) || kthread_should_stop()); - - spin_lock(&ls->async_lock); - - if (!list_empty(&ls->submit)) { - lp = list_entry(ls->submit.next, struct gdlm_lock, - delay_list); - list_del_init(&lp->delay_list); - spin_unlock(&ls->async_lock); - gdlm_do_lock(lp); - spin_lock(&ls->async_lock); - } - spin_unlock(&ls->async_lock); - } - - return 0; -} - -int gdlm_init_threads(struct gdlm_ls *ls) -{ - struct task_struct *p; - int error; - - p = kthread_run(gdlm_thread, ls, "lock_dlm"); - error = IS_ERR(p); - if (error) { - log_error("can't start lock_dlm thread %d", error); - return error; - } - ls->thread = p; - - return 0; -} - -void gdlm_release_threads(struct gdlm_ls *ls) -{ - kthread_stop(ls->thread); -} - diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index ad305854bdc6..98918a756410 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4390f6f4047d..80e4f5f898bb 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 86fe06798711..a6892ed0840a 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "gfs2.h" @@ -47,8 +46,6 @@ static void gfs2_init_glock_once(void *foo) INIT_HLIST_NODE(&gl->gl_list); spin_lock_init(&gl->gl_spin); INIT_LIST_HEAD(&gl->gl_holders); - gl->gl_lvb = NULL; - atomic_set(&gl->gl_lvb_count, 0); INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); atomic_set(&gl->gl_ail_count, 0); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 09853620c951..870d65ae7ae2 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -19,7 +19,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index 3524ae81189b..fba502aa8b2d 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index dde4ead2c3be..a6d00e8ffe10 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index c2ad36330ca3..5eb57b044382 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 7fdeb14ddd1a..9200ef221716 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 93fe41b67f97..99d726f1c7a6 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -20,9 +20,10 @@ #include #include #include -#include #include #include +#include +#include #include "gfs2.h" #include "incore.h" @@ -560,57 +561,24 @@ static int 
gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) return ret; } +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + /** * gfs2_setlease - acquire/release a file lease * @file: the file pointer * @arg: lease type * @fl: file lock * + * We don't currently have a way to enforce a lease across the whole + * cluster; until we do, disable leases (by just returning -EINVAL), + * unless the administrator has requested purely local locking. + * * Returns: errno */ static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) { - struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); - - /* - * We don't currently have a way to enforce a lease across the whole - * cluster; until we do, disable leases (by just returning -EINVAL), - * unless the administrator has requested purely local locking. - */ - if (!sdp->sd_args.ar_localflocks) - return -EINVAL; - return generic_setlease(file, arg, fl); -} - -static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_plock_get( - sdp->sd_lockstruct.ls_lockspace, name, file, fl); - return error; -} - -static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_plock( - sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl); - return error; -} - -static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_punlock( - sdp->sd_lockstruct.ls_lockspace, name, file, fl); - return error; + return -EINVAL; } /** @@ -626,9 +594,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); - struct lm_lockname name = - { .ln_number = ip->i_no_addr, - .ln_type = LM_TYPE_PLOCK }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; @@ -640,12 +606,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) cmd = F_SETLK; fl->fl_type = F_UNLCK; } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; if (IS_GETLK(cmd)) - return gfs2_lm_plock_get(sdp, &name, file, fl); + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); else if (fl->fl_type == F_UNLCK) - return gfs2_lm_punlock(sdp, &name, file, fl); + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); else - return gfs2_lm_plock(sdp, &name, file, cmd, fl); + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); } static int do_flock(struct file *file, int cmd, struct file_lock *fl) @@ -732,7 +700,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) } } -const struct file_operations gfs2_file_fops = { +const struct file_operations *gfs2_file_fops = &(const struct file_operations){ .llseek = gfs2_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, @@ -750,7 +718,7 @@ const struct file_operations gfs2_file_fops = { .setlease = gfs2_setlease, }; -const struct file_operations gfs2_dir_fops = { +const struct file_operations *gfs2_dir_fops = &(const struct file_operations){ .readdir = gfs2_readdir, .unlocked_ioctl = 
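/*
 * Net effect of the gfs2_lock() rewrite above, for a clustered mount
 * (illustrative; error paths omitted):
 *
 *	fcntl(fd, F_GETLK, &fl)  -> dlm_posix_get(ls->ls_dlm, i_no_addr, ...)
 *	fcntl(fd, F_SETLK, &fl)  -> dlm_posix_lock() / dlm_posix_unlock()
 *
 * The lock is keyed by the inode's block address (ip->i_no_addr), and a
 * withdrawn filesystem (SDF_SHUTDOWN set) fails the request early with
 * -EIO.
 */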
gfs2_ioctl, .open = gfs2_open, @@ -760,7 +728,9 @@ const struct file_operations gfs2_dir_fops = { .flock = gfs2_flock, }; -const struct file_operations gfs2_file_fops_nolock = { +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){ .llseek = gfs2_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, @@ -773,10 +743,10 @@ const struct file_operations gfs2_file_fops_nolock = { .fsync = gfs2_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, - .setlease = gfs2_setlease, + .setlease = generic_setlease, }; -const struct file_operations gfs2_dir_fops_nolock = { +const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){ .readdir = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 402b6a2cd2c9..95bb33e41a76 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -17,7 +17,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" @@ -627,13 +626,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp) return rc; } -static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) +static void gfs2_others_may_mount(struct gfs2_sbd *sdp) { - if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount) - return; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_others_may_mount( - sdp->sd_lockstruct.ls_lockspace); + char *message = "FIRSTMOUNT=Done"; + char *envp[] = { message, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_first_done = 1; + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } /** @@ -793,7 +792,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) } } - gfs2_lm_others_may_mount(sdp); + gfs2_others_may_mount(sdp); } else if (!sdp->sd_args.ar_spectator) { error = gfs2_recover_journal(sdp->sd_jdesc); if (error) { @@ -1002,7 +1001,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) goto fail_quotad; sdp->sd_log_flush_time = jiffies; - sdp->sd_jindex_refresh_time = jiffies; p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); error = IS_ERR(p); @@ -1030,6 +1028,17 @@ fail: return error; } +static const match_table_t nolock_tokens = { + { Opt_jid, "jid=%d\n", }, + { Opt_err, NULL }, +}; + +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_put_lock = kmem_cache_free, + .lm_tokens = &nolock_tokens, +}; + /** * gfs2_lm_mount - mount a locking protocol * @sdp: the filesystem @@ -1041,31 +1050,73 @@ fail: static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) { - char *proto = sdp->sd_proto_name; - char *table = sdp->sd_table_name; - int flags = LM_MFLAG_CONV_NODROP; - int error; + const struct lm_lockops *lm; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + struct gfs2_args *args = &sdp->sd_args; + const char *proto = sdp->sd_proto_name; + const char *table = sdp->sd_table_name; + const char *fsname; + char *o, *options; + int ret; - if (sdp->sd_args.ar_spectator) - flags |= LM_MFLAG_SPECTATOR; + if (!strcmp("lock_nolock", proto)) { + lm = &nolock_ops; + sdp->sd_args.ar_localflocks = 1; + sdp->sd_args.ar_localcaching = 1; +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + } else if (!strcmp("lock_dlm", proto)) { + lm = &gfs2_dlm_ops; +#endif + } else { + printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); + return -ENOENT; + } fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); - error = gfs2_mount_lockproto(proto, table, 
sdp->sd_args.ar_hostdata, - gfs2_glock_cb, sdp, - GFS2_MIN_LVB_SIZE, flags, - &sdp->sd_lockstruct, &sdp->sd_kobj); - if (error) { - fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n", - proto, table, sdp->sd_args.ar_hostdata); - goto out; - } + ls->ls_ops = lm; + ls->ls_first = 1; + ls->ls_id = 0; - if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || - gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= - GFS2_MIN_LVB_SIZE)) { - gfs2_unmount_lockproto(&sdp->sd_lockstruct); - goto out; + for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { + substring_t tmp[MAX_OPT_ARGS]; + int token, option; + + if (!o || !*o) + continue; + + token = match_token(o, *lm->lm_tokens, tmp); + switch (token) { + case Opt_jid: + ret = match_int(&tmp[0], &option); + if (ret || option < 0) + goto hostdata_error; + ls->ls_jid = option; + break; + case Opt_id: + ret = match_int(&tmp[0], &option); + if (ret) + goto hostdata_error; + ls->ls_id = option; + break; + case Opt_first: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_first = option; + break; + case Opt_nodir: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_nodir = option; + break; + case Opt_err: + default: +hostdata_error: + fs_info(sdp, "unknown hostdata (%s)\n", o); + return -EINVAL; + } } if (sdp->sd_args.ar_spectator) @@ -1074,22 +1125,25 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, sdp->sd_lockstruct.ls_jid); - fs_info(sdp, "Joined cluster. Now mounting FS...\n"); - - if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && - !sdp->sd_args.ar_ignore_local_fs) { - sdp->sd_args.ar_localflocks = 1; - sdp->sd_args.ar_localcaching = 1; + fsname = strchr(table, ':'); + if (fsname) + fsname++; + if (lm->lm_mount == NULL) { + fs_info(sdp, "Now mounting FS...\n"); + return 0; } - -out: - return error; + ret = lm->lm_mount(sdp, fsname); + if (ret == 0) + fs_info(sdp, "Joined cluster. 
Now mounting FS...\n"); + return ret; } void gfs2_lm_unmount(struct gfs2_sbd *sdp) { - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - gfs2_unmount_lockproto(&sdp->sd_lockstruct); + const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + lm->lm_unmount) + lm->lm_unmount(sdp); } /** diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 49877546beb9..abd5429ae285 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index f0699ac453f7..4ecdad026eaf 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e8ef0f80fb11..8d53f66b5bcc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include @@ -108,7 +107,7 @@ int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); - gfs2_lvb_unhold(qd->qd_gl); + gfs2_glock_put(qd->qd_gl); atomic_dec(&sdp->sd_quota_count); /* Delete it from the common reclaim list */ @@ -157,11 +156,6 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, if (error) goto fail; - error = gfs2_lvb_hold(qd->qd_gl); - gfs2_glock_put(qd->qd_gl); - if (error) - goto fail; - *qdp = qd; return 0; @@ -211,7 +205,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, if (qd || !create) { if (new_qd) { - gfs2_lvb_unhold(new_qd->qd_gl); + gfs2_glock_put(new_qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, new_qd); } *qdp = qd; @@ -1280,7 +1274,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, qd->qd_slot_count == 1); gfs2_assert_warn(sdp, !qd->qd_bh_count); - gfs2_lvb_unhold(qd->qd_gl); + gfs2_glock_put(qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, qd); spin_lock(&qd_lru_lock); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index efd09c3d2b26..247e8f7d6b3d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -427,20 +426,23 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea } -static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, - unsigned int message) +static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message) { - if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done) - return; - - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_recovery_done( - sdp->sd_lockstruct.ls_lockspace, jid, message); + char env_jid[20]; + char env_status[20]; + char *envp[] = { env_jid, env_status, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid_done = jid; + ls->ls_recover_jid_status = message; + sprintf(env_jid, "JID=%d", jid); + sprintf(env_status, "RECOVERY=%s", + message == LM_RD_SUCCESS ? 
"Done" : "Failed"); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } - /** - * gfs2_recover_journal - recovery a given journal + * gfs2_recover_journal - recover a given journal * @jd: the struct gfs2_jdesc describing the journal * * Acquire the journal's lock, check to see if the journal is clean, and @@ -561,7 +563,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) gfs2_glock_dq_uninit(&ji_gh); - gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) gfs2_glock_dq_uninit(&j_gh); @@ -581,7 +583,7 @@ fail_gunlock_j: fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); fail: - gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); return error; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8b01c635d925..ba5a021b1c57 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 141b781f2fcc..7cf302b135ce 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a58a120dac92..a78997ea5037 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -14,9 +14,8 @@ #include #include #include -#include -#include #include +#include #include "gfs2.h" #include "incore.h" @@ -224,14 +223,145 @@ static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name) LOCKSTRUCT_ATTR(jid, "%u\n"); LOCKSTRUCT_ATTR(first, "%u\n"); -LOCKSTRUCT_ATTR(lvb_size, "%u\n"); -LOCKSTRUCT_ATTR(flags, "%d\n"); static struct attribute *lockstruct_attrs[] = { &lockstruct_attr_jid.attr, &lockstruct_attr_first.attr, - &lockstruct_attr_lvb_size.attr, - &lockstruct_attr_flags.attr, + NULL, +}; + +/* + * lock_module. 
Originally from lock_dlm + */ + +static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf) +{ + const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops; + return sprintf(buf, "%s\n", ops->lm_proto_name); +} + +static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret; + int val = 0; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if (val == 1) + set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + else if (val == 0) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + smp_mb__after_clear_bit(); + gfs2_glock_thaw(sdp); + } else { + ret = -EINVAL; + } + return ret; +} + +static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%u\n", ls->ls_id); +} + +static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first); +} + +static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first_done); +} + +static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid); +} + +static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid != jid) + continue; + jd->jd_dirty = 1; + break; + } + spin_unlock(&sdp->sd_jindex_spin); +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid = simple_strtol(buf, NULL, 0); + gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid); + if (sdp->sd_recoverd_process) + wake_up_process(sdp->sd_recoverd_process); + return len; +} + +static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_done); +} + +static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_status); +} + +struct gdlm_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *sdp, char *); + ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t); +}; + +#define GDLM_ATTR(_name,_mode,_show,_store) \ +static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) + +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(id, 0444, lkid_show, NULL); +GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0644, recover_show, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); + +static struct attribute *lock_module_attrs[] = { + &gdlm_attr_proto_name.attr, + &gdlm_attr_block.attr, + &gdlm_attr_withdraw.attr, + &gdlm_attr_id.attr, + &lockstruct_attr_jid.attr, + 
&gdlm_attr_first.attr, + &gdlm_attr_first_done.attr, + &gdlm_attr_recover.attr, + &gdlm_attr_recover_done.attr, + &gdlm_attr_recover_status.attr, NULL, }; @@ -412,6 +542,11 @@ static struct attribute_group tune_group = { .attrs = tune_attrs, }; +static struct attribute_group lock_module_group = { + .name = "lock_module", + .attrs = lock_module_attrs, +}; + int gfs2_sys_fs_add(struct gfs2_sbd *sdp) { int error; @@ -434,9 +569,15 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail_args; + error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); + if (error) + goto fail_tune; + kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); return 0; +fail_tune: + sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_args: sysfs_remove_group(&sdp->sd_kobj, &args_group); fail_lockstruct: @@ -453,6 +594,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &args_group); sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index f677b8a83f0c..33cd523ec97e 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -12,9 +12,8 @@ #include #include #include -#include #include -#include +#include #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 374f50e95496..9d12b1118ba0 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "gfs2.h" @@ -35,6 +34,8 @@ void gfs2_assert_i(struct gfs2_sbd *sdp) int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) { + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + const struct lm_lockops *lm = ls->ls_ops; va_list args; if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) @@ -47,8 +48,12 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) fs_err(sdp, "about to withdraw this file system\n"); BUG_ON(sdp->sd_args.ar_debug); - fs_err(sdp, "telling LM to withdraw\n"); - gfs2_withdraw_lockproto(&sdp->sd_lockstruct); + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + + if (lm->lm_unmount) { + fs_err(sdp, "telling LM to unmount\n"); + lm->lm_unmount(sdp); + } fs_err(sdp, "withdrawn\n"); dump_stack(); diff --git a/include/linux/lm_interface.h b/include/linux/lm_interface.h deleted file mode 100644 index 2ed8fa1b762b..000000000000 --- a/include/linux/lm_interface.h +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __LM_INTERFACE_DOT_H__ -#define __LM_INTERFACE_DOT_H__ - - -typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); - -/* - * lm_mount() flags - * - * LM_MFLAG_SPECTATOR - * GFS is asking to join the filesystem's lockspace, but it doesn't want to - * modify the filesystem. The lock module shouldn't assign a journal to the FS - * mount. It shouldn't send recovery callbacks to the FS mount. If the node - * dies or withdraws, all locks can be wiped immediately. - * - * LM_MFLAG_CONV_NODROP - * Do not allow the dlm to internally resolve conversion deadlocks by demoting - * the lock to unlocked and then reacquiring it in the requested mode. 
Instead, - * it should cancel the request and return LM_OUT_CONV_DEADLK. - */ - -#define LM_MFLAG_SPECTATOR 0x00000001 -#define LM_MFLAG_CONV_NODROP 0x00000002 - -/* - * lm_lockstruct flags - * - * LM_LSFLAG_LOCAL - * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS - * can make single-node optimizations. - */ - -#define LM_LSFLAG_LOCAL 0x00000001 - -/* - * lm_lockname types - */ - -#define LM_TYPE_RESERVED 0x00 -#define LM_TYPE_NONDISK 0x01 -#define LM_TYPE_INODE 0x02 -#define LM_TYPE_RGRP 0x03 -#define LM_TYPE_META 0x04 -#define LM_TYPE_IOPEN 0x05 -#define LM_TYPE_FLOCK 0x06 -#define LM_TYPE_PLOCK 0x07 -#define LM_TYPE_QUOTA 0x08 -#define LM_TYPE_JOURNAL 0x09 - -/* - * lm_lock() states - * - * SHARED is compatible with SHARED, not with DEFERRED or EX. - * DEFERRED is compatible with DEFERRED, not with SHARED or EX. - */ - -#define LM_ST_UNLOCKED 0 -#define LM_ST_EXCLUSIVE 1 -#define LM_ST_DEFERRED 2 -#define LM_ST_SHARED 3 - -/* - * lm_lock() flags - * - * LM_FLAG_TRY - * Don't wait to acquire the lock if it can't be granted immediately. - * - * LM_FLAG_TRY_1CB - * Send one blocking callback if TRY is set and the lock is not granted. - * - * LM_FLAG_NOEXP - * GFS sets this flag on lock requests it makes while doing journal recovery. - * These special requests should not be blocked due to the recovery like - * ordinary locks would be. - * - * LM_FLAG_ANY - * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may - * also be granted in SHARED. The preferred state is whichever is compatible - * with other granted locks, or the specified state if no other locks exist. - * - * LM_FLAG_PRIORITY - * Override fairness considerations. Suppose a lock is held in a shared state - * and there is a pending request for the deferred state. A shared lock - * request with the priority flag would be allowed to bypass the deferred - * request and directly join the other shared lock. A shared lock request - * without the priority flag might be forced to wait until the deferred - * requested had acquired and released the lock. - */ - -#define LM_FLAG_TRY 0x00000001 -#define LM_FLAG_TRY_1CB 0x00000002 -#define LM_FLAG_NOEXP 0x00000004 -#define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 - -/* - * lm_lock() and lm_async_cb return flags - * - * LM_OUT_ST_MASK - * Masks the lower two bits of lock state in the returned value. - * - * LM_OUT_CACHEABLE - * The lock hasn't been released so GFS can continue to cache data for it. - * - * LM_OUT_CANCELED - * The lock request was canceled. - * - * LM_OUT_ASYNC - * The result of the request will be returned in an LM_CB_ASYNC callback. - * - * LM_OUT_CONV_DEADLK - * The lock request was canceled do to a conversion deadlock. - */ - -#define LM_OUT_ST_MASK 0x00000003 -#define LM_OUT_CANCELED 0x00000008 -#define LM_OUT_ASYNC 0x00000080 -#define LM_OUT_ERROR 0x00000100 - -/* - * lm_callback_t types - * - * LM_CB_NEED_E LM_CB_NEED_D LM_CB_NEED_S - * Blocking callback, a remote node is requesting the given lock in - * EXCLUSIVE, DEFERRED, or SHARED. - * - * LM_CB_NEED_RECOVERY - * The given journal needs to be recovered. - * - * LM_CB_ASYNC - * The given lock has been granted. 
- */ - -#define LM_CB_NEED_E 257 -#define LM_CB_NEED_D 258 -#define LM_CB_NEED_S 259 -#define LM_CB_NEED_RECOVERY 260 -#define LM_CB_ASYNC 262 - -/* - * lm_recovery_done() messages - */ - -#define LM_RD_GAVEUP 308 -#define LM_RD_SUCCESS 309 - - -struct lm_lockname { - u64 ln_number; - unsigned int ln_type; -}; - -#define lm_name_equal(name1, name2) \ - (((name1)->ln_number == (name2)->ln_number) && \ - ((name1)->ln_type == (name2)->ln_type)) \ - -struct lm_async_cb { - struct lm_lockname lc_name; - int lc_ret; -}; - -struct lm_lockstruct; - -struct lm_lockops { - const char *lm_proto_name; - - /* - * Mount/Unmount - */ - - int (*lm_mount) (char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj); - - void (*lm_others_may_mount) (void *lockspace); - - void (*lm_unmount) (void *lockspace); - - void (*lm_withdraw) (void *lockspace); - - /* - * Lock oriented operations - */ - - int (*lm_get_lock) (void *lockspace, struct lm_lockname *name, void **lockp); - - void (*lm_put_lock) (void *lock); - - unsigned int (*lm_lock) (void *lock, unsigned int cur_state, - unsigned int req_state, unsigned int flags); - - unsigned int (*lm_unlock) (void *lock, unsigned int cur_state); - - void (*lm_cancel) (void *lock); - - int (*lm_hold_lvb) (void *lock, char **lvbp); - void (*lm_unhold_lvb) (void *lock, char *lvb); - - /* - * Posix Lock oriented operations - */ - - int (*lm_plock_get) (void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl); - - int (*lm_plock) (void *lockspace, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl); - - int (*lm_punlock) (void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl); - - /* - * Client oriented operations - */ - - void (*lm_recovery_done) (void *lockspace, unsigned int jid, - unsigned int message); - - struct module *lm_owner; -}; - -/* - * lm_mount() return values - * - * ls_jid - the journal ID this node should use - * ls_first - this node is the first to mount the file system - * ls_lvb_size - size in bytes of lock value blocks - * ls_lockspace - lock module's context for this file system - * ls_ops - lock module's functions - * ls_flags - lock module features - */ - -struct lm_lockstruct { - unsigned int ls_jid; - unsigned int ls_first; - unsigned int ls_lvb_size; - void *ls_lockspace; - const struct lm_lockops *ls_ops; - int ls_flags; -}; - -/* - * Lock module bottom interface. A lock module makes itself available to GFS - * with these functions. - */ - -int gfs2_register_lockproto(const struct lm_lockops *proto); -void gfs2_unregister_lockproto(const struct lm_lockops *proto); - -/* - * Lock module top interface. GFS calls these functions when mounting or - * unmounting a file system. 
- */ - -int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj); - -void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct); - -void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct); - -#endif /* __LM_INTERFACE_DOT_H__ */ - -- cgit v1.2.3-71-gd317 From 32ca163c9cdb33151d79e95a7cf244f62b5d4418 Mon Sep 17 00:00:00 2001 From: Petros Koutoupis Date: Tue, 10 Mar 2009 08:25:54 +0100 Subject: block: genhd.h comment needs updating The include/linux/genhd.h file, on line 338-352 declares some function prototypes in which the comment on line 338 states that the definition of these prototypes are to be found at drivers/block/genhd.c. The problem is that genhd.c has been relocated to block/genhd.c. See attached patch to correct this minor cosmetic typo. Signed-off-by: Petros Koutoupis Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 16948eaecae3..56946b21ab78 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -336,7 +336,7 @@ static inline void part_dec_in_flight(struct hd_struct *part) /* drivers/block/ll_rw_blk.c */ extern void part_round_stats(int cpu, struct hd_struct *part); -/* drivers/block/genhd.c */ +/* block/genhd.c */ extern int get_blkdev_list(char *, int); extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); -- cgit v1.2.3-71-gd317 From 6d2a78e783416ba99e36beb1d4395b785b34e867 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 10 Mar 2009 08:27:39 +0100 Subject: block: add private bio_set for bio integrity allocations The integrity bio allocation needs its own bio_set to avoid violating the mempool allocation rules and risking deadlocks. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 85 ++++++++++++++++------------------------------------- fs/bio.c | 9 ++---- include/linux/bio.h | 18 +++--------- 3 files changed, 32 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index fe2b1aa2464e..31c46a241bac 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -26,23 +26,23 @@ #include static struct kmem_cache *bio_integrity_slab __read_mostly; +static mempool_t *bio_integrity_pool; +static struct bio_set *integrity_bio_set; static struct workqueue_struct *kintegrityd_wq; /** - * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to * @gfp_mask: Memory allocation mask * @nr_vecs: Number of integrity metadata scatter-gather elements - * @bs: bio_set to allocate from * * Description: This function prepares a bio for attaching integrity * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. 
*/ -struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs, - struct bio_set *bs) +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) { struct bio_integrity_payload *bip; struct bio_vec *iv; @@ -50,7 +50,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, BUG_ON(bio == NULL); - bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + bip = mempool_alloc(bio_integrity_pool, gfp_mask); if (unlikely(bip == NULL)) { printk(KERN_ERR "%s: could not alloc bip\n", __func__); return NULL; @@ -58,10 +58,10 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, memset(bip, 0, sizeof(*bip)); - iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); + iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set); if (unlikely(iv == NULL)) { printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); - mempool_free(bip, bs->bio_integrity_pool); + mempool_free(bip, bio_integrity_pool); return NULL; } @@ -72,35 +72,16 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, return bip; } -EXPORT_SYMBOL(bio_integrity_alloc_bioset); - -/** - * bio_integrity_alloc - Allocate integrity payload and attach it to bio - * @bio: bio to attach integrity metadata to - * @gfp_mask: Memory allocation mask - * @nr_vecs: Number of integrity metadata scatter-gather elements - * - * Description: This function prepares a bio for attaching integrity - * metadata. nr_vecs specifies the maximum number of pages containing - * integrity metadata that can be attached. - */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs) -{ - return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); -} EXPORT_SYMBOL(bio_integrity_alloc); /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed - * @bs: bio_set this bio was allocated from * * Description: Used to free the integrity portion of a bio. Usually * called from bio_free(). 
*/ -void bio_integrity_free(struct bio *bio, struct bio_set *bs) +void bio_integrity_free(struct bio *bio) { struct bio_integrity_payload *bip = bio->bi_integrity; @@ -111,8 +92,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs) && bip->bip_buf != NULL) kfree(bip->bip_buf); - bvec_free_bs(bs, bip->bip_vec, bip->bip_pool); - mempool_free(bip, bs->bio_integrity_pool); + bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool); + mempool_free(bip, bio_integrity_pool); bio->bi_integrity = NULL; } @@ -686,19 +667,17 @@ EXPORT_SYMBOL(bio_integrity_split); * @bio: New bio * @bio_src: Original bio * @gfp_mask: Memory allocation mask - * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ -int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask, struct bio_set *bs) +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); if (bip == NULL) return -EIO; @@ -714,37 +693,25 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, } EXPORT_SYMBOL(bio_integrity_clone); -int bioset_integrity_create(struct bio_set *bs, int pool_size) +static int __init bio_integrity_init(void) { - bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, - bio_integrity_slab); - if (!bs->bio_integrity_pool) - return -1; - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); + kintegrityd_wq = create_workqueue("kintegrityd"); -void bioset_integrity_free(struct bio_set *bs) -{ - if (bs->bio_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); -} -EXPORT_SYMBOL(bioset_integrity_free); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); -void __init bio_integrity_init_slab(void) -{ bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, SLAB_HWCACHE_ALIGN|SLAB_PANIC); -} -static int __init integrity_init(void) -{ - kintegrityd_wq = create_workqueue("kintegrityd"); + bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE, + bio_integrity_slab); + if (!bio_integrity_pool) + panic("bio_integrity: can't allocate bip pool\n"); - if (!kintegrityd_wq) - panic("Failed to create kintegrityd\n"); + integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!integrity_bio_set) + panic("bio_integrity: can't allocate bio_set\n"); return 0; } -subsys_initcall(integrity_init); +subsys_initcall(bio_integrity_init); diff --git a/fs/bio.c b/fs/bio.c index 9cc1430b4495..a040cde7f6fd 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -248,7 +248,7 @@ void bio_free(struct bio *bio, struct bio_set *bs) bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); if (bio_integrity(bio)) - bio_integrity_free(bio, bs); + bio_integrity_free(bio); /* * If we have front padding, adjust the bio pointer before freeing @@ -466,7 +466,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask); if (ret < 0) { bio_put(b); @@ -1529,7 +1529,6 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); - bioset_integrity_free(bs); biovec_free_pools(bs); bio_put_slab(bs); @@ -1570,9 +1569,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if 
(bioset_integrity_create(bs, pool_size)) - goto bad; - if (!biovec_create_pools(bs, pool_size)) return bs; @@ -1610,7 +1606,6 @@ static int __init init_bio(void) if (!bio_slabs) panic("bio: can't allocate bios\n"); - bio_integrity_init_slab(); biovec_init_slabs(); fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); diff --git a/include/linux/bio.h b/include/linux/bio.h index d8bd43bfdcf5..b05b1d4d17d2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -426,9 +426,6 @@ struct bio_set { unsigned int front_pad; mempool_t *bio_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t *bio_integrity_pool; -#endif mempool_t *bvec_pool; }; @@ -519,9 +516,8 @@ static inline int bio_has_data(struct bio *bio) #define bio_integrity(bio) (bio->bi_integrity != NULL) -extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); -extern void bio_integrity_free(struct bio *, struct bio_set *); +extern void bio_integrity_free(struct bio *); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern int bio_integrity_enabled(struct bio *bio); extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); @@ -531,27 +527,21 @@ extern void bio_integrity_endio(struct bio *, int); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); extern void bio_integrity_split(struct bio *, struct bio_pair *, int); -extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t, struct bio_set *); -extern int bioset_integrity_create(struct bio_set *, int); -extern void bioset_integrity_free(struct bio_set *); -extern void bio_integrity_init_slab(void); +extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); #else /* CONFIG_BLK_DEV_INTEGRITY */ #define bio_integrity(a) (0) -#define bioset_integrity_create(a, b) (0) #define bio_integrity_prep(a) (0) #define bio_integrity_enabled(a) (0) -#define bio_integrity_clone(a, b, c,d ) (0) -#define bioset_integrity_free(a) do { } while (0) -#define bio_integrity_free(a, b) do { } while (0) +#define bio_integrity_clone(a, b, c) (0) +#define bio_integrity_free(a) do { } while (0) #define bio_integrity_endio(a, b) do { } while (0) #define bio_integrity_advance(a, b) do { } while (0) #define bio_integrity_trim(a, b, c) do { } while (0) #define bio_integrity_split(a, b, c) do { } while (0) #define bio_integrity_set_tag(a, b, c) do { } while (0) #define bio_integrity_get_tag(a, b, c) do { } while (0) -#define bio_integrity_init_slab(a) do { } while (0) #endif /* CONFIG_BLK_DEV_INTEGRITY */ -- cgit v1.2.3-71-gd317 From d399228646e26db315d6233bed65ec9d08c57f57 Mon Sep 17 00:00:00 2001 From: Petros Koutoupis Date: Wed, 11 Mar 2009 10:49:35 +0100 Subject: block: genhd.h cleanup patch In include/linux/genhd.h: Line 335 has a comment that needs to be updated from: /* drivers/block/ll_rw_blk.c */ to /* block/blk-core.c */. Also as of kernel 2.6.16, the function definition for get_blkdev_list was removed from block/genhd.c but the function declaration is still present on line 339. This patch addresses both those fixes, by updating the comment and removing the declaration. 
Signed-off-by: Petros Koutoupis Signed-off-by: Jens Axboe --- include/linux/genhd.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 56946b21ab78..634c53028fb8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -333,11 +333,10 @@ static inline void part_dec_in_flight(struct hd_struct *part) part_to_disk(part)->part0.in_flight--; } -/* drivers/block/ll_rw_blk.c */ +/* block/blk-core.c */ extern void part_round_stats(int cpu, struct hd_struct *part); /* block/genhd.c */ -extern int get_blkdev_list(char *, int); extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern void unlink_gendisk(struct gendisk *gp); -- cgit v1.2.3-71-gd317 From 05378940caf979a8655c18b18a17213dcfa52412 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 24 Mar 2009 12:23:40 +0100 Subject: bsg: add support for tail queuing Currently inherited from sg.c bsg will submit asynchronous request at the head-of-the-queue, (using "at_head" set in the call to blk_execute_rq_nowait()). This is bad in situation where the queues are full, requests will execute out of order, and can cause starvation of the first submitted requests. The sg_io_v4->flags member is used and a bit is allocated to denote the Q_AT_TAIL. Zero is to queue at_head as before, to be compatible with old code at the write/read path. SG_IO code path behavior was changed so to be the same as write/read behavior. SG_IO was very rarely used and breaking compatibility with it is OK at this stage. sg_io_hdr at sg.h also has a flags member and uses 3 bits from the first nibble and one bit from the last nibble. Even though none of these bits are supported by bsg, The second nibble is allocated for use by bsg. Just in case. 
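
Illustrative sketch only, not part of the patch: a user-space caller that wants the new tail queuing on a bsg node would set the flag in its sg_io_v4 header before issuing SG_IO. BSG_FLAG_Q_AT_TAIL, the sg_io_v4 fields and the SG_IO path are the interface touched by this patch; the TEST UNIT READY CDB, the sense-buffer size, the timeout and the function name below are arbitrary placeholders.

#include <string.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>		/* SG_IO */
#include <linux/types.h>
#include <linux/bsg.h>		/* struct sg_io_v4, BSG_FLAG_Q_AT_TAIL */

/* Issue a TEST UNIT READY through an already-open bsg fd, queued at the tail. */
static int send_tur_at_tail(int bsg_fd)
{
	unsigned char cdb[6] = { 0 };		/* TEST UNIT READY is all zeros */
	unsigned char sense[32];
	struct sg_io_v4 hdr;

	memset(&hdr, 0, sizeof(hdr));
	hdr.guard = 'Q';			/* marks a v4 header */
	hdr.protocol = BSG_PROTOCOL_SCSI;
	hdr.subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD;
	hdr.request = (__u64)(unsigned long)cdb;
	hdr.request_len = sizeof(cdb);
	hdr.response = (__u64)(unsigned long)sense;
	hdr.max_response_len = sizeof(sense);
	hdr.timeout = 30000;			/* milliseconds */
	hdr.flags = BSG_FLAG_Q_AT_TAIL;		/* leaving this 0 keeps the old at_head queuing */

	return ioctl(bsg_fd, SG_IO, &hdr);	/* 0 on success, -1 + errno on failure */
}
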
Signed-off-by: Boaz Harrosh CC: Douglas Gilbert Signed-off-by: Jens Axboe --- block/bsg.c | 9 +++++++-- include/linux/bsg.h | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/bsg.c b/block/bsg.c index 0ce8806dd0c1..0f63b91d0af6 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -353,6 +353,8 @@ static void bsg_rq_end_io(struct request *rq, int uptodate) static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, struct bsg_command *bc, struct request *rq) { + int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL)); + /* * add bc command to busy queue and submit rq for io */ @@ -368,7 +370,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); rq->end_io_data = bc; - blk_execute_rq_nowait(q, NULL, rq, 1, bsg_rq_end_io); + blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); } static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd) @@ -924,6 +926,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct request *rq; struct bio *bio, *bidi_bio = NULL; struct sg_io_v4 hdr; + int at_head; u8 sense[SCSI_SENSE_BUFFERSIZE]; if (copy_from_user(&hdr, uarg, sizeof(hdr))) @@ -936,7 +939,9 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) bio = rq->bio; if (rq->next_rq) bidi_bio = rq->next_rq->bio; - blk_execute_rq(bd->queue, NULL, rq, 0); + + at_head = (0 == (hdr.flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(bd->queue, NULL, rq, at_head); ret = blk_complete_sgv4_hdr_rq(rq, &hdr, bio, bidi_bio); if (copy_to_user(uarg, &hdr, sizeof(hdr))) diff --git a/include/linux/bsg.h b/include/linux/bsg.h index cf0303a60611..3f0c64ace424 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -7,6 +7,14 @@ #define BSG_SUB_PROTOCOL_SCSI_TMF 1 #define BSG_SUB_PROTOCOL_SCSI_TRANSPORT 2 +/* + * For flags member below + * sg.h sg_io_hdr also has bits defined for it's flags member. However + * none of these bits are implemented/used by bsg. The bits below are + * allocated to not conflict with sg.h ones anyway. + */ +#define BSG_FLAG_Q_AT_TAIL 0x10 /* default, == 0 at this bit, is Q_AT_HEAD */ + struct sg_io_v4 { __s32 guard; /* [i] 'Q' to differentiate from v3 */ __u32 protocol; /* [i] 0 -> SCSI , .... */ -- cgit v1.2.3-71-gd317 From 5d1a03dc541dc6672e60e57249ed22f40654ca47 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:38:49 -0400 Subject: function-graph: moved the timestamp from arch to generic code This patch move the timestamp from happening in the arch specific code into the general code. This allows for better control by the tracer to time manipulation. 
Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 6 +----- include/linux/ftrace.h | 3 +-- kernel/trace/trace_functions_graph.c | 8 +++++--- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 57b33edb7ce3..61df77532120 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -410,7 +410,6 @@ int ftrace_disable_ftrace_graph_caller(void) void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) { unsigned long old; - unsigned long long calltime; int faulted; struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) @@ -453,10 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = trace_clock_local(); - - if (ftrace_push_return_trace(old, calltime, - self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { *parent = old; return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db3fed630db3..1141248c84ee 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -369,8 +369,7 @@ struct ftrace_ret_stack { extern void return_to_handler(void); extern int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth); +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); extern void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e876816fa8e7..d28687e7b3a7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -57,9 +57,9 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) { + unsigned long long calltime; int index; if (!current->ret_stack) @@ -71,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time, return -EBUSY; } + calltime = trace_clock_local(); + index = ++current->curr_ret_stack; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; - current->ret_stack[index].calltime = time; + current->ret_stack[index].calltime = calltime; *depth = index; return 0; -- cgit v1.2.3-71-gd317 From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 01:10:15 -0400 Subject: function-graph: ignore times across schedule Impact: more accurate timings The current method of function graph tracing does not take into account the time spent when a task is not running. This shows functions that call schedule have increased costs: 3) + 18.664 us | } ------------------------------------------ 3) -0 => kblockd-123 ------------------------------------------ 3) | finish_task_switch() { 3) 1.441 us | _spin_unlock_irq(); 3) 3.966 us | } 3) ! 2959.433 us | } 3) ! 2961.465 us | } This patch uses the tracepoint in the scheduling context switch to account for time that has elapsed while a task is scheduled out. 
Now we see: ------------------------------------------ 3) -0 => edac-po-1067 ------------------------------------------ 3) | finish_task_switch() { 3) 0.685 us | _spin_unlock_irq(); 3) 2.331 us | } 3) + 41.439 us | } 3) + 42.663 us | } Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 ++ kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 89cd308cc7a5..471e36d30123 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,8 @@ struct task_struct { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; + /* time stamp for last schedule */ + unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c81a759fbf76..0b90364d1a2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,6 +29,8 @@ #include #include +#include + #include #include "trace.h" @@ -2590,6 +2592,31 @@ free: return ret; } +static void +ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + /* Allocate a return stack for each task */ static int start_graph_tracing(void) { @@ -2611,6 +2638,13 @@ static int start_graph_tracing(void) ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + kfree(ret_stack_list); return ret; } @@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void) mutex_lock(&ftrace_lock); atomic_dec(&ftrace_graph_active); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); @@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t) t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; } else t->ret_stack = NULL; } -- cgit v1.2.3-71-gd317 From ee000b7f9fe429d2470c674ccec8d344f6789e0d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2009 13:38:06 +0800 Subject: tracing: use union for multi-usages field Impact: cleanup struct dyn_ftrace::ip has different usages in his lifecycle, we use union for it. And also for struct dyn_ftrace::flags. 
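
Illustrative sketch of the pattern, not taken from the patch: the same storage holds the live record's payload while the record is in use and a list link once it has been handed to a free/new list, so no extra field and no cross-type cast is needed. The struct and helper names below are made up; only the idea mirrors dyn_ftrace.

struct rec {
	union {
		unsigned long ip;	/* live record: call-site address */
		struct rec *freelist;	/* recycled record: next free entry */
	};
};

static struct rec *free_head;

static void rec_free(struct rec *r)
{
	r->freelist = free_head;	/* reuses the storage of ->ip, which is dead by now */
	free_head = r;
}

static struct rec *rec_alloc(void)
{
	struct rec *r = free_head;

	if (r)
		free_head = r->freelist;
	return r;			/* caller must re-initialise ->ip */
}

The gain over the previous code is that the list threading no longer hides behind casts of ->ip and ->flags, while the size of the structure stays unchanged.
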
Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49C871BE.3080405@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 12 +++++++++--- kernel/trace/ftrace.c | 8 ++++---- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1141248c84ee..015a3d22cf74 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,9 +145,15 @@ enum { }; struct dyn_ftrace { - unsigned long ip; /* address of mcount call-site */ - unsigned long flags; - struct dyn_arch_ftrace arch; + union { + unsigned long ip; /* address of mcount call-site */ + struct dyn_ftrace *freelist; + }; + union { + unsigned long flags; + struct dyn_ftrace *newlist; + }; + struct dyn_arch_ftrace arch; }; int ftrace_force_update(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bb377112b1bb..7b8722baf153 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -341,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec) static void ftrace_free_rec(struct dyn_ftrace *rec) { - rec->ip = (unsigned long)ftrace_free_records; + rec->freelist = ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } @@ -379,7 +379,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) return NULL; } - ftrace_free_records = (void *)rec->ip; + ftrace_free_records = rec->freelist; memset(rec, 0, sizeof(*rec)); return rec; } @@ -411,7 +411,7 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->flags = (unsigned long)ftrace_new_addrs; + rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; return rec; @@ -731,7 +731,7 @@ static int ftrace_update_code(struct module *mod) return -1; p = ftrace_new_addrs; - ftrace_new_addrs = (struct dyn_ftrace *)p->flags; + ftrace_new_addrs = p->newlist; p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ -- cgit v1.2.3-71-gd317 From bf8e3355ec8f4e472f9841e94203cd759b45226e Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Fri, 5 Dec 2008 22:43:41 +0100 Subject: firewire: cdev: documentation fixlet Reported-by: Jay Fenlason Signed-off-by: Stefan Richter --- include/linux/firewire-cdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 4d078e99c017..899ef279f5be 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -229,7 +229,7 @@ struct fw_cdev_get_info { * Send a request to the device. This ioctl implements all outgoing requests. * Both quadlet and block request specify the payload as a pointer to the data * in the @data field. Once the transaction completes, the kernel writes an - * &fw_cdev_event_request event back. The @closure field is passed back to + * &fw_cdev_event_response event back. The @closure field is passed back to * user space in the response event. */ struct fw_cdev_send_request { -- cgit v1.2.3-71-gd317 From 632321ecd99bf85c982a75f8329b4ecbb95b3a8f Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Fri, 2 Jan 2009 12:47:13 +0100 Subject: firewire: cdev: fix documentation of FW_CDEV_IOC_GET_INFO The FW_CDEV_IOC_GET_INFO ioctl looks at client->device->config_rom, not at the local node's config ROM. We could fix the implementation or the documentation. I believe the way how it is currently implemented is more useful than the way how it is currently documented. 
In fact, libdc1394 uses the ABI already as implemented, not as documented. Hence let's change the documentation. Signed-off-by: Stefan Richter --- include/linux/firewire-cdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 899ef279f5be..86c8ff5326f9 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -201,7 +201,7 @@ union fw_cdev_event { * case, @rom_length is updated with the actual length of the * configuration ROM. * @rom: If non-zero, address of a buffer to be filled by a copy of the - * local node's configuration ROM + * device's configuration ROM * @bus_reset: If non-zero, address of a buffer to be filled by a * &struct fw_cdev_event_bus_reset with the current state * of the bus. This does not cause a bus reset to happen. -- cgit v1.2.3-71-gd317 From b1bda4cdc2037447bd66753bf5ccab66d91b0b59 Mon Sep 17 00:00:00 2001 From: "Jay Fenlason, Stefan Richter" Date: Sun, 4 Jan 2009 16:23:29 +0100 Subject: firewire: cdev: add ioctls for isochronous resource management Based on Date: Tue, 18 Nov 2008 11:41:27 -0500 From: Jay Fenlason Subject: [Patch V4] Add ISO resource management support with several changes to the ABI and implementation. Only the part of the ABI which enables auto-reallocation and auto-deallocation is included here. This implements ioctls for kernel-assisted allocation of isochronous channels and isochronous bandwidth. The benefits are: - The client does not have to have write access to the /dev/fw* device corresponding to the IRM. - The client does not have to perform reallocation after bus resets. - Channel and bandwidth are deallocated by the kernel if the file is closed before the client deallocated the resources. Thus resources are released even if the client crashes. It is anticipated that future in-kernel code (firewire-core IRM code; the firewire port of firedtv), will use the fw-iso.c portions of this code too. Signed-off-by: Stefan Richter Tested-by: David Moore --- drivers/firewire/fw-cdev.c | 215 +++++++++++++++++++++++++++++++++++++- drivers/firewire/fw-iso.c | 176 +++++++++++++++++++++++++++++-- drivers/firewire/fw-transaction.h | 4 + include/linux/firewire-cdev.h | 100 +++++++++++++++--- 4 files changed, 475 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 4c33b51b735a..a227853aa1e2 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include @@ -114,6 +116,21 @@ struct descriptor_resource { u32 data[0]; }; +struct iso_resource { + struct client_resource resource; + struct client *client; + /* Schedule work and access todo only with client->lock held. */ + struct delayed_work work; + enum {ISO_RES_ALLOC, ISO_RES_REALLOC, ISO_RES_DEALLOC,} todo; + int generation; + u64 channels; + s32 bandwidth; + struct iso_resource_event *e_alloc, *e_dealloc; +}; + +static void schedule_iso_resource(struct iso_resource *); +static void release_iso_resource(struct client *, struct client_resource *); + /* * dequeue_event() just kfree()'s the event, so the event has to be * the first field in a struct XYZ_event. 
@@ -145,6 +162,11 @@ struct iso_interrupt_event { struct fw_cdev_event_iso_interrupt interrupt; }; +struct iso_resource_event { + struct event event; + struct fw_cdev_event_iso_resource resource; +}; + static inline void __user *u64_to_uptr(__u64 value) { return (void __user *)(unsigned long)value; @@ -290,6 +312,16 @@ static void for_each_client(struct fw_device *device, mutex_unlock(&device->client_list_mutex); } +static int schedule_reallocations(int id, void *p, void *data) +{ + struct client_resource *r = p; + + if (r->release == release_iso_resource) + schedule_iso_resource(container_of(r, + struct iso_resource, resource)); + return 0; +} + static void queue_bus_reset_event(struct client *client) { struct bus_reset_event *e; @@ -304,6 +336,10 @@ static void queue_bus_reset_event(struct client *client) queue_event(client, &e->event, &e->reset, sizeof(e->reset), NULL, 0); + + spin_lock_irq(&client->lock); + idr_for_each(&client->resource_idr, schedule_reallocations, client); + spin_unlock_irq(&client->lock); } void fw_device_cdev_update(struct fw_device *device) @@ -376,8 +412,12 @@ static int add_client_resource(struct client *client, else ret = idr_get_new(&client->resource_idr, resource, &resource->handle); - if (ret >= 0) + if (ret >= 0) { client_get(client); + if (resource->release == release_iso_resource) + schedule_iso_resource(container_of(resource, + struct iso_resource, resource)); + } spin_unlock_irqrestore(&client->lock, flags); if (ret == -EAGAIN) @@ -970,6 +1010,177 @@ static int ioctl_get_cycle_timer(struct client *client, void *buffer) return 0; } +static void iso_resource_work(struct work_struct *work) +{ + struct iso_resource_event *e; + struct iso_resource *r = + container_of(work, struct iso_resource, work.work); + struct client *client = r->client; + int generation, channel, bandwidth, todo; + bool skip, free, success; + + spin_lock_irq(&client->lock); + generation = client->device->generation; + todo = r->todo; + /* Allow 1000ms grace period for other reallocations. */ + if (todo == ISO_RES_ALLOC && + time_is_after_jiffies(client->device->card->reset_jiffies + HZ)) { + if (schedule_delayed_work(&r->work, DIV_ROUND_UP(HZ, 3))) + client_get(client); + skip = true; + } else { + /* We could be called twice within the same generation. */ + skip = todo == ISO_RES_REALLOC && + r->generation == generation; + } + free = todo == ISO_RES_DEALLOC; + r->generation = generation; + spin_unlock_irq(&client->lock); + + if (skip) + goto out; + + bandwidth = r->bandwidth; + + fw_iso_resource_manage(client->device->card, generation, + r->channels, &channel, &bandwidth, + todo == ISO_RES_ALLOC || todo == ISO_RES_REALLOC); + /* + * Is this generation outdated already? As long as this resource sticks + * in the idr, it will be scheduled again for a newer generation or at + * shutdown. + */ + if (channel == -EAGAIN && + (todo == ISO_RES_ALLOC || todo == ISO_RES_REALLOC)) + goto out; + + success = channel >= 0 || bandwidth > 0; + + spin_lock_irq(&client->lock); + /* + * Transit from allocation to reallocation, except if the client + * requested deallocation in the meantime. + */ + if (r->todo == ISO_RES_ALLOC) + r->todo = ISO_RES_REALLOC; + /* + * Allocation or reallocation failure? Pull this resource out of the + * idr and prepare for deletion, unless the client is shutting down. 
+ */ + if (r->todo == ISO_RES_REALLOC && !success && + !client->in_shutdown && + idr_find(&client->resource_idr, r->resource.handle)) { + idr_remove(&client->resource_idr, r->resource.handle); + client_put(client); + free = true; + } + spin_unlock_irq(&client->lock); + + if (todo == ISO_RES_ALLOC && channel >= 0) + r->channels = 1ULL << (63 - channel); + + if (todo == ISO_RES_REALLOC && success) + goto out; + + if (todo == ISO_RES_ALLOC) { + e = r->e_alloc; + r->e_alloc = NULL; + } else { + e = r->e_dealloc; + r->e_dealloc = NULL; + } + e->resource.handle = r->resource.handle; + e->resource.channel = channel; + e->resource.bandwidth = bandwidth; + + queue_event(client, &e->event, + &e->resource, sizeof(e->resource), NULL, 0); + + if (free) { + cancel_delayed_work(&r->work); + kfree(r->e_alloc); + kfree(r->e_dealloc); + kfree(r); + } + out: + client_put(client); +} + +static void schedule_iso_resource(struct iso_resource *r) +{ + if (schedule_delayed_work(&r->work, 0)) + client_get(r->client); +} + +static void release_iso_resource(struct client *client, + struct client_resource *resource) +{ + struct iso_resource *r = + container_of(resource, struct iso_resource, resource); + + spin_lock_irq(&client->lock); + r->todo = ISO_RES_DEALLOC; + schedule_iso_resource(r); + spin_unlock_irq(&client->lock); +} + +static int ioctl_allocate_iso_resource(struct client *client, void *buffer) +{ + struct fw_cdev_allocate_iso_resource *request = buffer; + struct iso_resource_event *e1, *e2; + struct iso_resource *r; + int ret; + + if ((request->channels == 0 && request->bandwidth == 0) || + request->bandwidth > BANDWIDTH_AVAILABLE_INITIAL || + request->bandwidth < 0) + return -EINVAL; + + r = kmalloc(sizeof(*r), GFP_KERNEL); + e1 = kmalloc(sizeof(*e1), GFP_KERNEL); + e2 = kmalloc(sizeof(*e2), GFP_KERNEL); + if (r == NULL || e1 == NULL || e2 == NULL) { + ret = -ENOMEM; + goto fail; + } + + INIT_DELAYED_WORK(&r->work, iso_resource_work); + r->client = client; + r->todo = ISO_RES_ALLOC; + r->generation = -1; + r->channels = request->channels; + r->bandwidth = request->bandwidth; + r->e_alloc = e1; + r->e_dealloc = e2; + + e1->resource.closure = request->closure; + e1->resource.type = FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED; + e2->resource.closure = request->closure; + e2->resource.type = FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED; + + r->resource.release = release_iso_resource; + ret = add_client_resource(client, &r->resource, GFP_KERNEL); + if (ret < 0) + goto fail; + request->handle = r->resource.handle; + + return 0; + fail: + kfree(r); + kfree(e1); + kfree(e2); + + return ret; +} + +static int ioctl_deallocate_iso_resource(struct client *client, void *buffer) +{ + struct fw_cdev_deallocate *request = buffer; + + return release_client_resource(client, request->handle, + release_iso_resource, NULL); +} + static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { ioctl_get_info, ioctl_send_request, @@ -984,6 +1195,8 @@ static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { ioctl_start_iso, ioctl_stop_iso, ioctl_get_cycle_timer, + ioctl_allocate_iso_resource, + ioctl_deallocate_iso_resource, }; static int dispatch_ioctl(struct client *client, diff --git a/drivers/firewire/fw-iso.c b/drivers/firewire/fw-iso.c index 39f3bacee404..a7b57b253b06 100644 --- a/drivers/firewire/fw-iso.c +++ b/drivers/firewire/fw-iso.c @@ -1,5 +1,7 @@ /* - * Isochronous IO functionality + * Isochronous I/O functionality: + * - Isochronous DMA context management + * - Isochronous bus resource 
management (channels, bandwidth), client side * * Copyright (C) 2006 Kristian Hoegsberg * @@ -18,15 +20,20 @@ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include -#include #include -#include +#include +#include +#include #include +#include +#include -#include "fw-transaction.h" #include "fw-topology.h" -#include "fw-device.h" +#include "fw-transaction.h" + +/* + * Isochronous DMA context management + */ int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card, int page_count, enum dma_data_direction direction) @@ -153,3 +160,160 @@ int fw_iso_context_stop(struct fw_iso_context *ctx) { return ctx->card->driver->stop_iso(ctx); } + +/* + * Isochronous bus resource management (channels, bandwidth), client side + */ + +static int manage_bandwidth(struct fw_card *card, int irm_id, int generation, + int bandwidth, bool allocate) +{ + __be32 data[2]; + int try, new, old = allocate ? BANDWIDTH_AVAILABLE_INITIAL : 0; + + /* + * On a 1394a IRM with low contention, try < 1 is enough. + * On a 1394-1995 IRM, we need at least try < 2. + * Let's just do try < 5. + */ + for (try = 0; try < 5; try++) { + new = allocate ? old - bandwidth : old + bandwidth; + if (new < 0 || new > BANDWIDTH_AVAILABLE_INITIAL) + break; + + data[0] = cpu_to_be32(old); + data[1] = cpu_to_be32(new); + switch (fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, + irm_id, generation, SCODE_100, + CSR_REGISTER_BASE + CSR_BANDWIDTH_AVAILABLE, + data, sizeof(data))) { + case RCODE_GENERATION: + /* A generation change frees all bandwidth. */ + return allocate ? -EAGAIN : bandwidth; + + case RCODE_COMPLETE: + if (be32_to_cpup(data) == old) + return bandwidth; + + old = be32_to_cpup(data); + /* Fall through. */ + } + } + + return -EIO; +} + +static int manage_channel(struct fw_card *card, int irm_id, int generation, + __be32 channels_mask, u64 offset, bool allocate) +{ + __be32 data[2], c, old = allocate ? cpu_to_be32(~0) : 0; + int i, retry = 5; + + for (i = 0; i < 32; i++) { + c = cpu_to_be32(1 << (31 - i)); + if (!(channels_mask & c)) + continue; + + if (allocate == !(old & c)) + continue; + + data[0] = old; + data[1] = old ^ c; + switch (fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, + irm_id, generation, SCODE_100, + offset, data, sizeof(data))) { + case RCODE_GENERATION: + /* A generation change frees all channels. */ + return allocate ? -EAGAIN : i; + + case RCODE_COMPLETE: + if (data[0] == old) + return i; + + old = data[0]; + + /* Is the IRM 1394a-2000 compliant? */ + if ((data[0] & c) != (data[1] & c)) + continue; + + /* 1394-1995 IRM, fall through to retry. */ + default: + if (retry--) + i--; + } + } + + return -EIO; +} + +static void deallocate_channel(struct fw_card *card, int irm_id, + int generation, int channel) +{ + __be32 mask; + u64 offset; + + mask = channel < 32 ? cpu_to_be32(1 << (31 - channel)) : + cpu_to_be32(1 << (63 - channel)); + offset = channel < 32 ? CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_HI : + CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_LO; + + manage_channel(card, irm_id, generation, mask, offset, false); +} + +/** + * fw_iso_resource_manage - Allocate or deallocate a channel and/or bandwidth + * + * In parameters: card, generation, channels_mask, bandwidth, allocate + * Out parameters: channel, bandwidth + * This function blocks (sleeps) during communication with the IRM. + * Allocates or deallocates at most one channel out of channels_mask. + * + * Returns channel < 0 if no channel was allocated or deallocated. 
+ * Returns bandwidth = 0 if no bandwidth was allocated or deallocated. + * + * If generation is stale, deallocations succeed but allocations fail with + * channel = -EAGAIN. + * + * If channel (de)allocation fails, bandwidth (de)allocation fails too. + * If bandwidth allocation fails, no channel will be allocated either. + * If bandwidth deallocation fails, channel deallocation may still have been + * successful. + */ +void fw_iso_resource_manage(struct fw_card *card, int generation, + u64 channels_mask, int *channel, int *bandwidth, + bool allocate) +{ + __be32 channels_hi = cpu_to_be32(channels_mask >> 32); + __be32 channels_lo = cpu_to_be32(channels_mask); + int irm_id, ret, c = -EINVAL; + + spin_lock_irq(&card->lock); + irm_id = card->irm_node->node_id; + spin_unlock_irq(&card->lock); + + if (channels_hi) + c = manage_channel(card, irm_id, generation, channels_hi, + CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_HI, allocate); + if (channels_lo && c < 0) { + c = manage_channel(card, irm_id, generation, channels_lo, + CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_LO, allocate); + if (c >= 0) + c += 32; + } + *channel = c; + + if (channels_mask != 0 && c < 0) + *bandwidth = 0; + + if (*bandwidth == 0) + return; + + ret = manage_bandwidth(card, irm_id, generation, *bandwidth, allocate); + if (ret < 0) + *bandwidth = 0; + + if (ret < 0 && c >= 0 && allocate) { + deallocate_channel(card, irm_id, generation, c); + *channel = ret; + } +} diff --git a/drivers/firewire/fw-transaction.h b/drivers/firewire/fw-transaction.h index 48e88d53998b..212a10293828 100644 --- a/drivers/firewire/fw-transaction.h +++ b/drivers/firewire/fw-transaction.h @@ -82,6 +82,7 @@ #define CSR_SPEED_MAP 0x2000 #define CSR_SPEED_MAP_END 0x3000 +#define BANDWIDTH_AVAILABLE_INITIAL 4915 #define BROADCAST_CHANNEL_INITIAL (1 << 31 | 31) #define BROADCAST_CHANNEL_VALID (1 << 30) @@ -343,6 +344,9 @@ int fw_iso_context_start(struct fw_iso_context *ctx, int fw_iso_context_stop(struct fw_iso_context *ctx); void fw_iso_context_destroy(struct fw_iso_context *ctx); +void fw_iso_resource_manage(struct fw_card *card, int generation, + u64 channels_mask, int *channel, int *bandwidth, bool allocate); + struct fw_card_driver { /* * Enable the given card with the given initial config rom. 
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 86c8ff5326f9..25b96dd0574f 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -25,10 +25,12 @@ #include #include -#define FW_CDEV_EVENT_BUS_RESET 0x00 -#define FW_CDEV_EVENT_RESPONSE 0x01 -#define FW_CDEV_EVENT_REQUEST 0x02 -#define FW_CDEV_EVENT_ISO_INTERRUPT 0x03 +#define FW_CDEV_EVENT_BUS_RESET 0x00 +#define FW_CDEV_EVENT_RESPONSE 0x01 +#define FW_CDEV_EVENT_REQUEST 0x02 +#define FW_CDEV_EVENT_ISO_INTERRUPT 0x03 +#define FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED 0x04 +#define FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED 0x05 /** * struct fw_cdev_event_common - Common part of all fw_cdev_event_ types @@ -146,6 +148,37 @@ struct fw_cdev_event_iso_interrupt { __u32 header[0]; }; +/** + * struct fw_cdev_event_iso_resource - Iso resources were allocated or freed + * @closure: See &fw_cdev_event_common; + * set by %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE ioctl + * @type: %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or + * %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED + * @handle: Reference by which an allocated resource can be deallocated + * @channel: Isochronous channel which was (de)allocated, if any + * @bandwidth: Bandwidth allocation units which were (de)allocated, if any + * @channels_available: Last known availability of channels + * @bandwidth_available: Last known availability of bandwidth + * + * An %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED event is sent after an isochronous + * resource was allocated at the IRM. The client has to check @channel and + * @bandwidth for whether the allocation actually succeeded. + * + * @channel is <0 if no channel was allocated. + * @bandwidth is 0 if no bandwidth was allocated. + * + * An %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED event is sent after an isochronous + * resource was deallocated at the IRM. It is also sent when automatic + * reallocation after a bus reset failed. + */ +struct fw_cdev_event_iso_resource { + __u64 closure; + __u32 type; + __u32 handle; + __s32 channel; + __s32 bandwidth; +}; + /** * union fw_cdev_event - Convenience union of fw_cdev_event_ types * @common: Valid for all types @@ -153,6 +186,9 @@ struct fw_cdev_event_iso_interrupt { * @response: Valid if @common.type == %FW_CDEV_EVENT_RESPONSE * @request: Valid if @common.type == %FW_CDEV_EVENT_REQUEST * @iso_interrupt: Valid if @common.type == %FW_CDEV_EVENT_ISO_INTERRUPT + * @iso_resource: Valid if @common.type == + * %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or + * %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED * * Convenience union for userspace use. Events could be read(2) into an * appropriately aligned char buffer and then cast to this union for further @@ -163,13 +199,15 @@ struct fw_cdev_event_iso_interrupt { * not fit will be discarded so that the next read(2) will return a new event. 
*/ union fw_cdev_event { - struct fw_cdev_event_common common; - struct fw_cdev_event_bus_reset bus_reset; - struct fw_cdev_event_response response; - struct fw_cdev_event_request request; - struct fw_cdev_event_iso_interrupt iso_interrupt; + struct fw_cdev_event_common common; + struct fw_cdev_event_bus_reset bus_reset; + struct fw_cdev_event_response response; + struct fw_cdev_event_request request; + struct fw_cdev_event_iso_interrupt iso_interrupt; + struct fw_cdev_event_iso_resource iso_resource; }; +/* available since kernel version 2.6.22 */ #define FW_CDEV_IOC_GET_INFO _IOWR('#', 0x00, struct fw_cdev_get_info) #define FW_CDEV_IOC_SEND_REQUEST _IOW('#', 0x01, struct fw_cdev_send_request) #define FW_CDEV_IOC_ALLOCATE _IOWR('#', 0x02, struct fw_cdev_allocate) @@ -178,13 +216,18 @@ union fw_cdev_event { #define FW_CDEV_IOC_INITIATE_BUS_RESET _IOW('#', 0x05, struct fw_cdev_initiate_bus_reset) #define FW_CDEV_IOC_ADD_DESCRIPTOR _IOWR('#', 0x06, struct fw_cdev_add_descriptor) #define FW_CDEV_IOC_REMOVE_DESCRIPTOR _IOW('#', 0x07, struct fw_cdev_remove_descriptor) - #define FW_CDEV_IOC_CREATE_ISO_CONTEXT _IOWR('#', 0x08, struct fw_cdev_create_iso_context) #define FW_CDEV_IOC_QUEUE_ISO _IOWR('#', 0x09, struct fw_cdev_queue_iso) #define FW_CDEV_IOC_START_ISO _IOW('#', 0x0a, struct fw_cdev_start_iso) #define FW_CDEV_IOC_STOP_ISO _IOW('#', 0x0b, struct fw_cdev_stop_iso) + +/* available since kernel version 2.6.24 */ #define FW_CDEV_IOC_GET_CYCLE_TIMER _IOR('#', 0x0c, struct fw_cdev_get_cycle_timer) +/* available since kernel version 2.6.30 */ +#define FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE _IOWR('#', 0x0d, struct fw_cdev_allocate_iso_resource) +#define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE _IOW('#', 0x0e, struct fw_cdev_deallocate) + /* FW_CDEV_VERSION History * * 1 Feb 18, 2007: Initial version. @@ -284,9 +327,9 @@ struct fw_cdev_allocate { }; /** - * struct fw_cdev_deallocate - Free an address range allocation - * @handle: Handle to the address range, as returned by the kernel when the - * range was allocated + * struct fw_cdev_deallocate - Free a CSR address range or isochronous resource + * @handle: Handle to the address range or iso resource, as returned by the + * kernel when the range or resource was allocated */ struct fw_cdev_deallocate { __u32 handle; @@ -479,4 +522,35 @@ struct fw_cdev_get_cycle_timer { __u32 cycle_timer; }; +/** + * struct fw_cdev_allocate_iso_resource - Allocate a channel or bandwidth + * @closure: Passed back to userspace in correponding iso resource events + * @channels: Isochronous channels of which one is to be allocated + * @bandwidth: Isochronous bandwidth units to be allocated + * @handle: Handle to the allocation, written by the kernel + * + * The %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE ioctl initiates allocation of an + * isochronous channel and/or of isochronous bandwidth at the isochronous + * resource manager (IRM). Only one of the channels specified in @channels is + * allocated. An %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED is sent after + * communication with the IRM, indicating success or failure in the event data. + * The kernel will automatically reallocate the resources after bus resets. + * Should a reallocation fail, an %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED event + * will be sent. The kernel will also automatically deallocate the resources + * when the file descriptor is closed. 
+ * + * @channels is a host-endian bitfield with the most significant bit + * representing channel 0 and the least significant bit representing channel 63: + * 1ULL << (63 - c) + * + * @bandwidth is expressed in bandwidth allocation units, i.e. the time to send + * one quadlet of data (payload or header data) at speed S1600. + */ +struct fw_cdev_allocate_iso_resource { + __u64 closure; + __u64 channels; + __u32 bandwidth; + __u32 handle; +}; + #endif /* _LINUX_FIREWIRE_CDEV_H */ -- cgit v1.2.3-71-gd317 From 1ec3c0269d7196118cc7c403654ca5f19ef4d584 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 4 Jan 2009 16:23:29 +0100 Subject: firewire: cdev: add ioctls for manual iso resource management This adds ioctls for allocation and deallocation of a channel or/and bandwidth without auto-reallocation and without auto-deallocation. The benefit of these ioctls is that libraw1394-style isochronous resource management can be implemented without write access to the IRM's character device file. Signed-off-by: Stefan Richter --- drivers/firewire/fw-cdev.c | 67 ++++++++++++++++++++++++++++++++++--------- include/linux/firewire-cdev.h | 42 ++++++++++++++++++++------- 2 files changed, 86 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index a227853aa1e2..08fe68d34f32 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -121,14 +121,15 @@ struct iso_resource { struct client *client; /* Schedule work and access todo only with client->lock held. */ struct delayed_work work; - enum {ISO_RES_ALLOC, ISO_RES_REALLOC, ISO_RES_DEALLOC,} todo; + enum {ISO_RES_ALLOC, ISO_RES_REALLOC, ISO_RES_DEALLOC, + ISO_RES_ALLOC_ONCE, ISO_RES_DEALLOC_ONCE,} todo; int generation; u64 channels; s32 bandwidth; struct iso_resource_event *e_alloc, *e_dealloc; }; -static void schedule_iso_resource(struct iso_resource *); +static int schedule_iso_resource(struct iso_resource *); static void release_iso_resource(struct client *, struct client_resource *); /* @@ -1033,7 +1034,9 @@ static void iso_resource_work(struct work_struct *work) skip = todo == ISO_RES_REALLOC && r->generation == generation; } - free = todo == ISO_RES_DEALLOC; + free = todo == ISO_RES_DEALLOC || + todo == ISO_RES_ALLOC_ONCE || + todo == ISO_RES_DEALLOC_ONCE; r->generation = generation; spin_unlock_irq(&client->lock); @@ -1044,7 +1047,9 @@ static void iso_resource_work(struct work_struct *work) fw_iso_resource_manage(client->device->card, generation, r->channels, &channel, &bandwidth, - todo == ISO_RES_ALLOC || todo == ISO_RES_REALLOC); + todo == ISO_RES_ALLOC || + todo == ISO_RES_REALLOC || + todo == ISO_RES_ALLOC_ONCE); /* * Is this generation outdated already? 
As long as this resource sticks * in the idr, it will be scheduled again for a newer generation or at @@ -1082,7 +1087,7 @@ static void iso_resource_work(struct work_struct *work) if (todo == ISO_RES_REALLOC && success) goto out; - if (todo == ISO_RES_ALLOC) { + if (todo == ISO_RES_ALLOC || todo == ISO_RES_ALLOC_ONCE) { e = r->e_alloc; r->e_alloc = NULL; } else { @@ -1106,10 +1111,17 @@ static void iso_resource_work(struct work_struct *work) client_put(client); } -static void schedule_iso_resource(struct iso_resource *r) +static int schedule_iso_resource(struct iso_resource *r) { - if (schedule_delayed_work(&r->work, 0)) - client_get(r->client); + int scheduled; + + client_get(r->client); + + scheduled = schedule_delayed_work(&r->work, 0); + if (!scheduled) + client_put(r->client); + + return scheduled; } static void release_iso_resource(struct client *client, @@ -1124,9 +1136,9 @@ static void release_iso_resource(struct client *client, spin_unlock_irq(&client->lock); } -static int ioctl_allocate_iso_resource(struct client *client, void *buffer) +static int init_iso_resource(struct client *client, + struct fw_cdev_allocate_iso_resource *request, int todo) { - struct fw_cdev_allocate_iso_resource *request = buffer; struct iso_resource_event *e1, *e2; struct iso_resource *r; int ret; @@ -1146,7 +1158,7 @@ static int ioctl_allocate_iso_resource(struct client *client, void *buffer) INIT_DELAYED_WORK(&r->work, iso_resource_work); r->client = client; - r->todo = ISO_RES_ALLOC; + r->todo = todo; r->generation = -1; r->channels = request->channels; r->bandwidth = request->bandwidth; @@ -1158,8 +1170,14 @@ static int ioctl_allocate_iso_resource(struct client *client, void *buffer) e2->resource.closure = request->closure; e2->resource.type = FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED; - r->resource.release = release_iso_resource; - ret = add_client_resource(client, &r->resource, GFP_KERNEL); + if (todo == ISO_RES_ALLOC) { + r->resource.release = release_iso_resource; + ret = add_client_resource(client, &r->resource, GFP_KERNEL); + } else { + r->resource.release = NULL; + r->resource.handle = -1; + ret = schedule_iso_resource(r) ? 
0 : -ENOMEM; + } if (ret < 0) goto fail; request->handle = r->resource.handle; @@ -1173,6 +1191,13 @@ static int ioctl_allocate_iso_resource(struct client *client, void *buffer) return ret; } +static int ioctl_allocate_iso_resource(struct client *client, void *buffer) +{ + struct fw_cdev_allocate_iso_resource *request = buffer; + + return init_iso_resource(client, request, ISO_RES_ALLOC); +} + static int ioctl_deallocate_iso_resource(struct client *client, void *buffer) { struct fw_cdev_deallocate *request = buffer; @@ -1181,6 +1206,20 @@ static int ioctl_deallocate_iso_resource(struct client *client, void *buffer) release_iso_resource, NULL); } +static int ioctl_allocate_iso_resource_once(struct client *client, void *buffer) +{ + struct fw_cdev_allocate_iso_resource *request = buffer; + + return init_iso_resource(client, request, ISO_RES_ALLOC_ONCE); +} + +static int ioctl_deallocate_iso_resource_once(struct client *client, void *buffer) +{ + struct fw_cdev_allocate_iso_resource *request = buffer; + + return init_iso_resource(client, request, ISO_RES_DEALLOC_ONCE); +} + static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { ioctl_get_info, ioctl_send_request, @@ -1197,6 +1236,8 @@ static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { ioctl_get_cycle_timer, ioctl_allocate_iso_resource, ioctl_deallocate_iso_resource, + ioctl_allocate_iso_resource_once, + ioctl_deallocate_iso_resource_once, }; static int dispatch_ioctl(struct client *client, diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 25b96dd0574f..08ca838a727b 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -151,7 +151,7 @@ struct fw_cdev_event_iso_interrupt { /** * struct fw_cdev_event_iso_resource - Iso resources were allocated or freed * @closure: See &fw_cdev_event_common; - * set by %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE ioctl + * set by %FW_CDEV_IOC_(DE)ALLOCATE_ISO_RESOURCE(_ONCE) ioctl * @type: %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or * %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED * @handle: Reference by which an allocated resource can be deallocated @@ -164,12 +164,12 @@ struct fw_cdev_event_iso_interrupt { * resource was allocated at the IRM. The client has to check @channel and * @bandwidth for whether the allocation actually succeeded. * - * @channel is <0 if no channel was allocated. - * @bandwidth is 0 if no bandwidth was allocated. - * * An %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED event is sent after an isochronous * resource was deallocated at the IRM. It is also sent when automatic * reallocation after a bus reset failed. + * + * @channel is <0 if no channel was (de)allocated or if reallocation failed. + * @bandwidth is 0 if no bandwidth was (de)allocated or if reallocation failed. 
*/ struct fw_cdev_event_iso_resource { __u64 closure; @@ -225,8 +225,10 @@ union fw_cdev_event { #define FW_CDEV_IOC_GET_CYCLE_TIMER _IOR('#', 0x0c, struct fw_cdev_get_cycle_timer) /* available since kernel version 2.6.30 */ -#define FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE _IOWR('#', 0x0d, struct fw_cdev_allocate_iso_resource) -#define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE _IOW('#', 0x0e, struct fw_cdev_deallocate) +#define FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE _IOWR('#', 0x0d, struct fw_cdev_allocate_iso_resource) +#define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE _IOW('#', 0x0e, struct fw_cdev_deallocate) +#define FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE_ONCE _IOW('#', 0x0f, struct fw_cdev_allocate_iso_resource) +#define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE_ONCE _IOW('#', 0x10, struct fw_cdev_allocate_iso_resource) /* FW_CDEV_VERSION History * @@ -523,11 +525,12 @@ struct fw_cdev_get_cycle_timer { }; /** - * struct fw_cdev_allocate_iso_resource - Allocate a channel or bandwidth + * struct fw_cdev_allocate_iso_resource - (De)allocate a channel or bandwidth * @closure: Passed back to userspace in correponding iso resource events - * @channels: Isochronous channels of which one is to be allocated - * @bandwidth: Isochronous bandwidth units to be allocated - * @handle: Handle to the allocation, written by the kernel + * @channels: Isochronous channels of which one is to be (de)allocated + * @bandwidth: Isochronous bandwidth units to be (de)allocated + * @handle: Handle to the allocation, written by the kernel (only valid in + * case of %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE ioctls) * * The %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE ioctl initiates allocation of an * isochronous channel and/or of isochronous bandwidth at the isochronous @@ -539,6 +542,25 @@ struct fw_cdev_get_cycle_timer { * will be sent. The kernel will also automatically deallocate the resources * when the file descriptor is closed. * + * The %FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE ioctl can be used to initiate + * deallocation of resources which were allocated as described above. + * An %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED event concludes this operation. + * + * The %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE_ONCE ioctl is a variant of allocation + * without automatic re- or deallocation. + * An %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED event concludes this operation, + * indicating success or failure in its data. + * + * The %FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE_ONCE ioctl works like + * %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE_ONCE except that resources are freed + * instead of allocated. At most one channel may be specified in this ioctl. + * An %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED event concludes this operation. + * + * To summarize, %FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE allocates iso resources + * for the lifetime of the fd or handle. + * In contrast, %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE_ONCE allocates iso resources + * for the duration of a bus generation. + * * @channels is a host-endian bitfield with the most significant bit * representing channel 0 and the least significant bit representing channel 63: * 1ULL << (63 - c) -- cgit v1.2.3-71-gd317 From 33580a3ef5ba3bc0ee1b520df82a24bb37ce28f0 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 4 Jan 2009 16:23:29 +0100 Subject: firewire: cdev: add ioctl to query maximum transmission speed While the speed of asynchronous transactions is automatically chosen by the kernel, the speed of isochronous streams has to be chosen by the initiating client. 
In case of 1394a bus topologies, the maximum possible speed could be figured out with some effort by evaluation of the remote node's link speed field in the config ROM, the local node's link speed field, and the PHY speeds and topologic information in the local node's or IRM's topology map CSR. However, this does not work in case of 1394b buses. Hence add an ioctl to export the maximum speed which the kernel already determined. Signed-off-by: Stefan Richter --- drivers/firewire/fw-cdev.c | 10 ++++++++++ include/linux/firewire-cdev.h | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 08fe68d34f32..05ad2a8f286c 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -1220,6 +1220,15 @@ static int ioctl_deallocate_iso_resource_once(struct client *client, void *buffe return init_iso_resource(client, request, ISO_RES_DEALLOC_ONCE); } +static int ioctl_get_speed(struct client *client, void *buffer) +{ + struct fw_cdev_get_speed *request = buffer; + + request->max_speed = client->device->max_speed; + + return 0; +} + static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { ioctl_get_info, ioctl_send_request, @@ -1238,6 +1247,7 @@ static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { ioctl_deallocate_iso_resource, ioctl_allocate_iso_resource_once, ioctl_deallocate_iso_resource_once, + ioctl_get_speed, }; static int dispatch_ioctl(struct client *client, diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 08ca838a727b..f819c1026958 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -229,6 +229,7 @@ union fw_cdev_event { #define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE _IOW('#', 0x0e, struct fw_cdev_deallocate) #define FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE_ONCE _IOW('#', 0x0f, struct fw_cdev_allocate_iso_resource) #define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE_ONCE _IOW('#', 0x10, struct fw_cdev_allocate_iso_resource) +#define FW_CDEV_IOC_GET_SPEED _IOR('#', 0x11, struct fw_cdev_get_speed) /* FW_CDEV_VERSION History * @@ -575,4 +576,13 @@ struct fw_cdev_allocate_iso_resource { __u32 handle; }; +/** + * struct fw_cdev_get_speed - Query maximum speed to or from this device + * @max_speed: Speed code; minimum of the device's link speed, the local node's + * link speed, and all PHY port speeds between the two links + */ +struct fw_cdev_get_speed { + __u32 max_speed; +}; + #endif /* _LINUX_FIREWIRE_CDEV_H */ -- cgit v1.2.3-71-gd317 From acfe8333572cad5dc70fce18ac966be0446548d7 Mon Sep 17 00:00:00 2001 From: "Jay Fenlason, Stefan Richter" Date: Sun, 4 Jan 2009 16:23:29 +0100 Subject: firewire: cdev: add ioctl for broadcast write requests Write transactions to the broadcast node ID are a convenient way to trigger functions of multiple nodes at once. IIDC is a protocol which can make use of this if multiple cameras with same command_regs_base are connected at the same bus. Based on Date: Wed, 10 Sep 2008 11:32:16 -0400 From: Jay Fenlason Subject: [patch] SEND_BROADCAST_REQUEST Changes: ioctl_send_request() and ioctl_send_broadcast_request() now share code. Broadcast speed corrected to S100. Check for proper tcode. 
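To illustrate how a client would drive the new ioctl, here is a minimal userspace sketch of a broadcast block write. It is not part of the patch: the tcode constant, register offset and payload are placeholders, and it assumes the usual tcode/length/offset/closure/data/generation layout of struct fw_cdev_send_request plus a <linux/firewire-cdev.h> that already contains FW_CDEV_IOC_SEND_BROADCAST_REQUEST.

	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/firewire-cdev.h>

	/* Placeholder tcode value: IEEE 1394 write block request; verify against
	 * the kernel's TCODE_* definitions. */
	#define EX_TCODE_WRITE_BLOCK_REQUEST	0x1

	static int send_broadcast_write(int fd, uint32_t generation,
					uint64_t offset, const void *payload,
					uint32_t len)
	{
		struct fw_cdev_send_request req;

		memset(&req, 0, sizeof(req));
		req.tcode      = EX_TCODE_WRITE_BLOCK_REQUEST;
		req.length     = len;
		req.offset     = offset;	/* CSR address written on all nodes */
		req.closure    = 0;		/* handed back in the response event */
		req.data       = (uint64_t)(uintptr_t)payload;
		req.generation = generation;	/* bus generation this request is valid in */

		/* The kernel substitutes the broadcast node ID and S100 itself. */
		return ioctl(fd, FW_CDEV_IOC_SEND_BROADCAST_REQUEST, &req);
	}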
Signed-off-by: Stefan Richter --- drivers/firewire/fw-cdev.c | 74 +++++++++++++++++++++++++++---------------- include/linux/firewire-cdev.h | 1 + 2 files changed, 48 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 05ad2a8f286c..a1637a86da3d 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -518,10 +518,10 @@ static void complete_transaction(struct fw_card *card, int rcode, client_put(client); } -static int ioctl_send_request(struct client *client, void *buffer) +static int init_request(struct client *client, + struct fw_cdev_send_request *request, + int destination_id, int speed) { - struct fw_device *device = client->device; - struct fw_cdev_send_request *request = buffer; struct outbound_transaction_event *e; int ret; @@ -544,24 +544,6 @@ static int ioctl_send_request(struct client *client, void *buffer) goto failed; } - switch (request->tcode) { - case TCODE_WRITE_QUADLET_REQUEST: - case TCODE_WRITE_BLOCK_REQUEST: - case TCODE_READ_QUADLET_REQUEST: - case TCODE_READ_BLOCK_REQUEST: - case TCODE_LOCK_MASK_SWAP: - case TCODE_LOCK_COMPARE_SWAP: - case TCODE_LOCK_FETCH_ADD: - case TCODE_LOCK_LITTLE_ADD: - case TCODE_LOCK_BOUNDED_ADD: - case TCODE_LOCK_WRAP_ADD: - case TCODE_LOCK_VENDOR_DEPENDENT: - break; - default: - ret = -EINVAL; - goto failed; - } - e->r.resource.release = release_transaction; ret = add_client_resource(client, &e->r.resource, GFP_KERNEL); if (ret < 0) @@ -570,12 +552,9 @@ static int ioctl_send_request(struct client *client, void *buffer) /* Get a reference for the transaction callback */ client_get(client); - fw_send_request(device->card, &e->r.transaction, - request->tcode & 0x1f, - device->node->node_id, - request->generation, - device->max_speed, - request->offset, + fw_send_request(client->device->card, &e->r.transaction, + request->tcode & 0x1f, destination_id, + request->generation, speed, request->offset, e->response.data, request->length, complete_transaction, e); @@ -589,6 +568,31 @@ static int ioctl_send_request(struct client *client, void *buffer) return ret; } +static int ioctl_send_request(struct client *client, void *buffer) +{ + struct fw_cdev_send_request *request = buffer; + + switch (request->tcode) { + case TCODE_WRITE_QUADLET_REQUEST: + case TCODE_WRITE_BLOCK_REQUEST: + case TCODE_READ_QUADLET_REQUEST: + case TCODE_READ_BLOCK_REQUEST: + case TCODE_LOCK_MASK_SWAP: + case TCODE_LOCK_COMPARE_SWAP: + case TCODE_LOCK_FETCH_ADD: + case TCODE_LOCK_LITTLE_ADD: + case TCODE_LOCK_BOUNDED_ADD: + case TCODE_LOCK_WRAP_ADD: + case TCODE_LOCK_VENDOR_DEPENDENT: + break; + default: + return -EINVAL; + } + + return init_request(client, request, client->device->node->node_id, + client->device->max_speed); +} + static void release_request(struct client *client, struct client_resource *resource) { @@ -1229,6 +1233,21 @@ static int ioctl_get_speed(struct client *client, void *buffer) return 0; } +static int ioctl_send_broadcast_request(struct client *client, void *buffer) +{ + struct fw_cdev_send_request *request = buffer; + + switch (request->tcode) { + case TCODE_WRITE_QUADLET_REQUEST: + case TCODE_WRITE_BLOCK_REQUEST: + break; + default: + return -EINVAL; + } + + return init_request(client, request, LOCAL_BUS | 0x3f, SCODE_100); +} + static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { ioctl_get_info, ioctl_send_request, @@ -1248,6 +1267,7 @@ static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { 
ioctl_allocate_iso_resource_once, ioctl_deallocate_iso_resource_once, ioctl_get_speed, + ioctl_send_broadcast_request, }; static int dispatch_ioctl(struct client *client, diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index f819c1026958..340a78502bca 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -230,6 +230,7 @@ union fw_cdev_event { #define FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE_ONCE _IOW('#', 0x0f, struct fw_cdev_allocate_iso_resource) #define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE_ONCE _IOW('#', 0x10, struct fw_cdev_allocate_iso_resource) #define FW_CDEV_IOC_GET_SPEED _IOR('#', 0x11, struct fw_cdev_get_speed) +#define FW_CDEV_IOC_SEND_BROADCAST_REQUEST _IOW('#', 0x12, struct fw_cdev_send_request) /* FW_CDEV_VERSION History * -- cgit v1.2.3-71-gd317 From 77258da403be4cfce84b6abcdb515ad0bd1f92f1 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Wed, 7 Jan 2009 20:14:53 +0100 Subject: firewire: cdev: increment fw_cdev_version, update documentation Necessary due to Date: Tue, 22 Jul 2008 23:23:40 -0700 From: David Moore Subject: firewire: Include iso timestamp in headers when header_size > 4 Side note: The lack of upwards compatibility sounds worse than it is. All existing client implementations, libraw1394 and libdc1394, set header_size = 4. And since the ABI v1 behaviour does not offer any advantages over the new behaviour, we deliberately do not provide the old behaviour anymore. Also add documentation about the format of fw_cdev_get_cycle_timer which may be used in conjunction with the timestamp of iso packets but has a different format. Signed-off-by: Stefan Richter --- include/linux/firewire-cdev.h | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 340a78502bca..6ed9127680fd 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -138,7 +138,24 @@ struct fw_cdev_event_request { * This event is sent when the controller has completed an &fw_cdev_iso_packet * with the %FW_CDEV_ISO_INTERRUPT bit set. In the receive case, the headers * stripped of all packets up until and including the interrupt packet are - * returned in the @header field. + * returned in the @header field. The amount of header data per packet is as + * specified at iso context creation by &fw_cdev_create_iso_context.header_size. + * + * In version 1 of this ABI, header data consisted of the 1394 isochronous + * packet header, followed by quadlets from the packet payload if + * &fw_cdev_create_iso_context.header_size > 4. + * + * In version 2 of this ABI, header data consist of the 1394 isochronous + * packet header, followed by a timestamp quadlet if + * &fw_cdev_create_iso_context.header_size > 4, followed by quadlets from the + * packet payload if &fw_cdev_create_iso_context.header_size > 8. + * + * Behaviour of ver. 1 of this ABI is no longer available since ABI ver. 2. + * + * Format of 1394 iso packet header: 16 bits len, 2 bits tag, 6 bits channel, + * 4 bits tcode, 4 bits sy, in big endian byte order. Format of timestamp: + * 16 bits invalid, 3 bits cycleSeconds, 13 bits cycleCount, in big endian byte + * order. 
*/ struct fw_cdev_event_iso_interrupt { __u64 closure; @@ -232,11 +249,13 @@ union fw_cdev_event { #define FW_CDEV_IOC_GET_SPEED _IOR('#', 0x11, struct fw_cdev_get_speed) #define FW_CDEV_IOC_SEND_BROADCAST_REQUEST _IOW('#', 0x12, struct fw_cdev_send_request) -/* FW_CDEV_VERSION History - * - * 1 Feb 18, 2007: Initial version. +/* + * FW_CDEV_VERSION History + * 1 (2.6.22) - initial version + * 2 (2.6.30) - changed &fw_cdev_event_iso_interrupt.header if + * &fw_cdev_create_iso_context.header_size is 8 or more */ -#define FW_CDEV_VERSION 1 +#define FW_CDEV_VERSION 2 /** * struct fw_cdev_get_info - General purpose information ioctl @@ -417,6 +436,9 @@ struct fw_cdev_remove_descriptor { * * If a context was successfully created, the kernel writes back a handle to the * context, which must be passed in for subsequent operations on that context. + * + * Note that the effect of a @header_size > 4 depends on + * &fw_cdev_get_info.version, as documented at &fw_cdev_event_iso_interrupt. */ struct fw_cdev_create_iso_context { __u32 type; @@ -520,6 +542,9 @@ struct fw_cdev_stop_iso { * The %FW_CDEV_IOC_GET_CYCLE_TIMER ioctl reads the isochronous cycle timer * and also the system clock. This allows to express the receive time of an * isochronous packet as a system time with microsecond accuracy. + * + * @cycle_timer consists of 7 bits cycleSeconds, 13 bits cycleCount, and + * 12 bits cycleOffset, in host byte order. */ struct fw_cdev_get_cycle_timer { __u64 local_time; -- cgit v1.2.3-71-gd317 From 5d9cb7d276a9c465fef5a771792eac2cf1929f2b Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Thu, 8 Jan 2009 23:07:40 +0100 Subject: firewire: cdev: add ioctls for iso resource management, amendment Some fixes: - Remove stale documentation. - Fix a != vs. == thinko that got in the way of channel management. - Try bandwidth deallocation even if channel deallocation failed. A simplification: - fw_cdev_allocate_iso_resource.channels is now ordered like libdc1394's dc1394_iso_allocate_channel() channels_allowed argument. By the way, I looked closer at cards from NEC, TI, and VIA, and noticed that they all don't implement IEEE 1394a behaviour which is meant to deviate from IEEE 1212's notion of lock compare-swap. This means that we have to do two lock transactions instead of one in many cases where one transaction would already succeed on a fully 1394a compliant IRM. 
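A short sketch of how a client fills struct fw_cdev_allocate_iso_resource once this ordering change is in place, with the least significant bit standing for channel 0. The candidate channels and the bandwidth figure are made-up example values; the outcome still arrives asynchronously as an FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED event.

	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/firewire-cdev.h>

	/* Ask for one of channels 0, 1 or 63 plus some bandwidth (example values). */
	static int allocate_iso_resource(int fd, uint32_t *handle)
	{
		struct fw_cdev_allocate_iso_resource req;

		memset(&req, 0, sizeof(req));
		req.closure   = 0;
		/* Host-endian bitfield, 1ULL << c per candidate channel c. */
		req.channels  = (1ULL << 0) | (1ULL << 1) | (1ULL << 63);
		req.bandwidth = 512;	/* bandwidth allocation units, example value */

		if (ioctl(fd, FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE, &req) < 0)
			return -1;

		*handle = req.handle;	/* for FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE later */
		return 0;		/* success/failure reported via the allocated event */
	}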
Signed-off-by: Stefan Richter --- drivers/firewire/fw-cdev.c | 2 +- drivers/firewire/fw-iso.c | 38 ++++++++++++++++++++++---------------- include/linux/firewire-cdev.h | 10 ++++------ 3 files changed, 27 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index b93ad9c0a0d0..257b0c709a8b 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -1082,7 +1082,7 @@ static void iso_resource_work(struct work_struct *work) spin_unlock_irq(&client->lock); if (todo == ISO_RES_ALLOC && channel >= 0) - r->channels = 1ULL << (63 - channel); + r->channels = 1ULL << channel; if (todo == ISO_RES_REALLOC && success) goto out; diff --git a/drivers/firewire/fw-iso.c b/drivers/firewire/fw-iso.c index a7b57b253b06..f511d16efaee 100644 --- a/drivers/firewire/fw-iso.c +++ b/drivers/firewire/fw-iso.c @@ -204,17 +204,19 @@ static int manage_bandwidth(struct fw_card *card, int irm_id, int generation, } static int manage_channel(struct fw_card *card, int irm_id, int generation, - __be32 channels_mask, u64 offset, bool allocate) + u32 channels_mask, u64 offset, bool allocate) { - __be32 data[2], c, old = allocate ? cpu_to_be32(~0) : 0; + __be32 data[2], c, all, old; int i, retry = 5; + old = all = allocate ? cpu_to_be32(~0) : 0; + for (i = 0; i < 32; i++) { - c = cpu_to_be32(1 << (31 - i)); - if (!(channels_mask & c)) + if (!(channels_mask & 1 << i)) continue; - if (allocate == !(old & c)) + c = cpu_to_be32(1 << (31 - i)); + if ((old & c) != (all & c)) continue; data[0] = old; @@ -233,7 +235,7 @@ static int manage_channel(struct fw_card *card, int irm_id, int generation, old = data[0]; /* Is the IRM 1394a-2000 compliant? */ - if ((data[0] & c) != (data[1] & c)) + if ((data[0] & c) == (data[1] & c)) continue; /* 1394-1995 IRM, fall through to retry. */ @@ -249,11 +251,10 @@ static int manage_channel(struct fw_card *card, int irm_id, int generation, static void deallocate_channel(struct fw_card *card, int irm_id, int generation, int channel) { - __be32 mask; + u32 mask; u64 offset; - mask = channel < 32 ? cpu_to_be32(1 << (31 - channel)) : - cpu_to_be32(1 << (63 - channel)); + mask = channel < 32 ? 1 << channel : 1 << (channel - 32); offset = channel < 32 ? CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_HI : CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_LO; @@ -266,7 +267,12 @@ static void deallocate_channel(struct fw_card *card, int irm_id, * In parameters: card, generation, channels_mask, bandwidth, allocate * Out parameters: channel, bandwidth * This function blocks (sleeps) during communication with the IRM. + * * Allocates or deallocates at most one channel out of channels_mask. + * channels_mask is a bitfield with MSB for channel 63 and LSB for channel 0. + * (Note, the IRM's CHANNELS_AVAILABLE is a big-endian bitfield with MSB for + * channel 0 and LSB for channel 63.) + * Allocates or deallocates as many bandwidth allocation units as specified. * * Returns channel < 0 if no channel was allocated or deallocated. * Returns bandwidth = 0 if no bandwidth was allocated or deallocated. @@ -274,17 +280,17 @@ static void deallocate_channel(struct fw_card *card, int irm_id, * If generation is stale, deallocations succeed but allocations fail with * channel = -EAGAIN. * - * If channel (de)allocation fails, bandwidth (de)allocation fails too. + * If channel allocation fails, no bandwidth will be allocated either. * If bandwidth allocation fails, no channel will be allocated either. 
- * If bandwidth deallocation fails, channel deallocation may still have been - * successful. + * But deallocations of channel and bandwidth are tried independently + * of each other's success. */ void fw_iso_resource_manage(struct fw_card *card, int generation, u64 channels_mask, int *channel, int *bandwidth, bool allocate) { - __be32 channels_hi = cpu_to_be32(channels_mask >> 32); - __be32 channels_lo = cpu_to_be32(channels_mask); + u32 channels_hi = channels_mask; /* channels 31...0 */ + u32 channels_lo = channels_mask >> 32; /* channels 63...32 */ int irm_id, ret, c = -EINVAL; spin_lock_irq(&card->lock); @@ -302,7 +308,7 @@ void fw_iso_resource_manage(struct fw_card *card, int generation, } *channel = c; - if (channels_mask != 0 && c < 0) + if (allocate && channels_mask != 0 && c < 0) *bandwidth = 0; if (*bandwidth == 0) @@ -312,7 +318,7 @@ void fw_iso_resource_manage(struct fw_card *card, int generation, if (ret < 0) *bandwidth = 0; - if (ret < 0 && c >= 0 && allocate) { + if (allocate && ret < 0 && c >= 0) { deallocate_channel(card, irm_id, generation, c); *channel = ret; } diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 6ed9127680fd..2e35379bf96c 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -174,8 +174,6 @@ struct fw_cdev_event_iso_interrupt { * @handle: Reference by which an allocated resource can be deallocated * @channel: Isochronous channel which was (de)allocated, if any * @bandwidth: Bandwidth allocation units which were (de)allocated, if any - * @channels_available: Last known availability of channels - * @bandwidth_available: Last known availability of bandwidth * * An %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED event is sent after an isochronous * resource was allocated at the IRM. The client has to check @channel and @@ -580,7 +578,7 @@ struct fw_cdev_get_cycle_timer { * * The %FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE_ONCE ioctl works like * %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE_ONCE except that resources are freed - * instead of allocated. At most one channel may be specified in this ioctl. + * instead of allocated. * An %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED event concludes this operation. * * To summarize, %FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE allocates iso resources @@ -588,9 +586,9 @@ struct fw_cdev_get_cycle_timer { * In contrast, %FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE_ONCE allocates iso resources * for the duration of a bus generation. * - * @channels is a host-endian bitfield with the most significant bit - * representing channel 0 and the least significant bit representing channel 63: - * 1ULL << (63 - c) + * @channels is a host-endian bitfield with the least significant bit + * representing channel 0 and the most significant bit representing channel 63: + * 1ULL << c for each channel c that is a candidate for (de)allocation. * * @bandwidth is expressed in bandwidth allocation units, i.e. the time to send * one quadlet of data (payload or header data) at speed S1600. -- cgit v1.2.3-71-gd317 From f8c2287c65f8f72000102fc058232669e4540bc4 Mon Sep 17 00:00:00 2001 From: Jay Fenlason Date: Thu, 5 Mar 2009 19:08:40 +0100 Subject: firewire: implement asynchronous stream transmission Allow userspace and other firewire drivers (fw-ipv4 I'm looking at you!) to send Asynchronous Transmit Streams as described in 7.8.3 of release 1.1 of the 1394 Open Host Controller Interface Specification. 
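As a rough restatement of what the new fw_send_stream_packet() packs into its single header quadlet, the snippet below spells out the bit layout. The data-length shift and the TCODE_STREAM_DATA value are assumptions (based on the usual fw-transaction.c macros and the 1394 tcode table), not something this hunk shows.

	#include <stdint.h>
	#include <stddef.h>

	/* Assumed: HEADER_DATA_LENGTH's shift and the stream tcode value. */
	#define EX_TCODE_STREAM_DATA		0xa
	#define EX_HEADER_DATA_LENGTH(l)	((uint32_t)(l) << 16)
	#define EX_HEADER_TAG(t)		((uint32_t)(t) << 14)	/* as in this patch */
	#define EX_HEADER_CHANNEL(c)		((uint32_t)(c) << 8)	/* as in this patch */
	#define EX_HEADER_TCODE(tc)		((uint32_t)(tc) << 4)	/* as in this patch */
	#define EX_HEADER_SY(sy)		((uint32_t)(sy) << 0)	/* as in this patch */

	/* Builds the header quadlet the same way fw_send_stream_packet() does. */
	static uint32_t stream_packet_header(size_t length, int tag, int channel, int sy)
	{
		return EX_HEADER_DATA_LENGTH(length) |
		       EX_HEADER_TAG(tag) |
		       EX_HEADER_CHANNEL(channel) |
		       EX_HEADER_TCODE(EX_TCODE_STREAM_DATA) |
		       EX_HEADER_SY(sy);
	}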
Signed-off-by: Jay Fenlason Signed-off-by: Stefan Richter (tweaks) --- drivers/firewire/fw-cdev.c | 33 +++++++++++++++++++++++++++++++++ drivers/firewire/fw-ohci.c | 21 +++++++++++++++++++-- drivers/firewire/fw-transaction.c | 25 +++++++++++++++++++++++++ drivers/firewire/fw-transaction.h | 4 ++++ include/linux/firewire-cdev.h | 27 +++++++++++++++++++++++++++ 5 files changed, 108 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 214e534efee5..539dae5eb5b2 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -1242,6 +1242,38 @@ static int ioctl_send_broadcast_request(struct client *client, void *buffer) return init_request(client, request, LOCAL_BUS | 0x3f, SCODE_100); } +struct stream_packet { + struct fw_packet packet; + u8 data[0]; +}; + +static void send_stream_packet_done(struct fw_packet *packet, + struct fw_card *card, int status) +{ + kfree(container_of(packet, struct stream_packet, packet)); +} + +static int ioctl_send_stream_packet(struct client *client, void *buffer) +{ + struct fw_cdev_send_stream_packet *request = buffer; + struct stream_packet *p; + + p = kmalloc(sizeof(*p) + request->size, GFP_KERNEL); + if (p == NULL) + return -ENOMEM; + + if (request->data && + copy_from_user(p->data, u64_to_uptr(request->data), request->size)) { + kfree(p); + return -EFAULT; + } + fw_send_stream_packet(client->device->card, &p->packet, + request->generation, request->speed, + request->channel, request->sy, request->tag, + p->data, request->size, send_stream_packet_done); + return 0; +} + static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { ioctl_get_info, ioctl_send_request, @@ -1262,6 +1294,7 @@ static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { ioctl_deallocate_iso_resource_once, ioctl_get_speed, ioctl_send_broadcast_request, + ioctl_send_stream_packet, }; static int dispatch_ioctl(struct client *client, diff --git a/drivers/firewire/fw-ohci.c b/drivers/firewire/fw-ohci.c index c92278374658..1180d0be0bb4 100644 --- a/drivers/firewire/fw-ohci.c +++ b/drivers/firewire/fw-ohci.c @@ -936,7 +936,9 @@ static int at_context_queue_packet(struct context *ctx, */ header = (__le32 *) &d[1]; - if (packet->header_length > 8) { + switch (packet->header_length) { + case 16: + case 12: header[0] = cpu_to_le32((packet->header[0] & 0xffff) | (packet->speed << 16)); header[1] = cpu_to_le32((packet->header[1] & 0xffff) | @@ -950,12 +952,27 @@ static int at_context_queue_packet(struct context *ctx, header[3] = (__force __le32) packet->header[3]; d[0].req_count = cpu_to_le16(packet->header_length); - } else { + break; + + case 8: header[0] = cpu_to_le32((OHCI1394_phy_tcode << 4) | (packet->speed << 16)); header[1] = cpu_to_le32(packet->header[0]); header[2] = cpu_to_le32(packet->header[1]); d[0].req_count = cpu_to_le16(12); + break; + + case 4: + header[0] = cpu_to_le32((packet->header[0] & 0xffff) | + (packet->speed << 16)); + header[1] = cpu_to_le32(packet->header[0] & 0xffff0000); + d[0].req_count = cpu_to_le16(8); + break; + + default: + /* BUG(); */ + packet->ack = RCODE_SEND_ERROR; + return -1; } driver_data = (struct driver_data *) &d[3]; diff --git a/drivers/firewire/fw-transaction.c b/drivers/firewire/fw-transaction.c index 76938fe432a0..e3da58991960 100644 --- a/drivers/firewire/fw-transaction.c +++ b/drivers/firewire/fw-transaction.c @@ -37,6 +37,10 @@ #include "fw-topology.h" #include "fw-device.h" +#define HEADER_TAG(tag) ((tag) << 14) 
+#define HEADER_CHANNEL(ch) ((ch) << 8) +#define HEADER_SY(sy) ((sy) << 0) + #define HEADER_PRI(pri) ((pri) << 0) #define HEADER_TCODE(tcode) ((tcode) << 4) #define HEADER_RETRY(retry) ((retry) << 8) @@ -293,6 +297,27 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, } EXPORT_SYMBOL(fw_send_request); +void fw_send_stream_packet(struct fw_card *card, struct fw_packet *p, + int generation, int speed, int channel, int sy, int tag, + void *payload, size_t length, fw_packet_callback_t callback) +{ + p->callback = callback; + p->header[0] = + HEADER_DATA_LENGTH(length) + | HEADER_TAG(tag) + | HEADER_CHANNEL(channel) + | HEADER_TCODE(TCODE_STREAM_DATA) + | HEADER_SY(sy); + p->header_length = 4; + p->payload = payload; + p->payload_length = length; + p->speed = speed; + p->generation = generation; + p->ack = 0; + + card->driver->send_request(card, p); +} + struct transaction_callback_data { struct completion done; void *payload; diff --git a/drivers/firewire/fw-transaction.h b/drivers/firewire/fw-transaction.h index 35d0a4bb6d5c..eed2e295eb3c 100644 --- a/drivers/firewire/fw-transaction.h +++ b/drivers/firewire/fw-transaction.h @@ -407,6 +407,10 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, int destination_id, int generation, int speed, unsigned long long offset, void *payload, size_t length, fw_transaction_callback_t callback, void *callback_data); +void fw_send_stream_packet(struct fw_card *card, struct fw_packet *p, + int generation, int speed, int channel, int sy, int tag, + void *payload, size_t length, fw_packet_callback_t callback); + int fw_cancel_transaction(struct fw_card *card, struct fw_transaction *transaction); void fw_flush_transactions(struct fw_card *card); diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 2e35379bf96c..4dfc84d0ac76 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -246,6 +246,7 @@ union fw_cdev_event { #define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE_ONCE _IOW('#', 0x10, struct fw_cdev_allocate_iso_resource) #define FW_CDEV_IOC_GET_SPEED _IOR('#', 0x11, struct fw_cdev_get_speed) #define FW_CDEV_IOC_SEND_BROADCAST_REQUEST _IOW('#', 0x12, struct fw_cdev_send_request) +#define FW_CDEV_IOC_SEND_STREAM_PACKET _IOW('#', 0x13, struct fw_cdev_send_stream_packet) /* * FW_CDEV_VERSION History @@ -609,4 +610,30 @@ struct fw_cdev_get_speed { __u32 max_speed; }; +/** + * struct fw_cdev_send_stream_packet - send an asynchronous stream packet + * @generation: Bus generation where the packet is valid + * @speed: Speed code to send the packet at + * @channel: Channel to send the packet on + * @sy: Four-bit sy code for the packet + * @tag: Two-bit tag field to use for the packet + * @size: Size of the packet's data payload + * @data: Userspace pointer to the payload + * + * The %FW_CDEV_IOC_SEND_STREAM_PACKET ioctl sends an asynchronous stream packet + * to every device (that is listening to the specified channel) on the + * firewire bus. It is the applications's job to ensure + * that the intended device(s) will be able to receive the packet at the chosen + * transmit speed. 
+ */ +struct fw_cdev_send_stream_packet { + __u32 generation; + __u32 speed; + __u32 channel; + __u32 sy; + __u32 tag; + __u32 size; + __u64 data; +}; + #endif /* _LINUX_FIREWIRE_CDEV_H */ -- cgit v1.2.3-71-gd317 From c8a25900f35e575938c791507894c036c0f2ca7d Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Tue, 10 Mar 2009 20:59:16 +0100 Subject: firewire: cdev: amendment to "add ioctl to query maximum transmission speed" The as yet unreleased FW_CDEV_IOC_GET_SPEED ioctl puts only a single integer into the parameter buffer. We can use ioctl()'s return value instead. (Also: Some whitespace change in firewire-cdev.h.) Signed-off-by: Stefan Richter --- drivers/firewire/fw-cdev.c | 11 ++++++----- include/linux/firewire-cdev.h | 37 ++++++++++++++----------------------- 2 files changed, 20 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 539dae5eb5b2..2784f91896db 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -1214,13 +1214,14 @@ static int ioctl_deallocate_iso_resource_once(struct client *client, void *buffe return init_iso_resource(client, request, ISO_RES_DEALLOC_ONCE); } +/* + * Returns a speed code: Maximum speed to or from this device, + * limited by the device's link speed, the local node's link speed, + * and all PHY port speeds between the two links. + */ static int ioctl_get_speed(struct client *client, void *buffer) { - struct fw_cdev_get_speed *request = buffer; - - request->max_speed = client->device->max_speed; - - return 0; + return client->device->max_speed; } static int ioctl_send_broadcast_request(struct client *client, void *buffer) diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 4dfc84d0ac76..de4035792f70 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -223,28 +223,28 @@ union fw_cdev_event { }; /* available since kernel version 2.6.22 */ -#define FW_CDEV_IOC_GET_INFO _IOWR('#', 0x00, struct fw_cdev_get_info) -#define FW_CDEV_IOC_SEND_REQUEST _IOW('#', 0x01, struct fw_cdev_send_request) -#define FW_CDEV_IOC_ALLOCATE _IOWR('#', 0x02, struct fw_cdev_allocate) -#define FW_CDEV_IOC_DEALLOCATE _IOW('#', 0x03, struct fw_cdev_deallocate) -#define FW_CDEV_IOC_SEND_RESPONSE _IOW('#', 0x04, struct fw_cdev_send_response) -#define FW_CDEV_IOC_INITIATE_BUS_RESET _IOW('#', 0x05, struct fw_cdev_initiate_bus_reset) -#define FW_CDEV_IOC_ADD_DESCRIPTOR _IOWR('#', 0x06, struct fw_cdev_add_descriptor) -#define FW_CDEV_IOC_REMOVE_DESCRIPTOR _IOW('#', 0x07, struct fw_cdev_remove_descriptor) -#define FW_CDEV_IOC_CREATE_ISO_CONTEXT _IOWR('#', 0x08, struct fw_cdev_create_iso_context) -#define FW_CDEV_IOC_QUEUE_ISO _IOWR('#', 0x09, struct fw_cdev_queue_iso) -#define FW_CDEV_IOC_START_ISO _IOW('#', 0x0a, struct fw_cdev_start_iso) -#define FW_CDEV_IOC_STOP_ISO _IOW('#', 0x0b, struct fw_cdev_stop_iso) +#define FW_CDEV_IOC_GET_INFO _IOWR('#', 0x00, struct fw_cdev_get_info) +#define FW_CDEV_IOC_SEND_REQUEST _IOW('#', 0x01, struct fw_cdev_send_request) +#define FW_CDEV_IOC_ALLOCATE _IOWR('#', 0x02, struct fw_cdev_allocate) +#define FW_CDEV_IOC_DEALLOCATE _IOW('#', 0x03, struct fw_cdev_deallocate) +#define FW_CDEV_IOC_SEND_RESPONSE _IOW('#', 0x04, struct fw_cdev_send_response) +#define FW_CDEV_IOC_INITIATE_BUS_RESET _IOW('#', 0x05, struct fw_cdev_initiate_bus_reset) +#define FW_CDEV_IOC_ADD_DESCRIPTOR _IOWR('#', 0x06, struct fw_cdev_add_descriptor) +#define FW_CDEV_IOC_REMOVE_DESCRIPTOR _IOW('#', 0x07, struct 
fw_cdev_remove_descriptor) +#define FW_CDEV_IOC_CREATE_ISO_CONTEXT _IOWR('#', 0x08, struct fw_cdev_create_iso_context) +#define FW_CDEV_IOC_QUEUE_ISO _IOWR('#', 0x09, struct fw_cdev_queue_iso) +#define FW_CDEV_IOC_START_ISO _IOW('#', 0x0a, struct fw_cdev_start_iso) +#define FW_CDEV_IOC_STOP_ISO _IOW('#', 0x0b, struct fw_cdev_stop_iso) /* available since kernel version 2.6.24 */ -#define FW_CDEV_IOC_GET_CYCLE_TIMER _IOR('#', 0x0c, struct fw_cdev_get_cycle_timer) +#define FW_CDEV_IOC_GET_CYCLE_TIMER _IOR('#', 0x0c, struct fw_cdev_get_cycle_timer) /* available since kernel version 2.6.30 */ #define FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE _IOWR('#', 0x0d, struct fw_cdev_allocate_iso_resource) #define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE _IOW('#', 0x0e, struct fw_cdev_deallocate) #define FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE_ONCE _IOW('#', 0x0f, struct fw_cdev_allocate_iso_resource) #define FW_CDEV_IOC_DEALLOCATE_ISO_RESOURCE_ONCE _IOW('#', 0x10, struct fw_cdev_allocate_iso_resource) -#define FW_CDEV_IOC_GET_SPEED _IOR('#', 0x11, struct fw_cdev_get_speed) +#define FW_CDEV_IOC_GET_SPEED _IO('#', 0x11) /* returns speed code */ #define FW_CDEV_IOC_SEND_BROADCAST_REQUEST _IOW('#', 0x12, struct fw_cdev_send_request) #define FW_CDEV_IOC_SEND_STREAM_PACKET _IOW('#', 0x13, struct fw_cdev_send_stream_packet) @@ -601,15 +601,6 @@ struct fw_cdev_allocate_iso_resource { __u32 handle; }; -/** - * struct fw_cdev_get_speed - Query maximum speed to or from this device - * @max_speed: Speed code; minimum of the device's link speed, the local node's - * link speed, and all PHY port speeds between the two links - */ -struct fw_cdev_get_speed { - __u32 max_speed; -}; - /** * struct fw_cdev_send_stream_packet - send an asynchronous stream packet * @generation: Bus generation where the packet is valid -- cgit v1.2.3-71-gd317 From de487da8ca5839d057e1f4b57ee3f387e180b800 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Tue, 10 Mar 2009 21:00:23 +0100 Subject: firewire: cdev: secure add_descriptor ioctl The access permissions and ownership or ACL of /dev/fw* character device files will typically be set based on the device type of the respective nodes, as obtained by firewire-core from descriptors in the device's configuration ROM. An example policy is to deny write permission by default but grant write permission to files of AV/C video and audio devices and IIDC video devices. The FW_CDEV_IOC_ADD_DESCRIPTOR ioctl could be used to partly subvert such a policy: Find a device file with relaxed permissions, use the ioctl to add a descriptor with AV/C marker to the local node's ROM, thus gain access to the local node's character device file. (This is only possible if there are udev scripts installed which actively relax permissions for known device types and if there is a device of such a type connected.) Accessibility of the local node's device file is relevant to host security if the host contains two or more IEEE 1394 link layer controllers which are plugged into a single bus. Therefore change the ABI to deny FW_CDEV_IOC_ADD_DESCRIPTOR if the file belongs to a remote node. (This change has no impact on known implementers of the ABI: None of them uses the ioctl yet.) Also clarify the documentation: The ioctl affects all local nodes, not just one local node. 
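Since the ioctl now fails outright on remote nodes' device files, a client can also use it to probe whether its file descriptor belongs to a local node. A hedged userspace sketch, assuming the usual immediate/key/data/length/handle layout of struct fw_cdev_add_descriptor and the ENOSYS return seen in the hunk below; the descriptor contents are placeholders.

	#include <errno.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/firewire-cdev.h>

	/* Returns 1 if the descriptor was added (local node), 0 if the file refers
	 * to a remote node, -1 on any other error. */
	static int try_add_descriptor(int fd, const uint32_t *block, uint32_t len,
				      uint32_t *handle)
	{
		struct fw_cdev_add_descriptor req;

		memset(&req, 0, sizeof(req));
		req.immediate = 0;
		req.key       = 0;
		req.data      = (uint64_t)(uintptr_t)block;
		req.length    = len;	/* kernel rejects length > 256 */

		if (ioctl(fd, FW_CDEV_IOC_ADD_DESCRIPTOR, &req) < 0)
			return errno == ENOSYS ? 0 : -1;

		*handle = req.handle;
		return 1;
	}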
Cc: stable@kernel.org Signed-off-by: Stefan Richter --- drivers/firewire/fw-cdev.c | 8 ++++++++ include/linux/firewire-cdev.h | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 2784f91896db..160cb27e120c 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -742,9 +742,17 @@ static void release_descriptor(struct client *client, static int ioctl_add_descriptor(struct client *client, void *buffer) { struct fw_cdev_add_descriptor *request = buffer; + struct fw_card *card = client->device->card; struct descriptor_resource *r; int ret; + /* Access policy: Allow this ioctl only on local nodes' device files. */ + spin_lock_irq(&card->lock); + ret = client->device->node_id != card->local_node->node_id; + spin_unlock_irq(&card->lock); + if (ret) + return -ENOSYS; + if (request->length > 256) return -EINVAL; diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index de4035792f70..25bc82726ef7 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -394,6 +394,9 @@ struct fw_cdev_initiate_bus_reset { * If successful, the kernel adds the descriptor and writes back a handle to the * kernel-side object to be used for later removal of the descriptor block and * immediate key. + * + * This ioctl affects the configuration ROMs of all local nodes. + * The ioctl only succeeds on device files which represent a local node. */ struct fw_cdev_add_descriptor { __u32 immediate; @@ -409,7 +412,7 @@ struct fw_cdev_add_descriptor { * descriptor was added * * Remove a descriptor block and accompanying immediate key from the local - * node's configuration ROM. + * nodes' configuration ROMs. */ struct fw_cdev_remove_descriptor { __u32 handle; -- cgit v1.2.3-71-gd317 From 18e9b10fcdc090d3a38606958167d5923c7099b7 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Tue, 10 Mar 2009 21:02:21 +0100 Subject: firewire: cdev: add closure to async stream ioctl This changes the as yet unreleased FW_CDEV_IOC_SEND_STREAM_PACKET ioctl to generate an fw_cdev_event_response event just like the other two ioctls for asynchronous request transmission do. This way, clients get feedback on successful or unsuccessful transmission. This also adds input validation for length, tag, channel, sy, speed. 
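The patch below folds tag, channel and sy into a pseudo destination ID and range-checks them before handing the request to init_request(). A standalone restatement of that packing and of the bounds being enforced, mirroring the hunks rather than adding to them:

	#include <stdbool.h>

	/* Same packing as fw_stream_packet_destination_id() introduced below. */
	static inline int stream_packet_destination_id(int tag, int channel, int sy)
	{
		return tag << 14 | channel << 8 | sy;
	}

	/* Same bounds the reworked ioctl_send_stream_packet() enforces with -EINVAL. */
	static inline bool stream_params_valid(int tag, int channel, int sy)
	{
		return tag <= 3 && channel <= 63 && sy <= 15;
	}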
Signed-off-by: Stefan Richter --- drivers/firewire/fw-cdev.c | 46 ++++++++++++++++----------------------- drivers/firewire/fw-transaction.c | 42 ++++++++++++++--------------------- drivers/firewire/fw-transaction.h | 9 ++++---- include/linux/firewire-cdev.h | 31 +++++++++++++------------- 4 files changed, 56 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 95a207545eb3..7eb6594cc3e5 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -522,7 +522,8 @@ static int init_request(struct client *client, struct outbound_transaction_event *e; int ret; - if (request->length > 4096 || request->length > 512 << speed) + if (request->tcode != TCODE_STREAM_DATA && + (request->length > 4096 || request->length > 512 << speed)) return -EIO; e = kmalloc(sizeof(*e) + request->length, GFP_KERNEL); @@ -1247,36 +1248,27 @@ static int ioctl_send_broadcast_request(struct client *client, void *buffer) return init_request(client, request, LOCAL_BUS | 0x3f, SCODE_100); } -struct stream_packet { - struct fw_packet packet; - u8 data[0]; -}; - -static void send_stream_packet_done(struct fw_packet *packet, - struct fw_card *card, int status) -{ - kfree(container_of(packet, struct stream_packet, packet)); -} - static int ioctl_send_stream_packet(struct client *client, void *buffer) { - struct fw_cdev_send_stream_packet *request = buffer; - struct stream_packet *p; + struct fw_cdev_send_stream_packet *p = buffer; + struct fw_cdev_send_request request; + int dest; - p = kmalloc(sizeof(*p) + request->size, GFP_KERNEL); - if (p == NULL) - return -ENOMEM; + if (p->speed > client->device->card->link_speed || + p->length > 1024 << p->speed) + return -EIO; - if (request->data && - copy_from_user(p->data, u64_to_uptr(request->data), request->size)) { - kfree(p); - return -EFAULT; - } - fw_send_stream_packet(client->device->card, &p->packet, - request->generation, request->speed, - request->channel, request->sy, request->tag, - p->data, request->size, send_stream_packet_done); - return 0; + if (p->tag > 3 || p->channel > 63 || p->sy > 15) + return -EINVAL; + + dest = fw_stream_packet_destination_id(p->tag, p->channel, p->sy); + request.tcode = TCODE_STREAM_DATA; + request.length = p->length; + request.closure = p->closure; + request.data = p->data; + request.generation = p->generation; + + return init_request(client, &request, dest, p->speed); } static int (* const ioctl_handlers[])(struct client *client, void *buffer) = { diff --git a/drivers/firewire/fw-transaction.c b/drivers/firewire/fw-transaction.c index e3da58991960..4a9b37461c26 100644 --- a/drivers/firewire/fw-transaction.c +++ b/drivers/firewire/fw-transaction.c @@ -37,10 +37,6 @@ #include "fw-topology.h" #include "fw-device.h" -#define HEADER_TAG(tag) ((tag) << 14) -#define HEADER_CHANNEL(ch) ((ch) << 8) -#define HEADER_SY(sy) ((sy) << 0) - #define HEADER_PRI(pri) ((pri) << 0) #define HEADER_TCODE(tcode) ((tcode) << 4) #define HEADER_RETRY(retry) ((retry) << 8) @@ -158,6 +154,18 @@ static void fw_fill_request(struct fw_packet *packet, int tcode, int tlabel, { int ext_tcode; + if (tcode == TCODE_STREAM_DATA) { + packet->header[0] = + HEADER_DATA_LENGTH(length) | + destination_id | + HEADER_TCODE(TCODE_STREAM_DATA); + packet->header_length = 4; + packet->payload = payload; + packet->payload_length = length; + + goto common; + } + if (tcode > 0x10) { ext_tcode = tcode & ~0x10; tcode = TCODE_LOCK_REQUEST; @@ -204,7 +212,7 @@ static void fw_fill_request(struct 
fw_packet *packet, int tcode, int tlabel, packet->payload_length = 0; break; } - + common: packet->speed = speed; packet->generation = generation; packet->ack = 0; @@ -246,6 +254,9 @@ static void fw_fill_request(struct fw_packet *packet, int tcode, int tlabel, * @param callback function to be called when the transaction is completed * @param callback_data pointer to arbitrary data, which will be * passed to the callback + * + * In case of asynchronous stream packets i.e. TCODE_STREAM_DATA, the caller + * needs to synthesize @destination_id with fw_stream_packet_destination_id(). */ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, int destination_id, int generation, int speed, @@ -297,27 +308,6 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, } EXPORT_SYMBOL(fw_send_request); -void fw_send_stream_packet(struct fw_card *card, struct fw_packet *p, - int generation, int speed, int channel, int sy, int tag, - void *payload, size_t length, fw_packet_callback_t callback) -{ - p->callback = callback; - p->header[0] = - HEADER_DATA_LENGTH(length) - | HEADER_TAG(tag) - | HEADER_CHANNEL(channel) - | HEADER_TCODE(TCODE_STREAM_DATA) - | HEADER_SY(sy); - p->header_length = 4; - p->payload = payload; - p->payload_length = length; - p->speed = speed; - p->generation = generation; - p->ack = 0; - - card->driver->send_request(card, p); -} - struct transaction_callback_data { struct completion done; void *payload; diff --git a/drivers/firewire/fw-transaction.h b/drivers/firewire/fw-transaction.h index f90f09c05833..d4f42cecbdfa 100644 --- a/drivers/firewire/fw-transaction.h +++ b/drivers/firewire/fw-transaction.h @@ -412,10 +412,6 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, int destination_id, int generation, int speed, unsigned long long offset, void *payload, size_t length, fw_transaction_callback_t callback, void *callback_data); -void fw_send_stream_packet(struct fw_card *card, struct fw_packet *p, - int generation, int speed, int channel, int sy, int tag, - void *payload, size_t length, fw_packet_callback_t callback); - int fw_cancel_transaction(struct fw_card *card, struct fw_transaction *transaction); void fw_flush_transactions(struct fw_card *card); @@ -425,6 +421,11 @@ int fw_run_transaction(struct fw_card *card, int tcode, int destination_id, void fw_send_phy_config(struct fw_card *card, int node_id, int generation, int gap_count); +static inline int fw_stream_packet_destination_id(int tag, int channel, int sy) +{ + return tag << 14 | channel << 8 | sy; +} + /* * Called by the topology code to inform the device code of node * activity; found, lost, or updated nodes. 
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 25bc82726ef7..c6b3ca3af6df 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -606,28 +606,29 @@ struct fw_cdev_allocate_iso_resource { /** * struct fw_cdev_send_stream_packet - send an asynchronous stream packet - * @generation: Bus generation where the packet is valid - * @speed: Speed code to send the packet at - * @channel: Channel to send the packet on - * @sy: Four-bit sy code for the packet - * @tag: Two-bit tag field to use for the packet - * @size: Size of the packet's data payload - * @data: Userspace pointer to the payload + * @length: Length of outgoing payload, in bytes + * @tag: Data format tag + * @channel: Isochronous channel to transmit to + * @sy: Synchronization code + * @closure: Passed back to userspace in the response event + * @data: Userspace pointer to payload + * @generation: The bus generation where packet is valid + * @speed: Speed to transmit at * * The %FW_CDEV_IOC_SEND_STREAM_PACKET ioctl sends an asynchronous stream packet - * to every device (that is listening to the specified channel) on the - * firewire bus. It is the applications's job to ensure - * that the intended device(s) will be able to receive the packet at the chosen - * transmit speed. + * to every device which is listening to the specified channel. The kernel + * writes an &fw_cdev_event_response event which indicates success or failure of + * the transmission. */ struct fw_cdev_send_stream_packet { - __u32 generation; - __u32 speed; + __u32 length; + __u32 tag; __u32 channel; __u32 sy; - __u32 tag; - __u32 size; + __u64 closure; __u64 data; + __u32 generation; + __u32 speed; }; #endif /* _LINUX_FIREWIRE_CDEV_H */ -- cgit v1.2.3-71-gd317 From 7a254df007b3db88bd430474030fec92e7bab22a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:39 +0100 Subject: ide: move ide_pktcmd_tf_load() to ide-atapi.c Then make it static and remove 'dma' argument. 
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 21 ++++++++++++++++++++- drivers/ide/ide-io.c | 20 -------------------- include/linux/ide.h | 2 -- 3 files changed, 20 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index e9d042dba0e0..09ae30f46070 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -456,6 +456,25 @@ next_irq: return ide_started; } +static void ide_pktcmd_tf_load(ide_drive_t *drive, u32 tf_flags, u16 bcount) +{ + ide_hwif_t *hwif = drive->hwif; + ide_task_t task; + u8 dma = drive->dma; + + memset(&task, 0, sizeof(task)); + task.tf_flags = IDE_TFLAG_OUT_LBAH | IDE_TFLAG_OUT_LBAM | + IDE_TFLAG_OUT_FEATURE | tf_flags; + task.tf.feature = dma; /* Use PIO/DMA */ + task.tf.lbam = bcount & 0xff; + task.tf.lbah = (bcount >> 8) & 0xff; + + ide_tf_dump(drive->name, &task.tf); + hwif->tp_ops->set_irq(hwif, 1); + SELECT_MASK(drive, 0); + hwif->tp_ops->tf_load(drive, &task); +} + static u8 ide_read_ireason(ide_drive_t *drive) { ide_task_t task; @@ -629,7 +648,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive) : WAIT_TAPE_CMD; } - ide_pktcmd_tf_load(drive, tf_flags, bcount, drive->dma); + ide_pktcmd_tf_load(drive, tf_flags, bcount); /* Issue the packet command */ if (drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT) { diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index a9a6c208288a..4344b6119d77 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1192,26 +1192,6 @@ void ide_do_drive_cmd(ide_drive_t *drive, struct request *rq) } EXPORT_SYMBOL(ide_do_drive_cmd); -void ide_pktcmd_tf_load(ide_drive_t *drive, u32 tf_flags, u16 bcount, u8 dma) -{ - ide_hwif_t *hwif = drive->hwif; - ide_task_t task; - - memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_OUT_LBAH | IDE_TFLAG_OUT_LBAM | - IDE_TFLAG_OUT_FEATURE | tf_flags; - task.tf.feature = dma; /* Use PIO/DMA */ - task.tf.lbam = bcount & 0xff; - task.tf.lbah = (bcount >> 8) & 0xff; - - ide_tf_dump(drive->name, &task.tf); - hwif->tp_ops->set_irq(hwif, 1); - SELECT_MASK(drive, 0); - hwif->tp_ops->tf_load(drive, &task); -} - -EXPORT_SYMBOL_GPL(ide_pktcmd_tf_load); - void ide_pad_transfer(ide_drive_t *drive, int write, int len) { ide_hwif_t *hwif = drive->hwif; diff --git a/include/linux/ide.h b/include/linux/ide.h index 25087aead657..e7b787de5286 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1202,8 +1202,6 @@ void ide_read_bcount_and_ireason(ide_drive_t *, u16 *, u8 *); extern int drive_is_ready(ide_drive_t *); -void ide_pktcmd_tf_load(ide_drive_t *, u32, u16, u8); - int ide_check_atapi_device(ide_drive_t *, const char *); void ide_init_pc(struct ide_atapi_pc *); -- cgit v1.2.3-71-gd317 From 7ed5b157d9dff55bf477b4c8b4708d5d45476677 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:41 +0100 Subject: ide: add ide_for_each_present_dev() iterator * Add ide_for_each_present_dev() iterator and convert IDE code to use it. * Do some drive-by CodingStyle fixups in ide-acpi.c while at it. There should be no functional changes caused by this patch. 
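A minimal usage sketch of the new iterator (added below as ide_port_for_each_present_dev()); the function and the printk are purely illustrative, not part of the patch.

#include <linux/ide.h>
#include <linux/kernel.h>

static void example_list_present_devices(ide_hwif_t *hwif)
{
	ide_drive_t *drive;
	int i;

	/* visits only devices that have IDE_DFLAG_PRESENT set */
	ide_port_for_each_present_dev(i, drive, hwif)
		printk(KERN_INFO "%s: %s is present\n",
		       hwif->name, drive->name);
}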
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-acpi.c | 30 ++++++++++-------------------- drivers/ide/ide-iops.c | 5 ++--- drivers/ide/ide-probe.c | 38 +++++++++++++------------------------- include/linux/ide.h | 4 ++++ 4 files changed, 29 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index a3bebba18425..8d6d31fcbfba 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -608,17 +608,17 @@ void ide_acpi_set_state(ide_hwif_t *hwif, int on) DEBPRINT("no ACPI data for %s\n", hwif->name); return; } + /* channel first and then drives for power on and verse versa for power off */ if (on) acpi_bus_set_power(hwif->acpidata->obj_handle, ACPI_STATE_D0); - ide_port_for_each_dev(i, drive, hwif) { - if (drive->acpidata->obj_handle && - (drive->dev_flags & IDE_DFLAG_PRESENT)) { + ide_port_for_each_present_dev(i, drive, hwif) { + if (drive->acpidata->obj_handle) acpi_bus_set_power(drive->acpidata->obj_handle, - on? ACPI_STATE_D0: ACPI_STATE_D3); - } + on ? ACPI_STATE_D0 : ACPI_STATE_D3); } + if (!on) acpi_bus_set_power(hwif->acpidata->obj_handle, ACPI_STATE_D3); } @@ -667,12 +667,9 @@ void ide_acpi_port_init_devices(ide_hwif_t *hwif) hwif->devices[1]->acpidata = &hwif->acpidata->slave; /* get _ADR info for each device */ - ide_port_for_each_dev(i, drive, hwif) { + ide_port_for_each_present_dev(i, drive, hwif) { acpi_handle dev_handle; - if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0) - continue; - DEBPRINT("ENTER: %s at channel#: %d port#: %d\n", drive->name, hwif->channel, drive->dn & 1); @@ -685,13 +682,8 @@ void ide_acpi_port_init_devices(ide_hwif_t *hwif) drive->acpidata->obj_handle = dev_handle; } - /* - * Send IDENTIFY for each drive - */ - ide_port_for_each_dev(i, drive, hwif) { - if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0) - continue; - + /* send IDENTIFY for each device */ + ide_port_for_each_present_dev(i, drive, hwif) { err = taskfile_lib_get_identify(drive, drive->acpidata->idbuff); if (err) DEBPRINT("identify device %s failed (%d)\n", @@ -711,9 +703,7 @@ void ide_acpi_port_init_devices(ide_hwif_t *hwif) ide_acpi_get_timing(hwif); ide_acpi_push_timing(hwif); - ide_port_for_each_dev(i, drive, hwif) { - if (drive->dev_flags & IDE_DFLAG_PRESENT) - /* Execute ACPI startup code */ - ide_acpi_exec_tfs(drive); + ide_port_for_each_present_dev(i, drive, hwif) { + ide_acpi_exec_tfs(drive); } } diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index b1892bd95c6f..02fed32a4047 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -1103,9 +1103,8 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi) prepare_to_wait(&ide_park_wq, &wait, TASK_UNINTERRUPTIBLE); timeout = jiffies; - ide_port_for_each_dev(i, tdrive, hwif) { - if (tdrive->dev_flags & IDE_DFLAG_PRESENT && - tdrive->dev_flags & IDE_DFLAG_PARKED && + ide_port_for_each_present_dev(i, tdrive, hwif) { + if ((tdrive->dev_flags & IDE_DFLAG_PARKED) && time_after(tdrive->sleep, timeout)) timeout = tdrive->sleep; } diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 5deb7e717333..eb0a38cd83ce 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -825,22 +825,18 @@ static void ide_port_tune_devices(ide_hwif_t *hwif) ide_drive_t *drive; int i; - ide_port_for_each_dev(i, drive, hwif) { - if (drive->dev_flags & IDE_DFLAG_PRESENT) { - if (port_ops && port_ops->quirkproc) - port_ops->quirkproc(drive); - } + ide_port_for_each_present_dev(i, drive, hwif) { + if (port_ops && 
port_ops->quirkproc) + port_ops->quirkproc(drive); } - ide_port_for_each_dev(i, drive, hwif) { - if (drive->dev_flags & IDE_DFLAG_PRESENT) { - ide_set_max_pio(drive); + ide_port_for_each_present_dev(i, drive, hwif) { + ide_set_max_pio(drive); - drive->dev_flags |= IDE_DFLAG_NICE1; + drive->dev_flags |= IDE_DFLAG_NICE1; - if (hwif->dma_ops) - ide_set_dma(drive); - } + if (hwif->dma_ops) + ide_set_dma(drive); } } @@ -911,10 +907,7 @@ static int ide_port_setup_devices(ide_hwif_t *hwif) int i, j = 0; mutex_lock(&ide_cfg_mtx); - ide_port_for_each_dev(i, drive, hwif) { - if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0) - continue; - + ide_port_for_each_present_dev(i, drive, hwif) { if (ide_init_queue(drive)) { printk(KERN_ERR "ide: failed to init %s\n", drive->name); @@ -1139,13 +1132,10 @@ static void hwif_register_devices(ide_hwif_t *hwif) ide_drive_t *drive; unsigned int i; - ide_port_for_each_dev(i, drive, hwif) { + ide_port_for_each_present_dev(i, drive, hwif) { struct device *dev = &drive->gendev; int ret; - if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0) - continue; - dev_set_name(dev, "%u.%u", hwif->index, i); dev->parent = &hwif->gendev; dev->bus = &ide_bus_type; @@ -1610,11 +1600,9 @@ static void __ide_port_unregister_devices(ide_hwif_t *hwif) ide_drive_t *drive; int i; - ide_port_for_each_dev(i, drive, hwif) { - if (drive->dev_flags & IDE_DFLAG_PRESENT) { - device_unregister(&drive->gendev); - wait_for_completion(&drive->gendev_rel_comp); - } + ide_port_for_each_present_dev(i, drive, hwif) { + device_unregister(&drive->gendev); + wait_for_completion(&drive->gendev_rel_comp); } } diff --git a/include/linux/ide.h b/include/linux/ide.h index e7b787de5286..7ed395b4b891 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1609,6 +1609,10 @@ static inline ide_drive_t *ide_get_pair_dev(ide_drive_t *drive) #define ide_port_for_each_dev(i, dev, port) \ for ((i) = 0; ((dev) = (port)->devices[i]) || (i) < MAX_DRIVES; (i)++) +#define ide_port_for_each_present_dev(i, dev, port) \ + for ((i) = 0; ((dev) = (port)->devices[i]) || (i) < MAX_DRIVES; (i)++) \ + if ((dev)->dev_flags & IDE_DFLAG_PRESENT) + #define ide_host_for_each_port(i, port, host) \ for ((i) = 0; ((port) = (host)->ports[i]) || (i) < MAX_HOST_PORTS; (i)++) -- cgit v1.2.3-71-gd317 From 8b803bd184e3f6892284d4b50801b9ec85cd9b96 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:41 +0100 Subject: ide: sanitize ACPI initialization * ide_acpi_init() -> ide_acpi_init_port() * ide_acpi_blacklist() -> ide_acpi_init() * Call ide_acpi_init() only once (do it during IDE core initialization) and cleanup the function accordingly. 
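A sketch of the resulting call order, not part of the patch and assuming CONFIG_BLK_DEV_IDEACPI=y; the registration code around the two hooks is reduced to comments.

#include <linux/init.h>
#include <linux/ide.h>

/* one-time core init (ide.c): applies the ACPI DMI blacklist */
static int __init example_core_init(void)
{
	ide_acpi_init();
	/* ... bus type, port class and proc registration ... */
	return 0;
}

/* per-port init (ide-probe.c): ACPI handles first, then per-device setup */
static void example_register_port(ide_hwif_t *hwif)
{
	ide_acpi_init_port(hwif);
	if (hwif->present)
		ide_acpi_port_init_devices(hwif);
}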
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-acpi.c | 12 +++--------- drivers/ide/ide-probe.c | 2 +- drivers/ide/ide.c | 2 ++ include/linux/ide.h | 6 ++++-- 4 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 8d6d31fcbfba..ba5932d7b1bb 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -89,12 +89,8 @@ static const struct dmi_system_id ide_acpi_dmi_table[] = { { } /* terminate list */ }; -static int ide_acpi_blacklist(void) +int ide_acpi_init(void) { - static int done; - if (done) - return 0; - done = 1; dmi_check_system(ide_acpi_dmi_table); return 0; } @@ -624,7 +620,7 @@ void ide_acpi_set_state(ide_hwif_t *hwif, int on) } /** - * ide_acpi_init - initialize the ACPI link for an IDE interface + * ide_acpi_init_port - initialize the ACPI link for an IDE interface * @hwif: target IDE interface (channel) * * The ACPI spec is not quite clear when the drive identify buffer @@ -634,10 +630,8 @@ void ide_acpi_set_state(ide_hwif_t *hwif, int on) * So we get the information during startup; but this means that * any changes during run-time will be lost after resume. */ -void ide_acpi_init(ide_hwif_t *hwif) +void ide_acpi_init_port(ide_hwif_t *hwif) { - ide_acpi_blacklist(); - hwif->acpidata = kzalloc(sizeof(struct ide_acpi_hwif_link), GFP_KERNEL); if (!hwif->acpidata) return; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index eb0a38cd83ce..a51ad2bd62b4 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1543,7 +1543,7 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d, j++; - ide_acpi_init(hwif); + ide_acpi_init_port(hwif); if (hwif->present) ide_acpi_port_init_devices(hwif); diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 0920e3b0c962..c779aa24dbe6 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -527,6 +527,8 @@ static int __init ide_init(void) goto out_port_class; } + ide_acpi_init(); + proc_ide_create(); return 0; diff --git a/include/linux/ide.h b/include/linux/ide.h index 7ed395b4b891..6bb104f4e341 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1484,17 +1484,19 @@ static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ #ifdef CONFIG_BLK_DEV_IDEACPI +int ide_acpi_init(void); extern int ide_acpi_exec_tfs(ide_drive_t *drive); extern void ide_acpi_get_timing(ide_hwif_t *hwif); extern void ide_acpi_push_timing(ide_hwif_t *hwif); -extern void ide_acpi_init(ide_hwif_t *hwif); +void ide_acpi_init_port(ide_hwif_t *); void ide_acpi_port_init_devices(ide_hwif_t *); extern void ide_acpi_set_state(ide_hwif_t *hwif, int on); #else +static inline int ide_acpi_init(void) { return 0; } static inline int ide_acpi_exec_tfs(ide_drive_t *drive) { return 0; } static inline void ide_acpi_get_timing(ide_hwif_t *hwif) { ; } static inline void ide_acpi_push_timing(ide_hwif_t *hwif) { ; } -static inline void ide_acpi_init(ide_hwif_t *hwif) { ; } +static inline void ide_acpi_init_port(ide_hwif_t *hwif) { ; } static inline void ide_acpi_port_init_devices(ide_hwif_t *hwif) { ; } static inline void ide_acpi_set_state(ide_hwif_t *hwif, int on) {} #endif -- cgit v1.2.3-71-gd317 From b6a45a0b1e9a358b81201659cf87b023e3ec73e0 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:43 +0100 Subject: ide: move drive_is_ready() to ide-io.c Move drive_is_ready() to ide-io.c, then make it static. 
Also make some minor CodingStyle fixups while at it. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 23 +++++++++++++++++++++++ drivers/ide/ide-iops.c | 25 ------------------------- include/linux/ide.h | 2 -- 3 files changed, 23 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index d90cf5d08142..835cf646bb07 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -887,6 +887,29 @@ static void ide_plug_device(ide_drive_t *drive) spin_unlock_irqrestore(q->queue_lock, flags); } +static int drive_is_ready(ide_drive_t *drive) +{ + ide_hwif_t *hwif = drive->hwif; + u8 stat = 0; + + if (drive->waiting_for_dma) + return hwif->dma_ops->dma_test_irq(drive); + + if (hwif->io_ports.ctl_addr && + (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0) + stat = hwif->tp_ops->read_altstatus(hwif); + else + /* Note: this may clear a pending IRQ!! */ + stat = hwif->tp_ops->read_status(hwif); + + if (stat & ATA_BUSY) + /* drive busy: definitely not interrupting */ + return 0; + + /* drive ready: *might* be interrupting */ + return 1; +} + /** * ide_timer_expiry - handle lack of an IDE interrupt * @data: timer callback magic (hwif) diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index cd1f2e464c4b..ee9c60342787 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -438,31 +438,6 @@ void ide_fixstring (u8 *s, const int bytecount, const int byteswap) EXPORT_SYMBOL(ide_fixstring); -int drive_is_ready (ide_drive_t *drive) -{ - ide_hwif_t *hwif = drive->hwif; - u8 stat = 0; - - if (drive->waiting_for_dma) - return hwif->dma_ops->dma_test_irq(drive); - - if (hwif->io_ports.ctl_addr && - (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0) - stat = hwif->tp_ops->read_altstatus(hwif); - else - /* Note: this may clear a pending IRQ!! */ - stat = hwif->tp_ops->read_status(hwif); - - if (stat & ATA_BUSY) - /* drive busy: definitely not interrupting */ - return 0; - - /* drive ready: *might* be interrupting */ - return 1; -} - -EXPORT_SYMBOL(drive_is_ready); - /* * This routine busy-waits for the drive status to be not "busy". * It then checks the status for all of the "good" bits and none diff --git a/include/linux/ide.h b/include/linux/ide.h index 6bb104f4e341..2e95adeedff4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1200,8 +1200,6 @@ void SELECT_MASK(ide_drive_t *, int); u8 ide_read_error(ide_drive_t *); void ide_read_bcount_and_ireason(ide_drive_t *, u16 *, u8 *); -extern int drive_is_ready(ide_drive_t *); - int ide_check_atapi_device(ide_drive_t *, const char *); void ide_init_pc(struct ide_atapi_pc *); -- cgit v1.2.3-71-gd317 From 65ca5377322c7543163066f373ae9e6b0ad8de8a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:43 +0100 Subject: ide: move ide_dma_timeout_retry() to ide-dma.c Move ide_dma_timeout_retry() to ide-dma.c and add static inline version for CONFIG_BLK_DEV_IDEDMA=n. 
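A hedged sketch of the intended caller pattern, not part of the patch; the in-tree user is the timeout path in ide-io.c, and the locking plus the computation of 'wait' are omitted here.

#include <linux/ide.h>

static ide_startstop_t example_timeout_path(ide_drive_t *drive, int wait)
{
	ide_hwif_t *hwif = drive->hwif;

	if (hwif->dma_ops == NULL)
		return ide_error(drive, "irq timeout",
				 hwif->tp_ops->read_status(hwif));

	/* ends the DMA transaction, disables DMA and re-queues the
	 * current chunk so it is retried in PIO mode */
	return ide_dma_timeout_retry(drive, wait);
}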
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-dma.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/ide/ide-io.c | 57 --------------------------------------------------- include/linux/ide.h | 2 ++ 3 files changed, 59 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 059c90bb5ad2..a878f4734f81 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -470,6 +470,63 @@ void ide_dma_timeout(ide_drive_t *drive) } EXPORT_SYMBOL_GPL(ide_dma_timeout); +/* + * un-busy the port etc, and clear any pending DMA status. we want to + * retry the current request in pio mode instead of risking tossing it + * all away + */ +ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) +{ + ide_hwif_t *hwif = drive->hwif; + struct request *rq; + ide_startstop_t ret = ide_stopped; + + /* + * end current dma transaction + */ + + if (error < 0) { + printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); + (void)hwif->dma_ops->dma_end(drive); + ret = ide_error(drive, "dma timeout error", + hwif->tp_ops->read_status(hwif)); + } else { + printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); + hwif->dma_ops->dma_timeout(drive); + } + + /* + * disable dma for now, but remember that we did so because of + * a timeout -- we'll reenable after we finish this next request + * (or rather the first chunk of it) in pio. + */ + drive->dev_flags |= IDE_DFLAG_DMA_PIO_RETRY; + drive->retry_pio++; + ide_dma_off_quietly(drive); + + /* + * un-busy drive etc and make sure request is sane + */ + + rq = hwif->rq; + if (!rq) + goto out; + + hwif->rq = NULL; + + rq->errors = 0; + + if (!rq->bio) + goto out; + + rq->sector = rq->bio->bi_sector; + rq->current_nr_sectors = bio_iovec(rq->bio)->bv_len >> 9; + rq->hard_cur_sectors = rq->current_nr_sectors; + rq->buffer = bio_data(rq->bio); +out: + return ret; +} + void ide_release_dma_engine(ide_hwif_t *hwif) { if (hwif->dmatable_cpu) { diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 835cf646bb07..557b15700ea2 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -819,63 +819,6 @@ plug_device_2: blk_plug_device(q); } -/* - * un-busy the port etc, and clear any pending DMA status. we want to - * retry the current request in pio mode instead of risking tossing it - * all away - */ -static ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) -{ - ide_hwif_t *hwif = drive->hwif; - struct request *rq; - ide_startstop_t ret = ide_stopped; - - /* - * end current dma transaction - */ - - if (error < 0) { - printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); - (void)hwif->dma_ops->dma_end(drive); - ret = ide_error(drive, "dma timeout error", - hwif->tp_ops->read_status(hwif)); - } else { - printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); - hwif->dma_ops->dma_timeout(drive); - } - - /* - * disable dma for now, but remember that we did so because of - * a timeout -- we'll reenable after we finish this next request - * (or rather the first chunk of it) in pio. 
- */ - drive->dev_flags |= IDE_DFLAG_DMA_PIO_RETRY; - drive->retry_pio++; - ide_dma_off_quietly(drive); - - /* - * un-busy drive etc and make sure request is sane - */ - - rq = hwif->rq; - if (!rq) - goto out; - - hwif->rq = NULL; - - rq->errors = 0; - - if (!rq->bio) - goto out; - - rq->sector = rq->bio->bi_sector; - rq->current_nr_sectors = bio_iovec(rq->bio)->bv_len >> 9; - rq->hard_cur_sectors = rq->current_nr_sectors; - rq->buffer = bio_data(rq->bio); -out: - return ret; -} - static void ide_plug_device(ide_drive_t *drive) { struct request_queue *q = drive->queue; diff --git a/include/linux/ide.h b/include/linux/ide.h index 2e95adeedff4..d0065a90452b 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1467,6 +1467,7 @@ static inline int config_drive_for_dma(ide_drive_t *drive) { return 0; } void ide_dma_lost_irq(ide_drive_t *); void ide_dma_timeout(ide_drive_t *); +ide_startstop_t ide_dma_timeout_retry(ide_drive_t *, int); #else static inline int ide_id_dma_bug(ide_drive_t *drive) { return 0; } @@ -1478,6 +1479,7 @@ static inline void ide_dma_on(ide_drive_t *drive) { ; } static inline void ide_dma_verbose(ide_drive_t *drive) { ; } static inline int ide_set_dma(ide_drive_t *drive) { return 1; } static inline void ide_check_dma_crc(ide_drive_t *drive) { ; } +static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { return ide_stopped; } static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ -- cgit v1.2.3-71-gd317 From 1866082339597930c5b77aad8de34ab4fbb5724f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:44 +0100 Subject: ide: remove ide_do_drive_cmd() * Use elv_add_request() instead of __elv_add_request() in ide_do_drive_cmd(). * ide_do_drive_cmd() is used only in ide-{atapi,cd}.c so inline it there. There should be no functional changes caused by this patch. 
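A short sketch (not part of the patch) of the open-coded replacement used in the hunks below, mainly to spell out why the explicit queue locking disappears: elv_add_request() acquires q->queue_lock with IRQs disabled itself, whereas __elv_add_request() expects the caller to hold it.

#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/ide.h>

static void example_queue_at_head(ide_drive_t *drive, struct request *rq)
{
	/* displace the request currently being processed */
	drive->hwif->rq = NULL;

	/* no spin_lock_irqsave() needed: elv_add_request() locks internally */
	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
}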
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 5 ++++- drivers/ide/ide-cd.c | 4 +++- drivers/ide/ide-io.c | 28 ---------------------------- include/linux/ide.h | 2 -- 4 files changed, 7 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 09ae30f46070..3044c51c06a5 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -149,7 +149,10 @@ static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk, memcpy(rq->cmd, pc->c, 12); if (drive->media == ide_tape) rq->cmd[13] = REQ_IDETAPE_PC1; - ide_do_drive_cmd(drive, rq); + + drive->hwif->rq = NULL; + + elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); } /* diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index ddfbea41d296..2177cd11664c 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -242,7 +242,9 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x\n", failed_command->cmd[0]); - ide_do_drive_cmd(drive, rq); + drive->hwif->rq = NULL; + + elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); } static void cdrom_end_request(ide_drive_t *drive, int uptodate) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 557b15700ea2..56be3375bee4 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1129,34 +1129,6 @@ out_early: } EXPORT_SYMBOL_GPL(ide_intr); -/** - * ide_do_drive_cmd - issue IDE special command - * @drive: device to issue command - * @rq: request to issue - * - * This function issues a special IDE device request - * onto the request queue. - * - * the rq is queued at the head of the request queue, displacing - * the currently-being-processed request and this function - * returns immediately without waiting for the new rq to be - * completed. This is VERY DANGEROUS, and is intended for - * careful use by the ATAPI tape/cdrom driver code. 
- */ - -void ide_do_drive_cmd(ide_drive_t *drive, struct request *rq) -{ - struct request_queue *q = drive->queue; - unsigned long flags; - - drive->hwif->rq = NULL; - - spin_lock_irqsave(q->queue_lock, flags); - __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(ide_do_drive_cmd); - void ide_pad_transfer(ide_drive_t *drive, int write, int len) { ide_hwif_t *hwif = drive->hwif; diff --git a/include/linux/ide.h b/include/linux/ide.h index d0065a90452b..8fadffe53cde 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1174,8 +1174,6 @@ extern ide_startstop_t ide_do_reset (ide_drive_t *); extern int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, int arg); -extern void ide_do_drive_cmd(ide_drive_t *, struct request *); - extern void ide_end_drive_cmd(ide_drive_t *, u8, u8); void ide_tf_dump(const char *, struct ide_taskfile *); -- cgit v1.2.3-71-gd317 From c4e66c36cce3f23d68013c4112013123ffe80bdb Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:44 +0100 Subject: ide: move ide_do_park_unpark() to ide-park.c Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 24 ------------------------ drivers/ide/ide-park.c | 25 +++++++++++++++++++++++++ include/linux/ide.h | 2 ++ 3 files changed, 27 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c37883ae2662..16e47989fcfd 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -527,30 +527,6 @@ static ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq) return ide_stopped; } -static ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq) -{ - ide_task_t task; - struct ide_taskfile *tf = &task.tf; - - memset(&task, 0, sizeof(task)); - if (rq->cmd[0] == REQ_PARK_HEADS) { - drive->sleep = *(unsigned long *)rq->special; - drive->dev_flags |= IDE_DFLAG_SLEEPING; - tf->command = ATA_CMD_IDLEIMMEDIATE; - tf->feature = 0x44; - tf->lbal = 0x4c; - tf->lbam = 0x4e; - tf->lbah = 0x55; - task.tf_flags |= IDE_TFLAG_CUSTOM_HANDLER; - } else /* cmd == REQ_UNPARK_HEADS */ - tf->command = ATA_CMD_CHK_POWER; - - task.tf_flags |= IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - task.rq = rq; - drive->hwif->data_phase = task.data_phase = TASKFILE_NO_DATA; - return do_rw_taskfile(drive, &task); -} - static ide_startstop_t ide_special_rq(ide_drive_t *drive, struct request *rq) { u8 cmd = rq->cmd[0]; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index c875a957596c..f30e52152fcb 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -60,6 +61,30 @@ out: return; } +ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq) +{ + ide_task_t task; + struct ide_taskfile *tf = &task.tf; + + memset(&task, 0, sizeof(task)); + if (rq->cmd[0] == REQ_PARK_HEADS) { + drive->sleep = *(unsigned long *)rq->special; + drive->dev_flags |= IDE_DFLAG_SLEEPING; + tf->command = ATA_CMD_IDLEIMMEDIATE; + tf->feature = 0x44; + tf->lbal = 0x4c; + tf->lbam = 0x4e; + tf->lbah = 0x55; + task.tf_flags |= IDE_TFLAG_CUSTOM_HANDLER; + } else /* cmd == REQ_UNPARK_HEADS */ + tf->command = ATA_CMD_CHK_POWER; + + task.tf_flags |= IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + task.rq = rq; + drive->hwif->data_phase = task.data_phase = TASKFILE_NO_DATA; + return do_rw_taskfile(drive, &task); +} + ssize_t ide_park_show(struct device *dev, struct 
device_attribute *attr, char *buf) { diff --git a/include/linux/ide.h b/include/linux/ide.h index 8fadffe53cde..110d26359897 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1169,6 +1169,8 @@ int ide_busy_sleep(ide_hwif_t *, unsigned long, int); int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); +ide_startstop_t ide_do_park_unpark(ide_drive_t *, struct request *); + extern ide_startstop_t ide_do_reset (ide_drive_t *); extern int ide_devset_execute(ide_drive_t *drive, -- cgit v1.2.3-71-gd317 From 11938c929022bb92b1a42f5e1289524a1e465dc0 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:44 +0100 Subject: ide: move device settings code to ide-devsets.c Remove stale comment from ide.c while at it. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/Makefile | 3 +- drivers/ide/ide-devsets.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/ide/ide-io.c | 37 --------- drivers/ide/ide.c | 154 ------------------------------------- include/linux/ide.h | 1 + 5 files changed, 193 insertions(+), 192 deletions(-) create mode 100644 drivers/ide/ide-devsets.c (limited to 'include/linux') diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile index 1c326d94aa6d..83a970ee4bfe 100644 --- a/drivers/ide/Makefile +++ b/drivers/ide/Makefile @@ -5,7 +5,8 @@ EXTRA_CFLAGS += -Idrivers/ide ide-core-y += ide.o ide-ioctls.o ide-io.o ide-iops.o ide-lib.o ide-probe.o \ - ide-taskfile.o ide-pm.o ide-park.o ide-pio-blacklist.o ide-sysfs.o + ide-taskfile.o ide-pm.o ide-park.o ide-pio-blacklist.o \ + ide-sysfs.o ide-devsets.o # core IDE code ide-core-$(CONFIG_IDE_TIMINGS) += ide-timings.o diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c new file mode 100644 index 000000000000..7c3953414d47 --- /dev/null +++ b/drivers/ide/ide-devsets.c @@ -0,0 +1,190 @@ + +#include +#include + +DEFINE_MUTEX(ide_setting_mtx); + +ide_devset_get(io_32bit, io_32bit); + +static int set_io_32bit(ide_drive_t *drive, int arg) +{ + if (drive->dev_flags & IDE_DFLAG_NO_IO_32BIT) + return -EPERM; + + if (arg < 0 || arg > 1 + (SUPPORT_VLB_SYNC << 1)) + return -EINVAL; + + drive->io_32bit = arg; + + return 0; +} + +ide_devset_get_flag(ksettings, IDE_DFLAG_KEEP_SETTINGS); + +static int set_ksettings(ide_drive_t *drive, int arg) +{ + if (arg < 0 || arg > 1) + return -EINVAL; + + if (arg) + drive->dev_flags |= IDE_DFLAG_KEEP_SETTINGS; + else + drive->dev_flags &= ~IDE_DFLAG_KEEP_SETTINGS; + + return 0; +} + +ide_devset_get_flag(using_dma, IDE_DFLAG_USING_DMA); + +static int set_using_dma(ide_drive_t *drive, int arg) +{ +#ifdef CONFIG_BLK_DEV_IDEDMA + int err = -EPERM; + + if (arg < 0 || arg > 1) + return -EINVAL; + + if (ata_id_has_dma(drive->id) == 0) + goto out; + + if (drive->hwif->dma_ops == NULL) + goto out; + + err = 0; + + if (arg) { + if (ide_set_dma(drive)) + err = -EIO; + } else + ide_dma_off(drive); + +out: + return err; +#else + if (arg < 0 || arg > 1) + return -EINVAL; + + return -EPERM; +#endif +} + +/* + * handle HDIO_SET_PIO_MODE ioctl abusers here, eventually it will go away + */ +static int set_pio_mode_abuse(ide_hwif_t *hwif, u8 req_pio) +{ + switch (req_pio) { + case 202: + case 201: + case 200: + case 102: + case 101: + case 100: + return (hwif->host_flags & IDE_HFLAG_ABUSE_DMA_MODES) ? 1 : 0; + case 9: + case 8: + return (hwif->host_flags & IDE_HFLAG_ABUSE_PREFETCH) ? 1 : 0; + case 7: + case 6: + return (hwif->host_flags & IDE_HFLAG_ABUSE_FAST_DEVSEL) ? 
1 : 0; + default: + return 0; + } +} + +static int set_pio_mode(ide_drive_t *drive, int arg) +{ + ide_hwif_t *hwif = drive->hwif; + const struct ide_port_ops *port_ops = hwif->port_ops; + + if (arg < 0 || arg > 255) + return -EINVAL; + + if (port_ops == NULL || port_ops->set_pio_mode == NULL || + (hwif->host_flags & IDE_HFLAG_NO_SET_MODE)) + return -ENOSYS; + + if (set_pio_mode_abuse(drive->hwif, arg)) { + if (arg == 8 || arg == 9) { + unsigned long flags; + + /* take lock for IDE_DFLAG_[NO_]UNMASK/[NO_]IO_32BIT */ + spin_lock_irqsave(&hwif->lock, flags); + port_ops->set_pio_mode(drive, arg); + spin_unlock_irqrestore(&hwif->lock, flags); + } else + port_ops->set_pio_mode(drive, arg); + } else { + int keep_dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); + + ide_set_pio(drive, arg); + + if (hwif->host_flags & IDE_HFLAG_SET_PIO_MODE_KEEP_DMA) { + if (keep_dma) + ide_dma_on(drive); + } + } + + return 0; +} + +ide_devset_get_flag(unmaskirq, IDE_DFLAG_UNMASK); + +static int set_unmaskirq(ide_drive_t *drive, int arg) +{ + if (drive->dev_flags & IDE_DFLAG_NO_UNMASK) + return -EPERM; + + if (arg < 0 || arg > 1) + return -EINVAL; + + if (arg) + drive->dev_flags |= IDE_DFLAG_UNMASK; + else + drive->dev_flags &= ~IDE_DFLAG_UNMASK; + + return 0; +} + +ide_ext_devset_rw_sync(io_32bit, io_32bit); +ide_ext_devset_rw_sync(keepsettings, ksettings); +ide_ext_devset_rw_sync(unmaskirq, unmaskirq); +ide_ext_devset_rw_sync(using_dma, using_dma); +__IDE_DEVSET(pio_mode, DS_SYNC, NULL, set_pio_mode); + +int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, + int arg) +{ + struct request_queue *q = drive->queue; + struct request *rq; + int ret = 0; + + if (!(setting->flags & DS_SYNC)) + return setting->set(drive, arg); + + rq = blk_get_request(q, READ, __GFP_WAIT); + rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_len = 5; + rq->cmd[0] = REQ_DEVSET_EXEC; + *(int *)&rq->cmd[1] = arg; + rq->special = setting->set; + + if (blk_execute_rq(q, NULL, rq, 0)) + ret = rq->errors; + blk_put_request(rq); + + return ret; +} + +ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq) +{ + int err, (*setfunc)(ide_drive_t *, int) = rq->special; + + err = setfunc(drive, *(int *)&rq->cmd[1]); + if (err) + rq->errors = err; + else + err = 1; + ide_end_request(drive, err, 0); + return ide_stopped; +} diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 16e47989fcfd..74d1a3e68252 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -490,43 +490,6 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, return ide_stopped; } -int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, - int arg) -{ - struct request_queue *q = drive->queue; - struct request *rq; - int ret = 0; - - if (!(setting->flags & DS_SYNC)) - return setting->set(drive, arg); - - rq = blk_get_request(q, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; - rq->cmd_len = 5; - rq->cmd[0] = REQ_DEVSET_EXEC; - *(int *)&rq->cmd[1] = arg; - rq->special = setting->set; - - if (blk_execute_rq(q, NULL, rq, 0)) - ret = rq->errors; - blk_put_request(rq); - - return ret; -} - -static ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq) -{ - int err, (*setfunc)(ide_drive_t *, int) = rq->special; - - err = setfunc(drive, *(int *)&rq->cmd[1]); - if (err) - rq->errors = err; - else - err = 1; - ide_end_request(drive, err, 0); - return ide_stopped; -} - static ide_startstop_t ide_special_rq(ide_drive_t *drive, struct request *rq) { u8 cmd = rq->cmd[0]; diff --git a/drivers/ide/ide.c 
b/drivers/ide/ide.c index c779aa24dbe6..92c9b90931e7 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -62,160 +62,6 @@ struct class *ide_port_class; -/* - * Locks for IDE setting functionality - */ - -DEFINE_MUTEX(ide_setting_mtx); - -ide_devset_get(io_32bit, io_32bit); - -static int set_io_32bit(ide_drive_t *drive, int arg) -{ - if (drive->dev_flags & IDE_DFLAG_NO_IO_32BIT) - return -EPERM; - - if (arg < 0 || arg > 1 + (SUPPORT_VLB_SYNC << 1)) - return -EINVAL; - - drive->io_32bit = arg; - - return 0; -} - -ide_devset_get_flag(ksettings, IDE_DFLAG_KEEP_SETTINGS); - -static int set_ksettings(ide_drive_t *drive, int arg) -{ - if (arg < 0 || arg > 1) - return -EINVAL; - - if (arg) - drive->dev_flags |= IDE_DFLAG_KEEP_SETTINGS; - else - drive->dev_flags &= ~IDE_DFLAG_KEEP_SETTINGS; - - return 0; -} - -ide_devset_get_flag(using_dma, IDE_DFLAG_USING_DMA); - -static int set_using_dma(ide_drive_t *drive, int arg) -{ -#ifdef CONFIG_BLK_DEV_IDEDMA - int err = -EPERM; - - if (arg < 0 || arg > 1) - return -EINVAL; - - if (ata_id_has_dma(drive->id) == 0) - goto out; - - if (drive->hwif->dma_ops == NULL) - goto out; - - err = 0; - - if (arg) { - if (ide_set_dma(drive)) - err = -EIO; - } else - ide_dma_off(drive); - -out: - return err; -#else - if (arg < 0 || arg > 1) - return -EINVAL; - - return -EPERM; -#endif -} - -/* - * handle HDIO_SET_PIO_MODE ioctl abusers here, eventually it will go away - */ -static int set_pio_mode_abuse(ide_hwif_t *hwif, u8 req_pio) -{ - switch (req_pio) { - case 202: - case 201: - case 200: - case 102: - case 101: - case 100: - return (hwif->host_flags & IDE_HFLAG_ABUSE_DMA_MODES) ? 1 : 0; - case 9: - case 8: - return (hwif->host_flags & IDE_HFLAG_ABUSE_PREFETCH) ? 1 : 0; - case 7: - case 6: - return (hwif->host_flags & IDE_HFLAG_ABUSE_FAST_DEVSEL) ? 
1 : 0; - default: - return 0; - } -} - -static int set_pio_mode(ide_drive_t *drive, int arg) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_port_ops *port_ops = hwif->port_ops; - - if (arg < 0 || arg > 255) - return -EINVAL; - - if (port_ops == NULL || port_ops->set_pio_mode == NULL || - (hwif->host_flags & IDE_HFLAG_NO_SET_MODE)) - return -ENOSYS; - - if (set_pio_mode_abuse(drive->hwif, arg)) { - if (arg == 8 || arg == 9) { - unsigned long flags; - - /* take lock for IDE_DFLAG_[NO_]UNMASK/[NO_]IO_32BIT */ - spin_lock_irqsave(&hwif->lock, flags); - port_ops->set_pio_mode(drive, arg); - spin_unlock_irqrestore(&hwif->lock, flags); - } else - port_ops->set_pio_mode(drive, arg); - } else { - int keep_dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); - - ide_set_pio(drive, arg); - - if (hwif->host_flags & IDE_HFLAG_SET_PIO_MODE_KEEP_DMA) { - if (keep_dma) - ide_dma_on(drive); - } - } - - return 0; -} - -ide_devset_get_flag(unmaskirq, IDE_DFLAG_UNMASK); - -static int set_unmaskirq(ide_drive_t *drive, int arg) -{ - if (drive->dev_flags & IDE_DFLAG_NO_UNMASK) - return -EPERM; - - if (arg < 0 || arg > 1) - return -EINVAL; - - if (arg) - drive->dev_flags |= IDE_DFLAG_UNMASK; - else - drive->dev_flags &= ~IDE_DFLAG_UNMASK; - - return 0; -} - -ide_ext_devset_rw_sync(io_32bit, io_32bit); -ide_ext_devset_rw_sync(keepsettings, ksettings); -ide_ext_devset_rw_sync(unmaskirq, unmaskirq); -ide_ext_devset_rw_sync(using_dma, using_dma); -__IDE_DEVSET(pio_mode, DS_SYNC, NULL, set_pio_mode); - /** * ide_device_get - get an additional reference to a ide_drive_t * @drive: device to get a reference to diff --git a/include/linux/ide.h b/include/linux/ide.h index 110d26359897..eca5082c3437 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1170,6 +1170,7 @@ int ide_busy_sleep(ide_hwif_t *, unsigned long, int); int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); ide_startstop_t ide_do_park_unpark(ide_drive_t *, struct request *); +ide_startstop_t ide_do_devset(ide_drive_t *, struct request *); extern ide_startstop_t ide_do_reset (ide_drive_t *); -- cgit v1.2.3-71-gd317 From 7eeaaaa52285d5e6cb79f515e99c591dcb9d04fe Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:46 +0100 Subject: ide: move xfer mode tuning code to ide-xfer-mode.c * Move xfer mode tuning code to ide-xfer-mode.c. * Add CONFIG_IDE_XFER_MODE config option to be selected by host drivers that support xfer mode tuning. * Add CONFIG_IDE_XFER_MODE=n static inline versions of ide_set_pio() and ide_set_xfer_rate(). * Make IDE_TIMINGS and BLK_DEV_IDEDMA config options select IDE_XFER_MODE, also add explicit selects for few host drivers that need it. * Build/link ide-xfer-mode.o and ide-pio-blacklist.o (it is needed only by ide-xfer-mode.o) only if CONFIG_IDE_XFER_MODE=y. 
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/Kconfig | 8 ++ drivers/ide/Makefile | 4 +- drivers/ide/ide-lib.c | 240 ------------------------------------------ drivers/ide/ide-xfer-mode.c | 246 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/ide.h | 12 ++- 5 files changed, 263 insertions(+), 247 deletions(-) create mode 100644 drivers/ide/ide-xfer-mode.c (limited to 'include/linux') diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 5ea3bfad172a..640c99207242 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -56,8 +56,12 @@ if IDE comment "Please see Documentation/ide/ide.txt for help/info on IDE drives" +config IDE_XFER_MODE + bool + config IDE_TIMINGS bool + select IDE_XFER_MODE config IDE_ATAPI bool @@ -698,6 +702,7 @@ config BLK_DEV_IDE_PMAC_ATA100FIRST config BLK_DEV_IDE_AU1XXX bool "IDE for AMD Alchemy Au1200" depends on SOC_AU1200 + select IDE_XFER_MODE choice prompt "IDE Mode for AMD Alchemy Au1200" default CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA @@ -871,6 +876,7 @@ config BLK_DEV_ALI14XX config BLK_DEV_DTC2278 tristate "DTC-2278 support" + select IDE_XFER_MODE select IDE_LEGACY help This driver is enabled at runtime using the "dtc2278.probe" kernel @@ -902,6 +908,7 @@ config BLK_DEV_QD65XX config BLK_DEV_UMC8672 tristate "UMC-8672 support" + select IDE_XFER_MODE select IDE_LEGACY help This driver is enabled at runtime using the "umc8672.probe" kernel @@ -915,5 +922,6 @@ endif config BLK_DEV_IDEDMA def_bool BLK_DEV_IDEDMA_SFF || \ BLK_DEV_IDEDMA_ICS || BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA + select IDE_XFER_MODE endif # IDE diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile index 83a970ee4bfe..d0976e6ee090 100644 --- a/drivers/ide/Makefile +++ b/drivers/ide/Makefile @@ -5,10 +5,10 @@ EXTRA_CFLAGS += -Idrivers/ide ide-core-y += ide.o ide-ioctls.o ide-io.o ide-iops.o ide-lib.o ide-probe.o \ - ide-taskfile.o ide-pm.o ide-park.o ide-pio-blacklist.o \ - ide-sysfs.o ide-devsets.o + ide-taskfile.o ide-pm.o ide-park.o ide-sysfs.o ide-devsets.o # core IDE code +ide-core-$(CONFIG_IDE_XFER_MODE) += ide-pio-blacklist.o ide-xfer-mode.o ide-core-$(CONFIG_IDE_TIMINGS) += ide-timings.o ide-core-$(CONFIG_IDE_ATAPI) += ide-atapi.o ide-core-$(CONFIG_BLK_DEV_IDEPCI) += setup-pci.o diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index 09526a0de734..f6c683dd2987 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -5,163 +5,6 @@ #include #include -static const char *udma_str[] = - { "UDMA/16", "UDMA/25", "UDMA/33", "UDMA/44", - "UDMA/66", "UDMA/100", "UDMA/133", "UDMA7" }; -static const char *mwdma_str[] = - { "MWDMA0", "MWDMA1", "MWDMA2" }; -static const char *swdma_str[] = - { "SWDMA0", "SWDMA1", "SWDMA2" }; -static const char *pio_str[] = - { "PIO0", "PIO1", "PIO2", "PIO3", "PIO4", "PIO5" }; - -/** - * ide_xfer_verbose - return IDE mode names - * @mode: transfer mode - * - * Returns a constant string giving the name of the mode - * requested. 
- */ - -const char *ide_xfer_verbose(u8 mode) -{ - const char *s; - u8 i = mode & 0xf; - - if (mode >= XFER_UDMA_0 && mode <= XFER_UDMA_7) - s = udma_str[i]; - else if (mode >= XFER_MW_DMA_0 && mode <= XFER_MW_DMA_2) - s = mwdma_str[i]; - else if (mode >= XFER_SW_DMA_0 && mode <= XFER_SW_DMA_2) - s = swdma_str[i]; - else if (mode >= XFER_PIO_0 && mode <= XFER_PIO_5) - s = pio_str[i & 0x7]; - else if (mode == XFER_PIO_SLOW) - s = "PIO SLOW"; - else - s = "XFER ERROR"; - - return s; -} -EXPORT_SYMBOL(ide_xfer_verbose); - -/** - * ide_rate_filter - filter transfer mode - * @drive: IDE device - * @speed: desired speed - * - * Given the available transfer modes this function returns - * the best available speed at or below the speed requested. - * - * TODO: check device PIO capabilities - */ - -static u8 ide_rate_filter(ide_drive_t *drive, u8 speed) -{ - ide_hwif_t *hwif = drive->hwif; - u8 mode = ide_find_dma_mode(drive, speed); - - if (mode == 0) { - if (hwif->pio_mask) - mode = fls(hwif->pio_mask) - 1 + XFER_PIO_0; - else - mode = XFER_PIO_4; - } - -/* printk("%s: mode 0x%02x, speed 0x%02x\n", __func__, mode, speed); */ - - return min(speed, mode); -} - -/** - * ide_get_best_pio_mode - get PIO mode from drive - * @drive: drive to consider - * @mode_wanted: preferred mode - * @max_mode: highest allowed mode - * - * This routine returns the recommended PIO settings for a given drive, - * based on the drive->id information and the ide_pio_blacklist[]. - * - * Drive PIO mode is auto-selected if 255 is passed as mode_wanted. - * This is used by most chipset support modules when "auto-tuning". - */ - -u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) -{ - u16 *id = drive->id; - int pio_mode = -1, overridden = 0; - - if (mode_wanted != 255) - return min_t(u8, mode_wanted, max_mode); - - if ((drive->hwif->host_flags & IDE_HFLAG_PIO_NO_BLACKLIST) == 0) - pio_mode = ide_scan_pio_blacklist((char *)&id[ATA_ID_PROD]); - - if (pio_mode != -1) { - printk(KERN_INFO "%s: is on PIO blacklist\n", drive->name); - } else { - pio_mode = id[ATA_ID_OLD_PIO_MODES] >> 8; - if (pio_mode > 2) { /* 2 is maximum allowed tPIO value */ - pio_mode = 2; - overridden = 1; - } - - if (id[ATA_ID_FIELD_VALID] & 2) { /* ATA2? */ - if (ata_id_has_iordy(id)) { - if (id[ATA_ID_PIO_MODES] & 7) { - overridden = 0; - if (id[ATA_ID_PIO_MODES] & 4) - pio_mode = 5; - else if (id[ATA_ID_PIO_MODES] & 2) - pio_mode = 4; - else - pio_mode = 3; - } - } - } - - if (overridden) - printk(KERN_INFO "%s: tPIO > 2, assuming tPIO = 2\n", - drive->name); - } - - if (pio_mode > max_mode) - pio_mode = max_mode; - - return pio_mode; -} -EXPORT_SYMBOL_GPL(ide_get_best_pio_mode); - -/* req_pio == "255" for auto-tune */ -void ide_set_pio(ide_drive_t *drive, u8 req_pio) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_port_ops *port_ops = hwif->port_ops; - u8 host_pio, pio; - - if (port_ops == NULL || port_ops->set_pio_mode == NULL || - (hwif->host_flags & IDE_HFLAG_NO_SET_MODE)) - return; - - BUG_ON(hwif->pio_mask == 0x00); - - host_pio = fls(hwif->pio_mask) - 1; - - pio = ide_get_best_pio_mode(drive, req_pio, host_pio); - - /* - * TODO: - * - report device max PIO mode - * - check req_pio != 255 against device max PIO mode - */ - printk(KERN_DEBUG "%s: host max PIO%d wanted PIO%d%s selected PIO%d\n", - drive->name, host_pio, req_pio, - req_pio == 255 ? 
"(auto-tune)" : "", pio); - - (void)ide_set_pio_mode(drive, XFER_PIO_0 + pio); -} -EXPORT_SYMBOL_GPL(ide_set_pio); - /** * ide_toggle_bounce - handle bounce buffering * @drive: drive to update @@ -188,89 +31,6 @@ void ide_toggle_bounce(ide_drive_t *drive, int on) blk_queue_bounce_limit(drive->queue, addr); } -int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_port_ops *port_ops = hwif->port_ops; - - if (hwif->host_flags & IDE_HFLAG_NO_SET_MODE) - return 0; - - if (port_ops == NULL || port_ops->set_pio_mode == NULL) - return -1; - - /* - * TODO: temporary hack for some legacy host drivers that didn't - * set transfer mode on the device in ->set_pio_mode method... - */ - if (port_ops->set_dma_mode == NULL) { - port_ops->set_pio_mode(drive, mode - XFER_PIO_0); - return 0; - } - - if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) { - if (ide_config_drive_speed(drive, mode)) - return -1; - port_ops->set_pio_mode(drive, mode - XFER_PIO_0); - return 0; - } else { - port_ops->set_pio_mode(drive, mode - XFER_PIO_0); - return ide_config_drive_speed(drive, mode); - } -} - -int ide_set_dma_mode(ide_drive_t *drive, const u8 mode) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_port_ops *port_ops = hwif->port_ops; - - if (hwif->host_flags & IDE_HFLAG_NO_SET_MODE) - return 0; - - if (port_ops == NULL || port_ops->set_dma_mode == NULL) - return -1; - - if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) { - if (ide_config_drive_speed(drive, mode)) - return -1; - port_ops->set_dma_mode(drive, mode); - return 0; - } else { - port_ops->set_dma_mode(drive, mode); - return ide_config_drive_speed(drive, mode); - } -} -EXPORT_SYMBOL_GPL(ide_set_dma_mode); - -/** - * ide_set_xfer_rate - set transfer rate - * @drive: drive to set - * @rate: speed to attempt to set - * - * General helper for setting the speed of an IDE device. This - * function knows about user enforced limits from the configuration - * which ->set_pio_mode/->set_dma_mode does not. - */ - -int ide_set_xfer_rate(ide_drive_t *drive, u8 rate) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_port_ops *port_ops = hwif->port_ops; - - if (port_ops == NULL || port_ops->set_dma_mode == NULL || - (hwif->host_flags & IDE_HFLAG_NO_SET_MODE)) - return -1; - - rate = ide_rate_filter(drive, rate); - - BUG_ON(rate < XFER_PIO_0); - - if (rate >= XFER_PIO_0 && rate <= XFER_PIO_5) - return ide_set_pio_mode(drive, rate); - - return ide_set_dma_mode(drive, rate); -} - static void ide_dump_opcode(ide_drive_t *drive) { struct request *rq = drive->hwif->rq; diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c new file mode 100644 index 000000000000..6910f6a257e8 --- /dev/null +++ b/drivers/ide/ide-xfer-mode.c @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include + +static const char *udma_str[] = + { "UDMA/16", "UDMA/25", "UDMA/33", "UDMA/44", + "UDMA/66", "UDMA/100", "UDMA/133", "UDMA7" }; +static const char *mwdma_str[] = + { "MWDMA0", "MWDMA1", "MWDMA2" }; +static const char *swdma_str[] = + { "SWDMA0", "SWDMA1", "SWDMA2" }; +static const char *pio_str[] = + { "PIO0", "PIO1", "PIO2", "PIO3", "PIO4", "PIO5" }; + +/** + * ide_xfer_verbose - return IDE mode names + * @mode: transfer mode + * + * Returns a constant string giving the name of the mode + * requested. 
+ */ + +const char *ide_xfer_verbose(u8 mode) +{ + const char *s; + u8 i = mode & 0xf; + + if (mode >= XFER_UDMA_0 && mode <= XFER_UDMA_7) + s = udma_str[i]; + else if (mode >= XFER_MW_DMA_0 && mode <= XFER_MW_DMA_2) + s = mwdma_str[i]; + else if (mode >= XFER_SW_DMA_0 && mode <= XFER_SW_DMA_2) + s = swdma_str[i]; + else if (mode >= XFER_PIO_0 && mode <= XFER_PIO_5) + s = pio_str[i & 0x7]; + else if (mode == XFER_PIO_SLOW) + s = "PIO SLOW"; + else + s = "XFER ERROR"; + + return s; +} +EXPORT_SYMBOL(ide_xfer_verbose); + +/** + * ide_get_best_pio_mode - get PIO mode from drive + * @drive: drive to consider + * @mode_wanted: preferred mode + * @max_mode: highest allowed mode + * + * This routine returns the recommended PIO settings for a given drive, + * based on the drive->id information and the ide_pio_blacklist[]. + * + * Drive PIO mode is auto-selected if 255 is passed as mode_wanted. + * This is used by most chipset support modules when "auto-tuning". + */ + +u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) +{ + u16 *id = drive->id; + int pio_mode = -1, overridden = 0; + + if (mode_wanted != 255) + return min_t(u8, mode_wanted, max_mode); + + if ((drive->hwif->host_flags & IDE_HFLAG_PIO_NO_BLACKLIST) == 0) + pio_mode = ide_scan_pio_blacklist((char *)&id[ATA_ID_PROD]); + + if (pio_mode != -1) { + printk(KERN_INFO "%s: is on PIO blacklist\n", drive->name); + } else { + pio_mode = id[ATA_ID_OLD_PIO_MODES] >> 8; + if (pio_mode > 2) { /* 2 is maximum allowed tPIO value */ + pio_mode = 2; + overridden = 1; + } + + if (id[ATA_ID_FIELD_VALID] & 2) { /* ATA2? */ + if (ata_id_has_iordy(id)) { + if (id[ATA_ID_PIO_MODES] & 7) { + overridden = 0; + if (id[ATA_ID_PIO_MODES] & 4) + pio_mode = 5; + else if (id[ATA_ID_PIO_MODES] & 2) + pio_mode = 4; + else + pio_mode = 3; + } + } + } + + if (overridden) + printk(KERN_INFO "%s: tPIO > 2, assuming tPIO = 2\n", + drive->name); + } + + if (pio_mode > max_mode) + pio_mode = max_mode; + + return pio_mode; +} +EXPORT_SYMBOL_GPL(ide_get_best_pio_mode); + +int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) +{ + ide_hwif_t *hwif = drive->hwif; + const struct ide_port_ops *port_ops = hwif->port_ops; + + if (hwif->host_flags & IDE_HFLAG_NO_SET_MODE) + return 0; + + if (port_ops == NULL || port_ops->set_pio_mode == NULL) + return -1; + + /* + * TODO: temporary hack for some legacy host drivers that didn't + * set transfer mode on the device in ->set_pio_mode method... 
+ */ + if (port_ops->set_dma_mode == NULL) { + port_ops->set_pio_mode(drive, mode - XFER_PIO_0); + return 0; + } + + if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) { + if (ide_config_drive_speed(drive, mode)) + return -1; + port_ops->set_pio_mode(drive, mode - XFER_PIO_0); + return 0; + } else { + port_ops->set_pio_mode(drive, mode - XFER_PIO_0); + return ide_config_drive_speed(drive, mode); + } +} + +int ide_set_dma_mode(ide_drive_t *drive, const u8 mode) +{ + ide_hwif_t *hwif = drive->hwif; + const struct ide_port_ops *port_ops = hwif->port_ops; + + if (hwif->host_flags & IDE_HFLAG_NO_SET_MODE) + return 0; + + if (port_ops == NULL || port_ops->set_dma_mode == NULL) + return -1; + + if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) { + if (ide_config_drive_speed(drive, mode)) + return -1; + port_ops->set_dma_mode(drive, mode); + return 0; + } else { + port_ops->set_dma_mode(drive, mode); + return ide_config_drive_speed(drive, mode); + } +} +EXPORT_SYMBOL_GPL(ide_set_dma_mode); + +/* req_pio == "255" for auto-tune */ +void ide_set_pio(ide_drive_t *drive, u8 req_pio) +{ + ide_hwif_t *hwif = drive->hwif; + const struct ide_port_ops *port_ops = hwif->port_ops; + u8 host_pio, pio; + + if (port_ops == NULL || port_ops->set_pio_mode == NULL || + (hwif->host_flags & IDE_HFLAG_NO_SET_MODE)) + return; + + BUG_ON(hwif->pio_mask == 0x00); + + host_pio = fls(hwif->pio_mask) - 1; + + pio = ide_get_best_pio_mode(drive, req_pio, host_pio); + + /* + * TODO: + * - report device max PIO mode + * - check req_pio != 255 against device max PIO mode + */ + printk(KERN_DEBUG "%s: host max PIO%d wanted PIO%d%s selected PIO%d\n", + drive->name, host_pio, req_pio, + req_pio == 255 ? "(auto-tune)" : "", pio); + + (void)ide_set_pio_mode(drive, XFER_PIO_0 + pio); +} +EXPORT_SYMBOL_GPL(ide_set_pio); + +/** + * ide_rate_filter - filter transfer mode + * @drive: IDE device + * @speed: desired speed + * + * Given the available transfer modes this function returns + * the best available speed at or below the speed requested. + * + * TODO: check device PIO capabilities + */ + +static u8 ide_rate_filter(ide_drive_t *drive, u8 speed) +{ + ide_hwif_t *hwif = drive->hwif; + u8 mode = ide_find_dma_mode(drive, speed); + + if (mode == 0) { + if (hwif->pio_mask) + mode = fls(hwif->pio_mask) - 1 + XFER_PIO_0; + else + mode = XFER_PIO_4; + } + +/* printk("%s: mode 0x%02x, speed 0x%02x\n", __func__, mode, speed); */ + + return min(speed, mode); +} + +/** + * ide_set_xfer_rate - set transfer rate + * @drive: drive to set + * @rate: speed to attempt to set + * + * General helper for setting the speed of an IDE device. This + * function knows about user enforced limits from the configuration + * which ->set_pio_mode/->set_dma_mode does not. 
+ */ + +int ide_set_xfer_rate(ide_drive_t *drive, u8 rate) +{ + ide_hwif_t *hwif = drive->hwif; + const struct ide_port_ops *port_ops = hwif->port_ops; + + if (port_ops == NULL || port_ops->set_dma_mode == NULL || + (hwif->host_flags & IDE_HFLAG_NO_SET_MODE)) + return -1; + + rate = ide_rate_filter(drive, rate); + + BUG_ON(rate < XFER_PIO_0); + + if (rate >= XFER_PIO_0 && rate <= XFER_PIO_5) + return ide_set_pio_mode(drive, rate); + + return ide_set_dma_mode(drive, rate); +} diff --git a/include/linux/ide.h b/include/linux/ide.h index eca5082c3437..323c3710fbf4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1531,9 +1531,7 @@ static inline void ide_set_hwifdata (ide_hwif_t * hwif, void *data) hwif->hwif_data = data; } -const char *ide_xfer_verbose(u8 mode); extern void ide_toggle_bounce(ide_drive_t *drive, int on); -extern int ide_set_xfer_rate(ide_drive_t *drive, u8 rate); u64 ide_get_lba_addr(struct ide_taskfile *, int); u8 ide_dump_status(ide_drive_t *, const char *, u8); @@ -1572,14 +1570,18 @@ void ide_timing_merge(struct ide_timing *, struct ide_timing *, struct ide_timing *, unsigned int); int ide_timing_compute(ide_drive_t *, u8, struct ide_timing *, int, int); +#ifdef CONFIG_IDE_XFER_MODE int ide_scan_pio_blacklist(char *); - +const char *ide_xfer_verbose(u8); u8 ide_get_best_pio_mode(ide_drive_t *, u8, u8); - int ide_set_pio_mode(ide_drive_t *, u8); int ide_set_dma_mode(ide_drive_t *, u8); - void ide_set_pio(ide_drive_t *, u8); +int ide_set_xfer_rate(ide_drive_t *, u8); +#else +static inline void ide_set_pio(ide_drive_t *drive, u8 pio) { ; } +static inline int ide_set_xfer_rate(ide_drive_t *drive, u8 rate) { return -1; } +#endif static inline void ide_set_max_pio(ide_drive_t *drive) { -- cgit v1.2.3-71-gd317 From 327fa1c29466b8fe471a91fc11e9c6171163c81a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:47 +0100 Subject: ide: move error handling code to ide-eh.c (v2) Do some CodingStyle fixups while at it. v2: Add missing include (reported by Stephen Rothwell). 
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/Makefile | 2 +- drivers/ide/ide-eh.c | 428 +++++++++++++++++++++++++++++++++++++++++++++++++ drivers/ide/ide-io.c | 129 +-------------- drivers/ide/ide-iops.c | 299 +--------------------------------- include/linux/ide.h | 13 +- 5 files changed, 440 insertions(+), 431 deletions(-) create mode 100644 drivers/ide/ide-eh.c (limited to 'include/linux') diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile index cbb1aea2aea3..9b4bbe1cdc1a 100644 --- a/drivers/ide/Makefile +++ b/drivers/ide/Makefile @@ -6,7 +6,7 @@ EXTRA_CFLAGS += -Idrivers/ide ide-core-y += ide.o ide-ioctls.o ide-io.o ide-iops.o ide-lib.o ide-probe.o \ ide-taskfile.o ide-pm.o ide-park.o ide-sysfs.o ide-devsets.o \ - ide-io-std.o + ide-io-std.o ide-eh.o # core IDE code ide-core-$(CONFIG_IDE_XFER_MODE) += ide-pio-blacklist.o ide-xfer-mode.o diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c new file mode 100644 index 000000000000..1231b5e486f2 --- /dev/null +++ b/drivers/ide/ide-eh.c @@ -0,0 +1,428 @@ + +#include +#include +#include + +static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq, + u8 stat, u8 err) +{ + ide_hwif_t *hwif = drive->hwif; + + if ((stat & ATA_BUSY) || + ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) { + /* other bits are useless when BUSY */ + rq->errors |= ERROR_RESET; + } else if (stat & ATA_ERR) { + /* err has different meaning on cdrom and tape */ + if (err == ATA_ABORTED) { + if ((drive->dev_flags & IDE_DFLAG_LBA) && + /* some newer drives don't support ATA_CMD_INIT_DEV_PARAMS */ + hwif->tp_ops->read_status(hwif) == ATA_CMD_INIT_DEV_PARAMS) + return ide_stopped; + } else if ((err & BAD_CRC) == BAD_CRC) { + /* UDMA crc error, just retry the operation */ + drive->crc_count++; + } else if (err & (ATA_BBK | ATA_UNC)) { + /* retries won't help these */ + rq->errors = ERROR_MAX; + } else if (err & ATA_TRK0NF) { + /* help it find track zero */ + rq->errors |= ERROR_RECAL; + } + } + + if ((stat & ATA_DRQ) && rq_data_dir(rq) == READ && + (hwif->host_flags & IDE_HFLAG_ERROR_STOPS_FIFO) == 0) { + int nsect = drive->mult_count ? 
drive->mult_count : 1; + + ide_pad_transfer(drive, READ, nsect * SECTOR_SIZE); + } + + if (rq->errors >= ERROR_MAX || blk_noretry_request(rq)) { + ide_kill_rq(drive, rq); + return ide_stopped; + } + + if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ)) + rq->errors |= ERROR_RESET; + + if ((rq->errors & ERROR_RESET) == ERROR_RESET) { + ++rq->errors; + return ide_do_reset(drive); + } + + if ((rq->errors & ERROR_RECAL) == ERROR_RECAL) + drive->special.b.recalibrate = 1; + + ++rq->errors; + + return ide_stopped; +} + +static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq, + u8 stat, u8 err) +{ + ide_hwif_t *hwif = drive->hwif; + + if ((stat & ATA_BUSY) || + ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) { + /* other bits are useless when BUSY */ + rq->errors |= ERROR_RESET; + } else { + /* add decoding error stuff */ + } + + if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ)) + /* force an abort */ + hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE); + + if (rq->errors >= ERROR_MAX) { + ide_kill_rq(drive, rq); + } else { + if ((rq->errors & ERROR_RESET) == ERROR_RESET) { + ++rq->errors; + return ide_do_reset(drive); + } + ++rq->errors; + } + + return ide_stopped; +} + +static ide_startstop_t __ide_error(ide_drive_t *drive, struct request *rq, + u8 stat, u8 err) +{ + if (drive->media == ide_disk) + return ide_ata_error(drive, rq, stat, err); + return ide_atapi_error(drive, rq, stat, err); +} + +/** + * ide_error - handle an error on the IDE + * @drive: drive the error occurred on + * @msg: message to report + * @stat: status bits + * + * ide_error() takes action based on the error returned by the drive. + * For normal I/O that may well include retries. We deal with + * both new-style (taskfile) and old style command handling here. + * In the case of taskfile command handling there is work left to + * do + */ + +ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) +{ + struct request *rq; + u8 err; + + err = ide_dump_status(drive, msg, stat); + + rq = drive->hwif->rq; + if (rq == NULL) + return ide_stopped; + + /* retry only "normal" I/O: */ + if (!blk_fs_request(rq)) { + rq->errors = 1; + ide_end_drive_cmd(drive, stat, err); + return ide_stopped; + } + + return __ide_error(drive, rq, stat, err); +} +EXPORT_SYMBOL_GPL(ide_error); + +static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) +{ + struct request *rq = drive->hwif->rq; + + if (rq && blk_special_request(rq) && rq->cmd[0] == REQ_DRIVE_RESET) + ide_end_request(drive, err ? err : 1, 0); +} + +/* needed below */ +static ide_startstop_t do_reset1(ide_drive_t *, int); + +/* + * atapi_reset_pollfunc() gets invoked to poll the interface for completion + * every 50ms during an atapi drive reset operation. If the drive has not yet + * responded, and we have not yet hit our maximum waiting time, then the timer + * is restarted for another 50ms. 
+ */ +static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive) +{ + ide_hwif_t *hwif = drive->hwif; + u8 stat; + + SELECT_DRIVE(drive); + udelay(10); + stat = hwif->tp_ops->read_status(hwif); + + if (OK_STAT(stat, 0, ATA_BUSY)) + printk(KERN_INFO "%s: ATAPI reset complete\n", drive->name); + else { + if (time_before(jiffies, hwif->poll_timeout)) { + ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, + NULL); + /* continue polling */ + return ide_started; + } + /* end of polling */ + hwif->polling = 0; + printk(KERN_ERR "%s: ATAPI reset timed-out, status=0x%02x\n", + drive->name, stat); + /* do it the old fashioned way */ + return do_reset1(drive, 1); + } + /* done polling */ + hwif->polling = 0; + ide_complete_drive_reset(drive, 0); + return ide_stopped; +} + +static void ide_reset_report_error(ide_hwif_t *hwif, u8 err) +{ + static const char *err_master_vals[] = + { NULL, "passed", "formatter device error", + "sector buffer error", "ECC circuitry error", + "controlling MPU error" }; + + u8 err_master = err & 0x7f; + + printk(KERN_ERR "%s: reset: master: ", hwif->name); + if (err_master && err_master < 6) + printk(KERN_CONT "%s", err_master_vals[err_master]); + else + printk(KERN_CONT "error (0x%02x?)", err); + if (err & 0x80) + printk(KERN_CONT "; slave: failed"); + printk(KERN_CONT "\n"); +} + +/* + * reset_pollfunc() gets invoked to poll the interface for completion every 50ms + * during an ide reset operation. If the drives have not yet responded, + * and we have not yet hit our maximum waiting time, then the timer is restarted + * for another 50ms. + */ +static ide_startstop_t reset_pollfunc(ide_drive_t *drive) +{ + ide_hwif_t *hwif = drive->hwif; + const struct ide_port_ops *port_ops = hwif->port_ops; + u8 tmp; + int err = 0; + + if (port_ops && port_ops->reset_poll) { + err = port_ops->reset_poll(drive); + if (err) { + printk(KERN_ERR "%s: host reset_poll failure for %s.\n", + hwif->name, drive->name); + goto out; + } + } + + tmp = hwif->tp_ops->read_status(hwif); + + if (!OK_STAT(tmp, 0, ATA_BUSY)) { + if (time_before(jiffies, hwif->poll_timeout)) { + ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL); + /* continue polling */ + return ide_started; + } + printk(KERN_ERR "%s: reset timed-out, status=0x%02x\n", + hwif->name, tmp); + drive->failures++; + err = -EIO; + } else { + tmp = ide_read_error(drive); + + if (tmp == 1) { + printk(KERN_INFO "%s: reset: success\n", hwif->name); + drive->failures = 0; + } else { + ide_reset_report_error(hwif, tmp); + drive->failures++; + err = -EIO; + } + } +out: + hwif->polling = 0; /* done polling */ + ide_complete_drive_reset(drive, err); + return ide_stopped; +} + +static void ide_disk_pre_reset(ide_drive_t *drive) +{ + int legacy = (drive->id[ATA_ID_CFS_ENABLE_2] & 0x0400) ? 
0 : 1; + + drive->special.all = 0; + drive->special.b.set_geometry = legacy; + drive->special.b.recalibrate = legacy; + + drive->mult_count = 0; + drive->dev_flags &= ~IDE_DFLAG_PARKED; + + if ((drive->dev_flags & IDE_DFLAG_KEEP_SETTINGS) == 0 && + (drive->dev_flags & IDE_DFLAG_USING_DMA) == 0) + drive->mult_req = 0; + + if (drive->mult_req != drive->mult_count) + drive->special.b.set_multmode = 1; +} + +static void pre_reset(ide_drive_t *drive) +{ + const struct ide_port_ops *port_ops = drive->hwif->port_ops; + + if (drive->media == ide_disk) + ide_disk_pre_reset(drive); + else + drive->dev_flags |= IDE_DFLAG_POST_RESET; + + if (drive->dev_flags & IDE_DFLAG_USING_DMA) { + if (drive->crc_count) + ide_check_dma_crc(drive); + else + ide_dma_off(drive); + } + + if ((drive->dev_flags & IDE_DFLAG_KEEP_SETTINGS) == 0) { + if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0) { + drive->dev_flags &= ~IDE_DFLAG_UNMASK; + drive->io_32bit = 0; + } + return; + } + + if (port_ops && port_ops->pre_reset) + port_ops->pre_reset(drive); + + if (drive->current_speed != 0xff) + drive->desired_speed = drive->current_speed; + drive->current_speed = 0xff; +} + +/* + * do_reset1() attempts to recover a confused drive by resetting it. + * Unfortunately, resetting a disk drive actually resets all devices on + * the same interface, so it can really be thought of as resetting the + * interface rather than resetting the drive. + * + * ATAPI devices have their own reset mechanism which allows them to be + * individually reset without clobbering other devices on the same interface. + * + * Unfortunately, the IDE interface does not generate an interrupt to let + * us know when the reset operation has finished, so we must poll for this. + * Equally poor, though, is the fact that this may a very long time to complete, + * (up to 30 seconds worstcase). So, instead of busy-waiting here for it, + * we set a timer to poll at 50ms intervals. + */ +static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) +{ + ide_hwif_t *hwif = drive->hwif; + struct ide_io_ports *io_ports = &hwif->io_ports; + const struct ide_tp_ops *tp_ops = hwif->tp_ops; + const struct ide_port_ops *port_ops; + ide_drive_t *tdrive; + unsigned long flags, timeout; + int i; + DEFINE_WAIT(wait); + + spin_lock_irqsave(&hwif->lock, flags); + + /* We must not reset with running handlers */ + BUG_ON(hwif->handler != NULL); + + /* For an ATAPI device, first try an ATAPI SRST. */ + if (drive->media != ide_disk && !do_not_try_atapi) { + pre_reset(drive); + SELECT_DRIVE(drive); + udelay(20); + tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); + ndelay(400); + hwif->poll_timeout = jiffies + WAIT_WORSTCASE; + hwif->polling = 1; + __ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, NULL); + spin_unlock_irqrestore(&hwif->lock, flags); + return ide_started; + } + + /* We must not disturb devices in the IDE_DFLAG_PARKED state. 
*/ + do { + unsigned long now; + + prepare_to_wait(&ide_park_wq, &wait, TASK_UNINTERRUPTIBLE); + timeout = jiffies; + ide_port_for_each_present_dev(i, tdrive, hwif) { + if ((tdrive->dev_flags & IDE_DFLAG_PARKED) && + time_after(tdrive->sleep, timeout)) + timeout = tdrive->sleep; + } + + now = jiffies; + if (time_before_eq(timeout, now)) + break; + + spin_unlock_irqrestore(&hwif->lock, flags); + timeout = schedule_timeout_uninterruptible(timeout - now); + spin_lock_irqsave(&hwif->lock, flags); + } while (timeout); + finish_wait(&ide_park_wq, &wait); + + /* + * First, reset any device state data we were maintaining + * for any of the drives on this interface. + */ + ide_port_for_each_dev(i, tdrive, hwif) + pre_reset(tdrive); + + if (io_ports->ctl_addr == 0) { + spin_unlock_irqrestore(&hwif->lock, flags); + ide_complete_drive_reset(drive, -ENXIO); + return ide_stopped; + } + + /* + * Note that we also set nIEN while resetting the device, + * to mask unwanted interrupts from the interface during the reset. + * However, due to the design of PC hardware, this will cause an + * immediate interrupt due to the edge transition it produces. + * This single interrupt gives us a "fast poll" for drives that + * recover from reset very quickly, saving us the first 50ms wait time. + * + * TODO: add ->softreset method and stop abusing ->set_irq + */ + /* set SRST and nIEN */ + tp_ops->set_irq(hwif, 4); + /* more than enough time */ + udelay(10); + /* clear SRST, leave nIEN (unless device is on the quirk list) */ + tp_ops->set_irq(hwif, drive->quirk_list == 2); + /* more than enough time */ + udelay(10); + hwif->poll_timeout = jiffies + WAIT_WORSTCASE; + hwif->polling = 1; + __ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL); + + /* + * Some weird controller like resetting themselves to a strange + * state when the disks are reset this way. At least, the Winbond + * 553 documentation says that + */ + port_ops = hwif->port_ops; + if (port_ops && port_ops->resetproc) + port_ops->resetproc(drive); + + spin_unlock_irqrestore(&hwif->lock, flags); + return ide_started; +} + +/* + * ide_do_reset() is the entry point to the drive/interface reset code. 
+ */ + +ide_startstop_t ide_do_reset(ide_drive_t *drive) +{ + return do_reset1(drive, 0); +} +EXPORT_SYMBOL(ide_do_reset); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 74d1a3e68252..2e92497b58aa 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -196,7 +196,7 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err) } EXPORT_SYMBOL(ide_end_drive_cmd); -static void ide_kill_rq(ide_drive_t *drive, struct request *rq) +void ide_kill_rq(ide_drive_t *drive, struct request *rq) { if (rq->rq_disk) { struct ide_driver *drv; @@ -207,133 +207,6 @@ static void ide_kill_rq(ide_drive_t *drive, struct request *rq) ide_end_request(drive, 0, 0); } -static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq, u8 stat, u8 err) -{ - ide_hwif_t *hwif = drive->hwif; - - if ((stat & ATA_BUSY) || - ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) { - /* other bits are useless when BUSY */ - rq->errors |= ERROR_RESET; - } else if (stat & ATA_ERR) { - /* err has different meaning on cdrom and tape */ - if (err == ATA_ABORTED) { - if ((drive->dev_flags & IDE_DFLAG_LBA) && - /* some newer drives don't support ATA_CMD_INIT_DEV_PARAMS */ - hwif->tp_ops->read_status(hwif) == ATA_CMD_INIT_DEV_PARAMS) - return ide_stopped; - } else if ((err & BAD_CRC) == BAD_CRC) { - /* UDMA crc error, just retry the operation */ - drive->crc_count++; - } else if (err & (ATA_BBK | ATA_UNC)) { - /* retries won't help these */ - rq->errors = ERROR_MAX; - } else if (err & ATA_TRK0NF) { - /* help it find track zero */ - rq->errors |= ERROR_RECAL; - } - } - - if ((stat & ATA_DRQ) && rq_data_dir(rq) == READ && - (hwif->host_flags & IDE_HFLAG_ERROR_STOPS_FIFO) == 0) { - int nsect = drive->mult_count ? drive->mult_count : 1; - - ide_pad_transfer(drive, READ, nsect * SECTOR_SIZE); - } - - if (rq->errors >= ERROR_MAX || blk_noretry_request(rq)) { - ide_kill_rq(drive, rq); - return ide_stopped; - } - - if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ)) - rq->errors |= ERROR_RESET; - - if ((rq->errors & ERROR_RESET) == ERROR_RESET) { - ++rq->errors; - return ide_do_reset(drive); - } - - if ((rq->errors & ERROR_RECAL) == ERROR_RECAL) - drive->special.b.recalibrate = 1; - - ++rq->errors; - - return ide_stopped; -} - -static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq, u8 stat, u8 err) -{ - ide_hwif_t *hwif = drive->hwif; - - if ((stat & ATA_BUSY) || - ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) { - /* other bits are useless when BUSY */ - rq->errors |= ERROR_RESET; - } else { - /* add decoding error stuff */ - } - - if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ)) - /* force an abort */ - hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE); - - if (rq->errors >= ERROR_MAX) { - ide_kill_rq(drive, rq); - } else { - if ((rq->errors & ERROR_RESET) == ERROR_RESET) { - ++rq->errors; - return ide_do_reset(drive); - } - ++rq->errors; - } - - return ide_stopped; -} - -static ide_startstop_t -__ide_error(ide_drive_t *drive, struct request *rq, u8 stat, u8 err) -{ - if (drive->media == ide_disk) - return ide_ata_error(drive, rq, stat, err); - return ide_atapi_error(drive, rq, stat, err); -} - -/** - * ide_error - handle an error on the IDE - * @drive: drive the error occurred on - * @msg: message to report - * @stat: status bits - * - * ide_error() takes action based on the error returned by the drive. - * For normal I/O that may well include retries. 
We deal with - * both new-style (taskfile) and old style command handling here. - * In the case of taskfile command handling there is work left to - * do - */ - -ide_startstop_t ide_error (ide_drive_t *drive, const char *msg, u8 stat) -{ - struct request *rq; - u8 err; - - err = ide_dump_status(drive, msg, stat); - - rq = drive->hwif->rq; - if (rq == NULL) - return ide_stopped; - - /* retry only "normal" I/O: */ - if (!blk_fs_request(rq)) { - rq->errors = 1; - ide_end_drive_cmd(drive, stat, err); - return ide_stopped; - } - - return __ide_error(drive, rq, stat, err); -} -EXPORT_SYMBOL_GPL(ide_error); - static void ide_tf_set_specify_cmd(ide_drive_t *drive, struct ide_taskfile *tf) { tf->nsect = drive->sect; diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index cf6c3036ae7f..e0cfa2d2acc7 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -446,8 +446,8 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) * * See also ide_execute_command */ -static void __ide_set_handler (ide_drive_t *drive, ide_handler_t *handler, - unsigned int timeout, ide_expiry_t *expiry) +void __ide_set_handler(ide_drive_t *drive, ide_handler_t *handler, + unsigned int timeout, ide_expiry_t *expiry) { ide_hwif_t *hwif = drive->hwif; @@ -517,301 +517,6 @@ void ide_execute_pkt_cmd(ide_drive_t *drive) } EXPORT_SYMBOL_GPL(ide_execute_pkt_cmd); -static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) -{ - struct request *rq = drive->hwif->rq; - - if (rq && blk_special_request(rq) && rq->cmd[0] == REQ_DRIVE_RESET) - ide_end_request(drive, err ? err : 1, 0); -} - -/* needed below */ -static ide_startstop_t do_reset1(ide_drive_t *, int); - -/* - * atapi_reset_pollfunc() gets invoked to poll the interface for completion - * every 50ms during an atapi drive reset operation. If the drive has not yet - * responded, and we have not yet hit our maximum waiting time, then the timer - * is restarted for another 50ms. - */ -static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive) -{ - ide_hwif_t *hwif = drive->hwif; - u8 stat; - - SELECT_DRIVE(drive); - udelay(10); - stat = hwif->tp_ops->read_status(hwif); - - if (OK_STAT(stat, 0, ATA_BUSY)) - printk(KERN_INFO "%s: ATAPI reset complete\n", drive->name); - else { - if (time_before(jiffies, hwif->poll_timeout)) { - ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, - NULL); - /* continue polling */ - return ide_started; - } - /* end of polling */ - hwif->polling = 0; - printk(KERN_ERR "%s: ATAPI reset timed-out, status=0x%02x\n", - drive->name, stat); - /* do it the old fashioned way */ - return do_reset1(drive, 1); - } - /* done polling */ - hwif->polling = 0; - ide_complete_drive_reset(drive, 0); - return ide_stopped; -} - -static void ide_reset_report_error(ide_hwif_t *hwif, u8 err) -{ - static const char *err_master_vals[] = - { NULL, "passed", "formatter device error", - "sector buffer error", "ECC circuitry error", - "controlling MPU error" }; - - u8 err_master = err & 0x7f; - - printk(KERN_ERR "%s: reset: master: ", hwif->name); - if (err_master && err_master < 6) - printk(KERN_CONT "%s", err_master_vals[err_master]); - else - printk(KERN_CONT "error (0x%02x?)", err); - if (err & 0x80) - printk(KERN_CONT "; slave: failed"); - printk(KERN_CONT "\n"); -} - -/* - * reset_pollfunc() gets invoked to poll the interface for completion every 50ms - * during an ide reset operation. If the drives have not yet responded, - * and we have not yet hit our maximum waiting time, then the timer is restarted - * for another 50ms. 
- */ -static ide_startstop_t reset_pollfunc(ide_drive_t *drive) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_port_ops *port_ops = hwif->port_ops; - u8 tmp; - int err = 0; - - if (port_ops && port_ops->reset_poll) { - err = port_ops->reset_poll(drive); - if (err) { - printk(KERN_ERR "%s: host reset_poll failure for %s.\n", - hwif->name, drive->name); - goto out; - } - } - - tmp = hwif->tp_ops->read_status(hwif); - - if (!OK_STAT(tmp, 0, ATA_BUSY)) { - if (time_before(jiffies, hwif->poll_timeout)) { - ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL); - /* continue polling */ - return ide_started; - } - printk(KERN_ERR "%s: reset timed-out, status=0x%02x\n", - hwif->name, tmp); - drive->failures++; - err = -EIO; - } else { - tmp = ide_read_error(drive); - - if (tmp == 1) { - printk(KERN_INFO "%s: reset: success\n", hwif->name); - drive->failures = 0; - } else { - ide_reset_report_error(hwif, tmp); - drive->failures++; - err = -EIO; - } - } -out: - hwif->polling = 0; /* done polling */ - ide_complete_drive_reset(drive, err); - return ide_stopped; -} - -static void ide_disk_pre_reset(ide_drive_t *drive) -{ - int legacy = (drive->id[ATA_ID_CFS_ENABLE_2] & 0x0400) ? 0 : 1; - - drive->special.all = 0; - drive->special.b.set_geometry = legacy; - drive->special.b.recalibrate = legacy; - - drive->mult_count = 0; - drive->dev_flags &= ~IDE_DFLAG_PARKED; - - if ((drive->dev_flags & IDE_DFLAG_KEEP_SETTINGS) == 0 && - (drive->dev_flags & IDE_DFLAG_USING_DMA) == 0) - drive->mult_req = 0; - - if (drive->mult_req != drive->mult_count) - drive->special.b.set_multmode = 1; -} - -static void pre_reset(ide_drive_t *drive) -{ - const struct ide_port_ops *port_ops = drive->hwif->port_ops; - - if (drive->media == ide_disk) - ide_disk_pre_reset(drive); - else - drive->dev_flags |= IDE_DFLAG_POST_RESET; - - if (drive->dev_flags & IDE_DFLAG_USING_DMA) { - if (drive->crc_count) - ide_check_dma_crc(drive); - else - ide_dma_off(drive); - } - - if ((drive->dev_flags & IDE_DFLAG_KEEP_SETTINGS) == 0) { - if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0) { - drive->dev_flags &= ~IDE_DFLAG_UNMASK; - drive->io_32bit = 0; - } - return; - } - - if (port_ops && port_ops->pre_reset) - port_ops->pre_reset(drive); - - if (drive->current_speed != 0xff) - drive->desired_speed = drive->current_speed; - drive->current_speed = 0xff; -} - -/* - * do_reset1() attempts to recover a confused drive by resetting it. - * Unfortunately, resetting a disk drive actually resets all devices on - * the same interface, so it can really be thought of as resetting the - * interface rather than resetting the drive. - * - * ATAPI devices have their own reset mechanism which allows them to be - * individually reset without clobbering other devices on the same interface. - * - * Unfortunately, the IDE interface does not generate an interrupt to let - * us know when the reset operation has finished, so we must poll for this. - * Equally poor, though, is the fact that this may a very long time to complete, - * (up to 30 seconds worstcase). So, instead of busy-waiting here for it, - * we set a timer to poll at 50ms intervals. 
- */ -static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) -{ - ide_hwif_t *hwif = drive->hwif; - struct ide_io_ports *io_ports = &hwif->io_ports; - const struct ide_tp_ops *tp_ops = hwif->tp_ops; - const struct ide_port_ops *port_ops; - ide_drive_t *tdrive; - unsigned long flags, timeout; - int i; - DEFINE_WAIT(wait); - - spin_lock_irqsave(&hwif->lock, flags); - - /* We must not reset with running handlers */ - BUG_ON(hwif->handler != NULL); - - /* For an ATAPI device, first try an ATAPI SRST. */ - if (drive->media != ide_disk && !do_not_try_atapi) { - pre_reset(drive); - SELECT_DRIVE(drive); - udelay(20); - tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); - ndelay(400); - hwif->poll_timeout = jiffies + WAIT_WORSTCASE; - hwif->polling = 1; - __ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, NULL); - spin_unlock_irqrestore(&hwif->lock, flags); - return ide_started; - } - - /* We must not disturb devices in the IDE_DFLAG_PARKED state. */ - do { - unsigned long now; - - prepare_to_wait(&ide_park_wq, &wait, TASK_UNINTERRUPTIBLE); - timeout = jiffies; - ide_port_for_each_present_dev(i, tdrive, hwif) { - if ((tdrive->dev_flags & IDE_DFLAG_PARKED) && - time_after(tdrive->sleep, timeout)) - timeout = tdrive->sleep; - } - - now = jiffies; - if (time_before_eq(timeout, now)) - break; - - spin_unlock_irqrestore(&hwif->lock, flags); - timeout = schedule_timeout_uninterruptible(timeout - now); - spin_lock_irqsave(&hwif->lock, flags); - } while (timeout); - finish_wait(&ide_park_wq, &wait); - - /* - * First, reset any device state data we were maintaining - * for any of the drives on this interface. - */ - ide_port_for_each_dev(i, tdrive, hwif) - pre_reset(tdrive); - - if (io_ports->ctl_addr == 0) { - spin_unlock_irqrestore(&hwif->lock, flags); - ide_complete_drive_reset(drive, -ENXIO); - return ide_stopped; - } - - /* - * Note that we also set nIEN while resetting the device, - * to mask unwanted interrupts from the interface during the reset. - * However, due to the design of PC hardware, this will cause an - * immediate interrupt due to the edge transition it produces. - * This single interrupt gives us a "fast poll" for drives that - * recover from reset very quickly, saving us the first 50ms wait time. - * - * TODO: add ->softreset method and stop abusing ->set_irq - */ - /* set SRST and nIEN */ - tp_ops->set_irq(hwif, 4); - /* more than enough time */ - udelay(10); - /* clear SRST, leave nIEN (unless device is on the quirk list) */ - tp_ops->set_irq(hwif, drive->quirk_list == 2); - /* more than enough time */ - udelay(10); - hwif->poll_timeout = jiffies + WAIT_WORSTCASE; - hwif->polling = 1; - __ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL); - - /* - * Some weird controller like resetting themselves to a strange - * state when the disks are reset this way. At least, the Winbond - * 553 documentation says that - */ - port_ops = hwif->port_ops; - if (port_ops && port_ops->resetproc) - port_ops->resetproc(drive); - - spin_unlock_irqrestore(&hwif->lock, flags); - return ide_started; -} - -/* - * ide_do_reset() is the entry point to the drive/interface reset code. - */ - -ide_startstop_t ide_do_reset(ide_drive_t *drive) -{ - return do_reset1(drive, 0); -} -EXPORT_SYMBOL(ide_do_reset); - /* * ide_wait_not_busy() waits for the currently selected device on the hwif * to report a non-busy status, see comments in ide_probe_port(). 
diff --git a/include/linux/ide.h b/include/linux/ide.h index 323c3710fbf4..0c87ed52a875 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1146,11 +1146,14 @@ int generic_ide_ioctl(ide_drive_t *, struct block_device *, unsigned, unsigned l extern int ide_vlb_clk; extern int ide_pci_clk; -extern int ide_end_request (ide_drive_t *drive, int uptodate, int nrsecs); -int ide_end_dequeued_request(ide_drive_t *drive, struct request *rq, - int uptodate, int nr_sectors); - -extern void ide_set_handler (ide_drive_t *drive, ide_handler_t *handler, unsigned int timeout, ide_expiry_t *expiry); +int ide_end_request(ide_drive_t *, int, int); +int ide_end_dequeued_request(ide_drive_t *, struct request *, int, int); +void ide_kill_rq(ide_drive_t *, struct request *); + +void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int, + ide_expiry_t *); +void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int, + ide_expiry_t *); void ide_execute_command(ide_drive_t *, u8, ide_handler_t *, unsigned int, ide_expiry_t *); -- cgit v1.2.3-71-gd317 From 2467922a560bb7e6eb4635435760ad0a2197ffcc Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:52 +0100 Subject: ide: remove no longer needed IDE_HFLAG[_FORCE]_LEGACY_IRQS There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/atiixp.c | 3 +-- drivers/ide/ide-pci-generic.c | 4 +--- drivers/ide/piix.c | 11 +---------- drivers/ide/serverworks.c | 9 ++------- drivers/ide/sis5513.c | 2 +- drivers/ide/slc90e66.c | 1 - drivers/ide/via82cxxx.c | 10 ---------- include/linux/ide.h | 4 ---- 8 files changed, 6 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/atiixp.c b/drivers/ide/atiixp.c index ecd1e62ca91a..923cbfe259d3 100644 --- a/drivers/ide/atiixp.c +++ b/drivers/ide/atiixp.c @@ -142,7 +142,6 @@ static const struct ide_port_info atiixp_pci_info[] __devinitdata = { .name = DRV_NAME, .enablebits = {{0x48,0x01,0x00}, {0x48,0x08,0x00}}, .port_ops = &atiixp_port_ops, - .host_flags = IDE_HFLAG_LEGACY_IRQS, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA5, @@ -151,7 +150,7 @@ static const struct ide_port_info atiixp_pci_info[] __devinitdata = { .name = DRV_NAME, .enablebits = {{0x48,0x01,0x00}, {0x00,0x00,0x00}}, .port_ops = &atiixp_port_ops, - .host_flags = IDE_HFLAG_SINGLE | IDE_HFLAG_LEGACY_IRQS, + .host_flags = IDE_HFLAG_SINGLE, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA5, diff --git a/drivers/ide/ide-pci-generic.c b/drivers/ide/ide-pci-generic.c index bddae2b329a0..61111fd27130 100644 --- a/drivers/ide/ide-pci-generic.c +++ b/drivers/ide/ide-pci-generic.c @@ -33,8 +33,6 @@ static int ide_generic_all; /* Set to claim all devices */ module_param_named(all_generic_ide, ide_generic_all, bool, 0444); MODULE_PARM_DESC(all_generic_ide, "IDE generic will claim all unknown PCI IDE storage controllers."); -#define IDE_HFLAGS_UMC (IDE_HFLAG_NO_DMA | IDE_HFLAG_FORCE_LEGACY_IRQS) - #define DECLARE_GENERIC_PCI_DEV(extra_flags) \ { \ .name = DRV_NAME, \ @@ -61,7 +59,7 @@ static const struct ide_port_info generic_chipsets[] __devinitdata = { /* 2: SAMURAI / HT6565 / HINT_IDE */ DECLARE_GENERIC_PCI_DEV(0), /* 3: UM8673F / UM8886A / UM8886BF */ - DECLARE_GENERIC_PCI_DEV(IDE_HFLAGS_UMC), + DECLARE_GENERIC_PCI_DEV(IDE_HFLAG_NO_DMA), /* 4: VIA_IDE / OPTI621V / Piccolo010{2,3,5} */ DECLARE_GENERIC_PCI_DEV(IDE_HFLAG_NO_AUTODMA), diff --git a/drivers/ide/piix.c b/drivers/ide/piix.c index 
f1e2e4ef0d71..42c2e3522d74 100644 --- a/drivers/ide/piix.c +++ b/drivers/ide/piix.c @@ -318,19 +318,12 @@ static const struct ide_port_ops ich_port_ops = { .cable_detect = piix_cable_detect, }; -#ifndef CONFIG_IA64 - #define IDE_HFLAGS_PIIX IDE_HFLAG_LEGACY_IRQS -#else - #define IDE_HFLAGS_PIIX 0 -#endif - #define DECLARE_PIIX_DEV(udma) \ { \ .name = DRV_NAME, \ .init_hwif = init_hwif_piix, \ .enablebits = {{0x41,0x80,0x80}, {0x43,0x80,0x80}}, \ .port_ops = &piix_port_ops, \ - .host_flags = IDE_HFLAGS_PIIX, \ .pio_mask = ATA_PIO4, \ .swdma_mask = ATA_SWDMA2_ONLY, \ .mwdma_mask = ATA_MWDMA12_ONLY, \ @@ -344,7 +337,6 @@ static const struct ide_port_ops ich_port_ops = { .init_hwif = init_hwif_piix, \ .enablebits = {{0x41,0x80,0x80}, {0x43,0x80,0x80}}, \ .port_ops = &ich_port_ops, \ - .host_flags = IDE_HFLAGS_PIIX, \ .pio_mask = ATA_PIO4, \ .swdma_mask = ATA_SWDMA2_ONLY, \ .mwdma_mask = ATA_MWDMA12_ONLY, \ @@ -360,8 +352,7 @@ static const struct ide_port_info piix_pci_info[] __devinitdata = { */ .name = DRV_NAME, .enablebits = {{0x6d,0xc0,0x80}, {0x6d,0xc0,0xc0}}, - .host_flags = IDE_HFLAG_ISA_PORTS | IDE_HFLAG_NO_DMA | - IDE_HFLAGS_PIIX, + .host_flags = IDE_HFLAG_ISA_PORTS | IDE_HFLAG_NO_DMA, .pio_mask = ATA_PIO4, /* This is a painful system best to let it self tune for now */ }, diff --git a/drivers/ide/serverworks.c b/drivers/ide/serverworks.c index 382102ba467b..14718e73991e 100644 --- a/drivers/ide/serverworks.c +++ b/drivers/ide/serverworks.c @@ -353,14 +353,11 @@ static const struct ide_port_ops svwks_port_ops = { .cable_detect = svwks_cable_detect, }; -#define IDE_HFLAGS_SVWKS IDE_HFLAG_LEGACY_IRQS - static const struct ide_port_info serverworks_chipsets[] __devinitdata = { { /* 0: OSB4 */ .name = DRV_NAME, .init_chipset = init_chipset_svwks, .port_ops = &osb4_port_ops, - .host_flags = IDE_HFLAGS_SVWKS, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = 0x00, /* UDMA is problematic on OSB4 */ @@ -369,7 +366,6 @@ static const struct ide_port_info serverworks_chipsets[] __devinitdata = { .name = DRV_NAME, .init_chipset = init_chipset_svwks, .port_ops = &svwks_port_ops, - .host_flags = IDE_HFLAGS_SVWKS, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA5, @@ -378,7 +374,6 @@ static const struct ide_port_info serverworks_chipsets[] __devinitdata = { .name = DRV_NAME, .init_chipset = init_chipset_svwks, .port_ops = &svwks_port_ops, - .host_flags = IDE_HFLAGS_SVWKS, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA5, @@ -387,7 +382,7 @@ static const struct ide_port_info serverworks_chipsets[] __devinitdata = { .name = DRV_NAME, .init_chipset = init_chipset_svwks, .port_ops = &svwks_port_ops, - .host_flags = IDE_HFLAGS_SVWKS | IDE_HFLAG_SINGLE, + .host_flags = IDE_HFLAG_SINGLE, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA5, @@ -396,7 +391,7 @@ static const struct ide_port_info serverworks_chipsets[] __devinitdata = { .name = DRV_NAME, .init_chipset = init_chipset_svwks, .port_ops = &svwks_port_ops, - .host_flags = IDE_HFLAGS_SVWKS | IDE_HFLAG_SINGLE, + .host_flags = IDE_HFLAG_SINGLE, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA5, diff --git a/drivers/ide/sis5513.c b/drivers/ide/sis5513.c index 9ec1a4a4432c..d2d54aaea13a 100644 --- a/drivers/ide/sis5513.c +++ b/drivers/ide/sis5513.c @@ -563,7 +563,7 @@ static const struct ide_port_info sis5513_chipset __devinitdata = { .name = DRV_NAME, .init_chipset = init_chipset_sis5513, .enablebits = { {0x4a, 0x02, 0x02}, {0x4a, 0x04, 0x04} }, - .host_flags = 
IDE_HFLAG_LEGACY_IRQS | IDE_HFLAG_NO_AUTODMA, + .host_flags = IDE_HFLAG_NO_AUTODMA, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, }; diff --git a/drivers/ide/slc90e66.c b/drivers/ide/slc90e66.c index 40b4b94a4288..f55d7d6313e8 100644 --- a/drivers/ide/slc90e66.c +++ b/drivers/ide/slc90e66.c @@ -136,7 +136,6 @@ static const struct ide_port_info slc90e66_chipset __devinitdata = { .name = DRV_NAME, .enablebits = { {0x41, 0x80, 0x80}, {0x43, 0x80, 0x80} }, .port_ops = &slc90e66_port_ops, - .host_flags = IDE_HFLAG_LEGACY_IRQS, .pio_mask = ATA_PIO4, .swdma_mask = ATA_SWDMA2_ONLY, .mwdma_mask = ATA_MWDMA12_ONLY, diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c index 6092fe3f409d..a41eab5cb5df 100644 --- a/drivers/ide/via82cxxx.c +++ b/drivers/ide/via82cxxx.c @@ -443,16 +443,6 @@ static int __devinit via_init_one(struct pci_dev *dev, const struct pci_device_i if ((via_config->flags & VIA_NO_UNMASK) == 0) d.host_flags |= IDE_HFLAG_UNMASK_IRQS; -#ifdef CONFIG_PPC_CHRP - if (machine_is(chrp) && _chrp_type == _CHRP_Pegasos) - d.host_flags |= IDE_HFLAG_FORCE_LEGACY_IRQS; -#endif - -#ifdef CONFIG_AMIGAONE - if (machine_is(amigaone)) - d.host_flags |= IDE_HFLAG_FORCE_LEGACY_IRQS; -#endif - d.udma_mask = via_config->udma_mask; vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); diff --git a/include/linux/ide.h b/include/linux/ide.h index 0c87ed52a875..bfd07b866b6a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1349,10 +1349,6 @@ enum { IDE_HFLAG_ERROR_STOPS_FIFO = (1 << 19), /* serialize ports */ IDE_HFLAG_SERIALIZE = (1 << 20), - /* use legacy IRQs */ - IDE_HFLAG_LEGACY_IRQS = (1 << 21), - /* force use of legacy IRQs */ - IDE_HFLAG_FORCE_LEGACY_IRQS = (1 << 22), /* host is TRM290 */ IDE_HFLAG_TRM290 = (1 << 23), /* use 32-bit I/O ops */ -- cgit v1.2.3-71-gd317 From 8b07ed26f8eb73d4f55a9d852712cd588c45ff51 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:52 +0100 Subject: ide: remove no longer needed IRQ fallback code from hwif_init() Then remove no longer used __ide_default_irq(). Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 30 ++++-------------------------- include/linux/ide.h | 15 --------------- 2 files changed, 4 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 80967307f2bb..ebc328c2e7ee 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1070,14 +1070,9 @@ static void drive_release_dev (struct device *dev) static int hwif_init(ide_hwif_t *hwif) { - int old_irq; - if (!hwif->irq) { - hwif->irq = __ide_default_irq(hwif->io_ports.data_addr); - if (!hwif->irq) { - printk(KERN_ERR "%s: disabled, no IRQ\n", hwif->name); - return 0; - } + printk(KERN_ERR "%s: disabled, no IRQ\n", hwif->name); + return 0; } if (register_blkdev(hwif->major, hwif->name)) @@ -1095,29 +1090,12 @@ static int hwif_init(ide_hwif_t *hwif) sg_init_table(hwif->sg_table, hwif->sg_max_nents); - if (init_irq(hwif) == 0) - goto done; - - old_irq = hwif->irq; - /* - * It failed to initialise. Find the default IRQ for - * this port and try that. 
- */ - hwif->irq = __ide_default_irq(hwif->io_ports.data_addr); - if (!hwif->irq) { - printk(KERN_ERR "%s: disabled, unable to get IRQ %d\n", - hwif->name, old_irq); - goto out; - } if (init_irq(hwif)) { - printk(KERN_ERR "%s: probed IRQ %d and default IRQ %d failed\n", - hwif->name, old_irq, hwif->irq); + printk(KERN_ERR "%s: disabled, unable to get IRQ %d\n", + hwif->name, hwif->irq); goto out; } - printk(KERN_WARNING "%s: probed IRQ %d failed, using default\n", - hwif->name, hwif->irq); -done: blk_register_region(MKDEV(hwif->major, 0), MAX_DRIVES << PARTN_BITS, THIS_MODULE, ata_probe, ata_lock, hwif); return 1; diff --git a/include/linux/ide.h b/include/linux/ide.h index bfd07b866b6a..31e492c7bdef 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -193,21 +193,6 @@ static inline void ide_std_init_ports(hw_regs_t *hw, hw->io_ports.ctl_addr = ctl_addr; } -/* for IDE PCI controllers in legacy mode, temporary */ -static inline int __ide_default_irq(unsigned long base) -{ - switch (base) { -#ifdef CONFIG_IA64 - case 0x1f0: return isa_irq_to_vector(14); - case 0x170: return isa_irq_to_vector(15); -#else - case 0x1f0: return 14; - case 0x170: return 15; -#endif - } - return 0; -} - #if defined(CONFIG_ARM) || defined(CONFIG_FRV) || defined(CONFIG_M68K) || \ defined(CONFIG_MIPS) || defined(CONFIG_MN10300) || defined(CONFIG_PARISC) \ || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || defined(CONFIG_SPARC64) -- cgit v1.2.3-71-gd317 From 2ed0ef543ae3f3ea4f8bd0433fb1fed22625a309 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:53 +0100 Subject: ide: fix ->init_chipset method to return 'int' value * Return 0 instead of dev->irq in ->init_chipset implementations. * Fix ->init_chipset method to return 'int' value instead of 'unsigned int' one. This fixes ->init_chipset handling for host drivers (cs5530, hpt366 and pdc202xx_new) for which it is possible for this method to fail. 
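For illustration only, a minimal sketch of the convention this change establishes; the callback name and its error check below are hypothetical and do not correspond to any of the converted drivers. An ->init_chipset implementation that can fail now returns a negative value, and success is reported as 0 rather than as dev->irq.

/* hypothetical PCI IDE host callback; assumes <linux/pci.h> and <linux/ide.h> */
static int init_chipset_example(struct pci_dev *dev)
{
	/* a failure path reports a negative (errno-style) value ... */
	if (pci_resource_start(dev, 4) == 0)
		return -ENODEV;

	/* ... chipset register programming would go here ... */

	/* ... and success is now 0, no longer dev->irq */
	return 0;
}
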
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/aec62xx.c | 4 ++-- drivers/ide/alim15x3.c | 2 +- drivers/ide/amd74xx.c | 4 ++-- drivers/ide/cmd64x.c | 2 +- drivers/ide/cs5530.c | 2 +- drivers/ide/delkin_cb.c | 2 +- drivers/ide/hpt366.c | 4 ++-- drivers/ide/it821x.c | 2 +- drivers/ide/pdc202xx_new.c | 4 ++-- drivers/ide/pdc202xx_old.c | 4 ++-- drivers/ide/piix.c | 2 +- drivers/ide/serverworks.c | 4 ++-- drivers/ide/setup-pci.c | 2 +- drivers/ide/siimage.c | 2 +- drivers/ide/sis5513.c | 2 +- drivers/ide/sl82c105.c | 4 ++-- drivers/ide/via82cxxx.c | 2 +- include/linux/ide.h | 4 ++-- 18 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/aec62xx.c b/drivers/ide/aec62xx.c index 4485b9c6f0e6..878f8ec6dbe1 100644 --- a/drivers/ide/aec62xx.c +++ b/drivers/ide/aec62xx.c @@ -139,7 +139,7 @@ static void aec_set_pio_mode(ide_drive_t *drive, const u8 pio) drive->hwif->port_ops->set_dma_mode(drive, pio + XFER_PIO_0); } -static unsigned int init_chipset_aec62xx(struct pci_dev *dev) +static int init_chipset_aec62xx(struct pci_dev *dev) { /* These are necessary to get AEC6280 Macintosh cards to work */ if ((dev->device == PCI_DEVICE_ID_ARTOP_ATP865) || @@ -156,7 +156,7 @@ static unsigned int init_chipset_aec62xx(struct pci_dev *dev) pci_write_config_byte(dev, 0x4a, reg4ah | 0x80); } - return dev->irq; + return 0; } static u8 atp86x_cable_detect(ide_hwif_t *hwif) diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 66f43083408b..d3513b6b8530 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -212,7 +212,7 @@ static int ali15x3_dma_setup(ide_drive_t *drive) * appropriate also sets up the 1533 southbridge. */ -static unsigned int init_chipset_ali15x3(struct pci_dev *dev) +static int init_chipset_ali15x3(struct pci_dev *dev) { unsigned long flags; u8 tmpbyte; diff --git a/drivers/ide/amd74xx.c b/drivers/ide/amd74xx.c index 0b51921e63e6..628cd2e5fed8 100644 --- a/drivers/ide/amd74xx.c +++ b/drivers/ide/amd74xx.c @@ -140,7 +140,7 @@ static void amd7411_cable_detect(struct pci_dev *dev) * The initialization callback. Initialize drive independent registers. */ -static unsigned int init_chipset_amd74xx(struct pci_dev *dev) +static int init_chipset_amd74xx(struct pci_dev *dev) { u8 t = 0, offset = amd_offset(dev); @@ -172,7 +172,7 @@ static unsigned int init_chipset_amd74xx(struct pci_dev *dev) t |= 0xf0; pci_write_config_byte(dev, AMD_IDE_CONFIG + offset, t); - return dev->irq; + return 0; } static u8 amd_cable_detect(ide_hwif_t *hwif) diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index 2f9688d87ecd..aeee036b1503 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -333,7 +333,7 @@ static int cmd646_1_dma_end(ide_drive_t *drive) return (dma_stat & 7) != 4; } -static unsigned int init_chipset_cmd64x(struct pci_dev *dev) +static int init_chipset_cmd64x(struct pci_dev *dev) { u8 mrdmode = 0; diff --git a/drivers/ide/cs5530.c b/drivers/ide/cs5530.c index d8ede85fe17f..8e8b35a89901 100644 --- a/drivers/ide/cs5530.c +++ b/drivers/ide/cs5530.c @@ -135,7 +135,7 @@ static void cs5530_set_dma_mode(ide_drive_t *drive, const u8 mode) * Initialize the cs5530 bridge for reliable IDE DMA operation. 
*/ -static unsigned int init_chipset_cs5530(struct pci_dev *dev) +static int init_chipset_cs5530(struct pci_dev *dev) { struct pci_dev *master_0 = NULL, *cs5530_0 = NULL; diff --git a/drivers/ide/delkin_cb.c b/drivers/ide/delkin_cb.c index 8f1b2d9f0513..bacb1194c9c9 100644 --- a/drivers/ide/delkin_cb.c +++ b/drivers/ide/delkin_cb.c @@ -46,7 +46,7 @@ static const struct ide_port_ops delkin_cb_port_ops = { .quirkproc = ide_undecoded_slave, }; -static unsigned int delkin_cb_init_chipset(struct pci_dev *dev) +static int delkin_cb_init_chipset(struct pci_dev *dev) { unsigned long base = pci_resource_start(dev, 0); int i; diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 3eb9b5c63a0f..d3b3e824f445 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -995,7 +995,7 @@ static void hpt3xx_disable_fast_irq(struct pci_dev *dev, u8 mcr_addr) pci_write_config_byte(dev, mcr_addr + 1, new_mcr); } -static unsigned int init_chipset_hpt366(struct pci_dev *dev) +static int init_chipset_hpt366(struct pci_dev *dev) { unsigned long io_base = pci_resource_start(dev, 4); struct hpt_info *info = hpt3xx_get_info(&dev->dev); @@ -1237,7 +1237,7 @@ static unsigned int init_chipset_hpt366(struct pci_dev *dev) hpt3xx_disable_fast_irq(dev, 0x50); hpt3xx_disable_fast_irq(dev, 0x54); - return dev->irq; + return 0; } static u8 hpt3xx_cable_detect(ide_hwif_t *hwif) diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 13b8153112ed..6b9fc950b4af 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -603,7 +603,7 @@ static void it8212_disable_raid(struct pci_dev *dev) pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x20); } -static unsigned int init_chipset_it821x(struct pci_dev *dev) +static int init_chipset_it821x(struct pci_dev *dev) { u8 conf; static char *mode[2] = { "pass through", "smart" }; diff --git a/drivers/ide/pdc202xx_new.c b/drivers/ide/pdc202xx_new.c index f21290c4b447..b68906c3c17e 100644 --- a/drivers/ide/pdc202xx_new.c +++ b/drivers/ide/pdc202xx_new.c @@ -325,7 +325,7 @@ static void apple_kiwi_init(struct pci_dev *pdev) } #endif /* CONFIG_PPC_PMAC */ -static unsigned int init_chipset_pdcnew(struct pci_dev *dev) +static int init_chipset_pdcnew(struct pci_dev *dev) { const char *name = DRV_NAME; unsigned long dma_base = pci_resource_start(dev, 4); @@ -444,7 +444,7 @@ static unsigned int init_chipset_pdcnew(struct pci_dev *dev) #endif out: - return dev->irq; + return 0; } static struct pci_dev * __devinit pdc20270_get_dev2(struct pci_dev *dev) diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index 97193323aebf..cba66ebce4e3 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -264,7 +264,7 @@ static void pdc202xx_dma_timeout(ide_drive_t *drive) ide_dma_timeout(drive); } -static unsigned int init_chipset_pdc202xx(struct pci_dev *dev) +static int init_chipset_pdc202xx(struct pci_dev *dev) { unsigned long dmabase = pci_resource_start(dev, 4); u8 udma_speed_flag = 0, primary_mode = 0, secondary_mode = 0; @@ -290,7 +290,7 @@ static unsigned int init_chipset_pdc202xx(struct pci_dev *dev) printk("%sACTIVE\n", (inb(dmabase | 0x1f) & 1) ? "" : "IN"); } out: - return dev->irq; + return 0; } static void __devinit pdc202ata4_fixup_irq(struct pci_dev *dev, diff --git a/drivers/ide/piix.c b/drivers/ide/piix.c index 42c2e3522d74..2aa699933064 100644 --- a/drivers/ide/piix.c +++ b/drivers/ide/piix.c @@ -204,7 +204,7 @@ static void piix_set_dma_mode(ide_drive_t *drive, const u8 speed) * out to be nice and simple. 
*/ -static unsigned int init_chipset_ich(struct pci_dev *dev) +static int init_chipset_ich(struct pci_dev *dev) { u32 extra = 0; diff --git a/drivers/ide/serverworks.c b/drivers/ide/serverworks.c index 14718e73991e..b6554ef92716 100644 --- a/drivers/ide/serverworks.c +++ b/drivers/ide/serverworks.c @@ -175,7 +175,7 @@ static void svwks_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x54, ultra_enable); } -static unsigned int init_chipset_svwks(struct pci_dev *dev) +static int init_chipset_svwks(struct pci_dev *dev) { unsigned int reg; u8 btr; @@ -270,7 +270,7 @@ static unsigned int init_chipset_svwks(struct pci_dev *dev) pci_write_config_byte(dev, 0x5A, btr); } - return dev->irq; + return 0; } static u8 ata66_svwks_svwks(ide_hwif_t *hwif) diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 79e3244691ec..75e3beca86f0 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -524,7 +524,7 @@ static int do_ide_setup_pci_device(struct pci_dev *dev, if (noisy) printk(KERN_INFO "%s %s: not 100%% native mode: will " "probe irqs later\n", d->name, pci_name(dev)); - pciirq = ret; + pciirq = 0; } else if (!pciirq && noisy) { printk(KERN_WARNING "%s %s: bad irq (%d): will probe later\n", d->name, pci_name(dev), pciirq); diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index cb2b352b876b..1811ae9cd843 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -464,7 +464,7 @@ static void sil_sata_pre_reset(ide_drive_t *drive) * to 133 MHz clocking if the system isn't already set up to do it. */ -static unsigned int init_chipset_siimage(struct pci_dev *dev) +static int init_chipset_siimage(struct pci_dev *dev) { struct ide_host *host = pci_get_drvdata(dev); void __iomem *ioaddr = host->host_priv; diff --git a/drivers/ide/sis5513.c b/drivers/ide/sis5513.c index d2d54aaea13a..afca22beaadf 100644 --- a/drivers/ide/sis5513.c +++ b/drivers/ide/sis5513.c @@ -447,7 +447,7 @@ static int __devinit sis_find_family(struct pci_dev *dev) return chipset_family; } -static unsigned int init_chipset_sis5513(struct pci_dev *dev) +static int init_chipset_sis5513(struct pci_dev *dev) { /* Make general config ops here 1/ tell IDE channels to operate in Compatibility mode only diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index 6297956507c0..dba213c51baa 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -271,7 +271,7 @@ static u8 sl82c105_bridge_revision(struct pci_dev *dev) * channel 0 here at least, but channel 1 has to be enabled by * firmware or arch code. We still set both to 16 bits mode. */ -static unsigned int init_chipset_sl82c105(struct pci_dev *dev) +static int init_chipset_sl82c105(struct pci_dev *dev) { u32 val; @@ -281,7 +281,7 @@ static unsigned int init_chipset_sl82c105(struct pci_dev *dev) val |= CTRL_P0EN | CTRL_P0F16 | CTRL_P1F16; pci_write_config_dword(dev, 0x40, val); - return dev->irq; + return 0; } static const struct ide_port_ops sl82c105_port_ops = { diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c index a41eab5cb5df..3ff7231e4858 100644 --- a/drivers/ide/via82cxxx.c +++ b/drivers/ide/via82cxxx.c @@ -267,7 +267,7 @@ static void via_cable_detect(struct via82cxxx_dev *vdev, u32 u) * and initialize its drive independent registers. 
*/ -static unsigned int init_chipset_via82cxxx(struct pci_dev *dev) +static int init_chipset_via82cxxx(struct pci_dev *dev) { struct ide_host *host = pci_get_drvdata(dev); struct via82cxxx_dev *vdev = host->host_priv; diff --git a/include/linux/ide.h b/include/linux/ide.h index 31e492c7bdef..117dd171e70b 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -851,7 +851,7 @@ struct ide_host { ide_hwif_t *ports[MAX_HOST_PORTS + 1]; unsigned int n_ports; struct device *dev[2]; - unsigned int (*init_chipset)(struct pci_dev *); + int (*init_chipset)(struct pci_dev *); irq_handler_t irq_handler; unsigned long host_flags; void *host_priv; @@ -1361,7 +1361,7 @@ enum { struct ide_port_info { char *name; - unsigned int (*init_chipset)(struct pci_dev *); + int (*init_chipset)(struct pci_dev *); void (*init_iops)(ide_hwif_t *); void (*init_hwif)(ide_hwif_t *); int (*init_dma)(ide_hwif_t *, -- cgit v1.2.3-71-gd317 From 86ccf37c6acd74cf7e4b7751ee045de19943c5a0 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:53 +0100 Subject: ide: remove pciirq argument from ide_pci_setup_ports() * Set ->irq explicitly in cs5520.c. * Remove irq argument from ide_hw_configure(). * Remove pciirq argument from ide_pci_setup_ports(). There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/cs5520.c | 3 ++- drivers/ide/setup-pci.c | 13 +++++-------- include/linux/ide.h | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c index d003bec56ff9..58fb90e5b763 100644 --- a/drivers/ide/cs5520.c +++ b/drivers/ide/cs5520.c @@ -133,7 +133,8 @@ static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_devic * do all the device setup for us */ - ide_pci_setup_ports(dev, d, 14, &hw[0], &hws[0]); + ide_pci_setup_ports(dev, d, &hw[0], &hws[0]); + hw[0].irq = 14; return ide_host_add(d, hws, NULL); } diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 75e3beca86f0..24bc884826fc 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -305,7 +305,6 @@ static int ide_pci_check_iomem(struct pci_dev *dev, const struct ide_port_info * * @dev: PCI device holding interface * @d: IDE port info * @port: port number - * @irq: PCI IRQ * @hw: hw_regs_t instance corresponding to this port * * Perform the initial set up for the hardware interface structure. This @@ -316,7 +315,7 @@ static int ide_pci_check_iomem(struct pci_dev *dev, const struct ide_port_info * */ static int ide_hw_configure(struct pci_dev *dev, const struct ide_port_info *d, - unsigned int port, int irq, hw_regs_t *hw) + unsigned int port, hw_regs_t *hw) { unsigned long ctl = 0, base = 0; @@ -344,7 +343,6 @@ static int ide_hw_configure(struct pci_dev *dev, const struct ide_port_info *d, } memset(hw, 0, sizeof(*hw)); - hw->irq = irq; hw->dev = &dev->dev; hw->chipset = d->chipset ? d->chipset : ide_pci; ide_std_init_ports(hw, base, ctl | 2); @@ -448,7 +446,6 @@ out: * ide_pci_setup_ports - configure ports/devices on PCI IDE * @dev: PCI device * @d: IDE port info - * @pciirq: IRQ line * @hw: hw_regs_t instances corresponding to this PCI IDE device * @hws: hw_regs_t pointers table to update * @@ -462,7 +459,7 @@ out: */ void ide_pci_setup_ports(struct pci_dev *dev, const struct ide_port_info *d, - int pciirq, hw_regs_t *hw, hw_regs_t **hws) + hw_regs_t *hw, hw_regs_t **hws) { int channels = (d->host_flags & IDE_HFLAG_SINGLE) ? 
1 : 2, port; u8 tmp; @@ -481,7 +478,7 @@ void ide_pci_setup_ports(struct pci_dev *dev, const struct ide_port_info *d, continue; /* port not enabled */ } - if (ide_hw_configure(dev, d, port, pciirq, hw + port)) + if (ide_hw_configure(dev, d, port, hw + port)) continue; *(hws + port) = hw + port; @@ -549,7 +546,7 @@ int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d, if (ret < 0) goto out; - ide_pci_setup_ports(dev, d, 0, &hw[0], &hws[0]); + ide_pci_setup_ports(dev, d, &hw[0], &hws[0]); host = ide_host_alloc(d, hws); if (host == NULL) { @@ -595,7 +592,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, if (ret < 0) goto out; - ide_pci_setup_ports(pdev[i], d, 0, &hw[i*2], &hws[i*2]); + ide_pci_setup_ports(pdev[i], d, &hw[i*2], &hws[i*2]); } host = ide_host_alloc(d, hws); diff --git a/include/linux/ide.h b/include/linux/ide.h index 117dd171e70b..24e265af4f1c 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1265,7 +1265,7 @@ static inline int ide_pci_is_in_compatibility_mode(struct pci_dev *dev) return 0; } -void ide_pci_setup_ports(struct pci_dev *, const struct ide_port_info *, int, +void ide_pci_setup_ports(struct pci_dev *, const struct ide_port_info *, hw_regs_t *, hw_regs_t **); void ide_setup_pci_noise(struct pci_dev *, const struct ide_port_info *); -- cgit v1.2.3-71-gd317 From 662641d98b4396b48f513726d141c5f646c08259 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:54 +0100 Subject: frv: remove * Remove superfluous includes. * Remove . Cc: David Howells Signed-off-by: Bartlomiej Zolnierkiewicz --- include/asm-frv/ide.h | 24 ------------------------ include/linux/ide.h | 4 ++-- 2 files changed, 2 insertions(+), 26 deletions(-) delete mode 100644 include/asm-frv/ide.h (limited to 'include/linux') diff --git a/include/asm-frv/ide.h b/include/asm-frv/ide.h deleted file mode 100644 index 361076611855..000000000000 --- a/include/asm-frv/ide.h +++ /dev/null @@ -1,24 +0,0 @@ -/* ide.h: FRV IDE declarations - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _ASM_IDE_H -#define _ASM_IDE_H - -#ifdef __KERNEL__ - -#include -#include -#include - -#include - -#endif /* __KERNEL__ */ -#endif /* _ASM_IDE_H */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 24e265af4f1c..047b74621dd4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -193,8 +193,8 @@ static inline void ide_std_init_ports(hw_regs_t *hw, hw->io_ports.ctl_addr = ctl_addr; } -#if defined(CONFIG_ARM) || defined(CONFIG_FRV) || defined(CONFIG_M68K) || \ - defined(CONFIG_MIPS) || defined(CONFIG_MN10300) || defined(CONFIG_PARISC) \ +#if defined(CONFIG_ARM) || defined(CONFIG_M68K) || defined(CONFIG_MIPS) || \ + defined(CONFIG_MN10300) || defined(CONFIG_PARISC) \ || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || defined(CONFIG_SPARC64) #include #else -- cgit v1.2.3-71-gd317 From d45b70ab9bbf1a46ae52972d532f9e267b8d39d9 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:54 +0100 Subject: mn10300: remove * Remove superfluous include. * Remove no longer used SUPPORT_SLOW_DATA_PORTS define. * Move defining SUPPORT_VLB_SYNC to . 
* Use __ide_mm_*() macros from (MN10300 uses only memory-mapped I/O). * Remove . While at it: * Remove superfluous SPARC64 #ifdef from . Cc: David Howells Signed-off-by: Bartlomiej Zolnierkiewicz --- include/asm-mn10300/ide.h | 39 --------------------------------------- include/linux/ide.h | 5 ++--- 2 files changed, 2 insertions(+), 42 deletions(-) delete mode 100644 include/asm-mn10300/ide.h (limited to 'include/linux') diff --git a/include/asm-mn10300/ide.h b/include/asm-mn10300/ide.h deleted file mode 100644 index 6adcdd92e83d..000000000000 --- a/include/asm-mn10300/ide.h +++ /dev/null @@ -1,39 +0,0 @@ -/* MN10300 Arch-specific IDE code - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - Derived from include/asm-i386/ide.h - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#ifndef _ASM_IDE_H -#define _ASM_IDE_H - -#ifdef __KERNEL__ - -#include - -#undef SUPPORT_SLOW_DATA_PORTS -#define SUPPORT_SLOW_DATA_PORTS 0 - -#undef SUPPORT_VLB_SYNC -#define SUPPORT_VLB_SYNC 0 - -/* - * some bits needed for parts of the IDE subsystem to compile - */ -#define __ide_mm_insw(port, addr, n) \ - insw((unsigned long) (port), (addr), (n)) -#define __ide_mm_insl(port, addr, n) \ - insl((unsigned long) (port), (addr), (n)) -#define __ide_mm_outsw(port, addr, n) \ - outsw((unsigned long) (port), (addr), (n)) -#define __ide_mm_outsl(port, addr, n) \ - outsl((unsigned long) (port), (addr), (n)) - -#endif /* __KERNEL__ */ -#endif /* _ASM_IDE_H */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 047b74621dd4..bbce9b2bdfc4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -26,7 +26,7 @@ #include #include -#if defined(CONFIG_CRIS) || defined(CONFIG_FRV) +#if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300) # define SUPPORT_VLB_SYNC 0 #else # define SUPPORT_VLB_SYNC 1 @@ -194,8 +194,7 @@ static inline void ide_std_init_ports(hw_regs_t *hw, } #if defined(CONFIG_ARM) || defined(CONFIG_M68K) || defined(CONFIG_MIPS) || \ - defined(CONFIG_MN10300) || defined(CONFIG_PARISC) \ - || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || defined(CONFIG_SPARC64) + defined(CONFIG_PARISC) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) #include #else #include -- cgit v1.2.3-71-gd317 From 552d3a99bdce8a0d7f9abe3766fb3655ef5757dc Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:58 +0100 Subject: ide: remove broken EXABYTENEST support do_identify() marks EXABYTENEST device as non-present and frees drive->id so enable_nest() has absolutely no chance of working. The code was like this since at least 2.6.12-rc2 and nobody has noticed so just remove broken EXABYTENEST support. 
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 37 ------------------------------------- include/linux/ata.h | 2 -- 2 files changed, 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 5e0c3fb3b43a..29649d09dbb8 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -465,37 +465,6 @@ static int do_probe (ide_drive_t *drive, u8 cmd) return rc; } -/* - * - */ -static void enable_nest (ide_drive_t *drive) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_tp_ops *tp_ops = hwif->tp_ops; - u8 stat; - - printk(KERN_INFO "%s: enabling %s -- ", - hwif->name, (char *)&drive->id[ATA_ID_PROD]); - - SELECT_DRIVE(drive); - msleep(50); - tp_ops->exec_command(hwif, ATA_EXABYTE_ENABLE_NEST); - - if (ide_busy_sleep(hwif, WAIT_WORSTCASE, 0)) { - printk(KERN_CONT "failed (timeout)\n"); - return; - } - - msleep(50); - - stat = tp_ops->read_status(hwif); - - if (!OK_STAT(stat, 0, BAD_STAT)) - printk(KERN_CONT "failed (status = 0x%02x)\n", stat); - else - printk(KERN_CONT "success\n"); -} - /** * probe_for_drives - upper level drive probe * @drive: drive to probe for @@ -534,7 +503,6 @@ static u8 probe_for_drive(ide_drive_t *drive) /* skip probing? */ if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0) { -retry: /* if !(success||timed-out) */ if (do_probe(drive, ATA_CMD_ID_ATA) >= 2) /* look for ATAPI device */ @@ -544,11 +512,6 @@ retry: /* drive not found */ return 0; - if (strstr(m, "E X A B Y T E N E S T")) { - enable_nest(drive); - goto retry; - } - /* identification failed? */ if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) { if (drive->media == ide_disk) { diff --git a/include/linux/ata.h b/include/linux/ata.h index 9a061accd8b8..68132c4a0e91 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -244,8 +244,6 @@ enum { ATA_CMD_MEDIA_UNLOCK = 0xDF, /* marked obsolete in the ATA/ATAPI-7 spec */ ATA_CMD_RESTORE = 0x10, - /* EXABYTE specific */ - ATA_EXABYTE_ENABLE_NEST = 0xF0, /* READ_LOG_EXT pages */ ATA_LOG_SATA_NCQ = 0x10, -- cgit v1.2.3-71-gd317 From 2ebe1d9efed5f232afc8d00901d0959c9814bce3 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 24 Mar 2009 23:22:59 +0100 Subject: ide: use try_to_identify() in ide_driveid_update() * Pass pointer to buffer for IDENTIFY data to do_identify() and try_to_identify(). * Un-static try_to_identify() and use it in ide_driveid_update(). * Rename try_to_identify() to ide_dev_read_id(). There should be no functional changes caused by this patch. 
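For illustration only (not part of the patch): after the rename the call sites share one pattern: allocate a scratch buffer, issue the IDENTIFY (do_probe() retries once), copy out the fields of interest, free the buffer. A rough sketch with error handling trimmed:

	u16 *id;
	int rc;

	id = kmalloc(SECTOR_SIZE, GFP_ATOMIC);	/* scratch buffer for IDENTIFY data */
	if (id == NULL)
		return 0;

	rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id);
	if (rc)					/* failed: try once more, as do_probe() does */
		rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id);

	if (rc == 0)				/* keep only the fields this caller needs */
		drive->id[ATA_ID_UDMA_MODES] = id[ATA_ID_UDMA_MODES];

	kfree(id);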
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-iops.c | 54 ++++--------------------------------------------- drivers/ide/ide-probe.c | 24 +++++++++++----------- include/linux/ide.h | 2 ++ 3 files changed, 18 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index affbc603987a..317c5dadd7c0 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -289,65 +289,19 @@ no_80w: int ide_driveid_update(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; - const struct ide_tp_ops *tp_ops = hwif->tp_ops; u16 *id; - unsigned long flags; - int use_altstatus = 0, rc; - u8 a, uninitialized_var(s); + int rc; id = kmalloc(SECTOR_SIZE, GFP_ATOMIC); if (id == NULL) return 0; - /* - * Re-read drive->id for possible DMA mode - * change (copied from ide-probe.c) - */ - SELECT_MASK(drive, 1); - tp_ops->set_irq(hwif, 0); - msleep(50); - - if (hwif->io_ports.ctl_addr && - (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0) { - a = tp_ops->read_altstatus(hwif); - s = tp_ops->read_status(hwif); - if ((a ^ s) & ~ATA_IDX) - /* ancient Seagate drives, broken interfaces */ - printk(KERN_INFO "%s: probing with STATUS(0x%02x) " - "instead of ALTSTATUS(0x%02x)\n", - drive->name, s, a); - else - /* use non-intrusive polling */ - use_altstatus = 1; - } - - tp_ops->exec_command(hwif, ATA_CMD_ID_ATA); - - if (ide_busy_sleep(hwif, WAIT_WORSTCASE / 2, use_altstatus)) { - rc = 1; - goto out_err; - } - - msleep(50); /* wait for IRQ and ATA_DRQ */ - - s = tp_ops->read_status(hwif); + rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id); + SELECT_MASK(drive, 0); - if (!OK_STAT(s, ATA_DRQ, BAD_R_STAT)) { - rc = 2; + if (rc) goto out_err; - } - - local_irq_save(flags); - tp_ops->input_data(drive, NULL, id, SECTOR_SIZE); - local_irq_restore(flags); - - (void)tp_ops->read_status(hwif); /* clear drive IRQ */ - - ide_fix_driveid(id); - - SELECT_MASK(drive, 0); drive->id[ATA_ID_UDMA_MODES] = id[ATA_ID_UDMA_MODES]; drive->id[ATA_ID_MWDMA_MODES] = id[ATA_ID_MWDMA_MODES]; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 3f9faef5e50e..974067043fba 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -181,16 +181,16 @@ static void ide_classify_atapi_dev(ide_drive_t *drive) * do_identify - identify a drive * @drive: drive to identify * @cmd: command used + * @id: buffer for IDENTIFY data * * Called when we have issued a drive identify command to * read and parse the results. This function is run with * interrupts disabled. */ -static void do_identify(ide_drive_t *drive, u8 cmd) +static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) { ide_hwif_t *hwif = drive->hwif; - u16 *id = drive->id; char *m = (char *)&id[ATA_ID_PROD]; unsigned long flags; int bswap = 1; @@ -240,19 +240,19 @@ err_misc: } /** - * try_to_identify - send ATA/ATAPI identify + * ide_dev_read_id - send ATA/ATAPI IDENTIFY command * @drive: drive to identify * @cmd: command to use + * @id: buffer for IDENTIFY data * - * try_to_identify() sends an ATA(PI) IDENTIFY request to a drive - * and waits for a response. + * Sends an ATA(PI) IDENTIFY request to a drive and waits for a response. 
* * Returns: 0 device was identified * 1 device timed-out (no response to identify request) * 2 device aborted the command (refused to identify itself) */ -static int try_to_identify(ide_drive_t *drive, u8 cmd) +int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; @@ -312,7 +312,7 @@ static int try_to_identify(ide_drive_t *drive, u8 cmd) if (OK_STAT(s, ATA_DRQ, BAD_R_STAT)) { /* drive returned ID */ - do_identify(drive, cmd); + do_identify(drive, cmd, id); /* drive responded with ID */ rc = 0; /* clear drive IRQ */ @@ -378,6 +378,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) { ide_hwif_t *hwif = drive->hwif; const struct ide_tp_ops *tp_ops = hwif->tp_ops; + u16 *id = drive->id; int rc; u8 present = !!(drive->dev_flags & IDE_DFLAG_PRESENT), stat; @@ -413,11 +414,10 @@ static int do_probe (ide_drive_t *drive, u8 cmd) if (OK_STAT(stat, ATA_DRDY, ATA_BUSY) || present || cmd == ATA_CMD_ID_ATAPI) { - /* send cmd and wait */ - if ((rc = try_to_identify(drive, cmd))) { + rc = ide_dev_read_id(drive, cmd, id); + if (rc) /* failed: try again */ - rc = try_to_identify(drive,cmd); - } + rc = ide_dev_read_id(drive, cmd, id); stat = tp_ops->read_status(hwif); @@ -432,7 +432,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) msleep(50); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); (void)ide_busy_sleep(hwif, WAIT_WORSTCASE, 0); - rc = try_to_identify(drive, cmd); + rc = ide_dev_read_id(drive, cmd, id); } /* ensure drive IRQ is clear */ diff --git a/include/linux/ide.h b/include/linux/ide.h index bbce9b2bdfc4..854eba8b2ba3 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1235,6 +1235,8 @@ int ide_no_data_taskfile(ide_drive_t *, ide_task_t *); int ide_taskfile_ioctl(ide_drive_t *, unsigned int, unsigned long); +int ide_dev_read_id(ide_drive_t *, u8, u16 *); + extern int ide_driveid_update(ide_drive_t *); extern int ide_config_drive_speed(ide_drive_t *, u8); extern u8 eighty_ninty_three (ide_drive_t *); -- cgit v1.2.3-71-gd317 From 1662e3a7f076e51e3073faf9ce77157b529c475b Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 18 Mar 2009 14:28:53 -0400 Subject: USB: add quirk to avoid config and interface strings Apparently the Configuration and Interface strings aren't used as often as the Vendor, Product, and Serial strings. In at least one device (a Saitek Cyborg Gold 3D joystick), attempts to read the Configuration string cause the device to stop responding to Control requests. This patch (as1226) adds a quirks flag, telling the kernel not to read a device's Configuration or Interface strings, together with a new quirk for the offending joystick. 
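Condensed for illustration (the real hunks are in the diff below), the patch amounts to a quirk-table entry keyed on the joystick's VID:PID plus a guard on each string fetch. The helper name here is hypothetical, not from the patch:

#include <linux/usb.h>
#include <linux/usb/quirks.h>

/* Hypothetical helper: does this device tolerate requests for its
 * Configuration/Interface strings? */
static inline int example_may_fetch_intf_strings(struct usb_device *udev)
{
	return !(udev->quirks & USB_QUIRK_CONFIG_INTF_STRINGS);
}

/* Usage, mirroring the message.c hunk below:
 *	if (cp->string == NULL && example_may_fetch_intf_strings(dev))
 *		cp->string = usb_cache_string(dev, cp->desc.iConfiguration);
 */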
Reported-by: Melchior FRANZ Tested-by: Melchior FRANZ Signed-off-by: Alan Stern Cc: stable [2.6.28 and 2.6.29, nothing earlier] Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 3 ++- drivers/usb/core/quirks.c | 4 ++++ drivers/usb/core/sysfs.c | 4 +++- include/linux/usb/quirks.h | 3 +++ 4 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 49e7f56e0d7f..3922fa915ed2 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -1719,7 +1719,8 @@ free_interfaces: } kfree(new_interfaces); - if (cp->string == NULL) + if (cp->string == NULL && + !(dev->quirks & USB_QUIRK_CONFIG_INTF_STRINGS)) cp->string = usb_cache_string(dev, cp->desc.iConfiguration); /* Now that all the interfaces are set up, register them diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index c070b34b669d..ab93918d9207 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -54,6 +54,10 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0638, 0x0a13), .driver_info = USB_QUIRK_STRING_FETCH_255 }, + /* Saitek Cyborg Gold Joystick */ + { USB_DEVICE(0x06a3, 0x0006), .driver_info = + USB_QUIRK_CONFIG_INTF_STRINGS }, + /* M-Systems Flash Disk Pioneers */ { USB_DEVICE(0x08ec, 0x1000), .driver_info = USB_QUIRK_RESET_RESUME }, diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c index 4cc2456ef3be..c66789197927 100644 --- a/drivers/usb/core/sysfs.c +++ b/drivers/usb/core/sysfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "usb.h" /* Active configuration fields */ @@ -813,7 +814,8 @@ int usb_create_sysfs_intf_files(struct usb_interface *intf) if (intf->sysfs_files_created || intf->unregistering) return 0; - if (alt->string == NULL) + if (alt->string == NULL && + !(udev->quirks & USB_QUIRK_CONFIG_INTF_STRINGS)) alt->string = usb_cache_string(udev, alt->desc.iInterface); if (alt->string) retval = device_create_file(&intf->dev, &dev_attr_interface); diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 7f6c603db654..2526f3bbd273 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -16,4 +16,7 @@ /* device can't handle Set-Interface requests */ #define USB_QUIRK_NO_SET_INTF 0x00000004 +/* device can't handle its Configuration or Interface strings */ +#define USB_QUIRK_CONFIG_INTF_STRINGS 0x00000008 + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3-71-gd317 From c2344f13b59e007d782a3e591ebc551bc583a8b7 Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Sat, 24 Jan 2009 23:54:31 -0800 Subject: USB: gpio_vbus: add delayed vbus_session calls Call usb_gadget_vbus_connect() and ...disconnect() from a workqueue rather than from an irq handler, allowing msleep() calls in vbus_session. Update kerneldoc to match. 
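The refactoring follows the standard "do the minimum in the hard IRQ handler, defer the sleeping work to a workqueue" idiom. A generic, self-contained sketch of that idiom (names invented; the driver-specific version is in the diff below):

#include <linux/interrupt.h>
#include <linux/workqueue.h>

struct example_vbus {
	struct work_struct work;
	/* gpio numbers, otg_transceiver, ... */
};

static void example_vbus_work(struct work_struct *work)
{
	struct example_vbus *ev = container_of(work, struct example_vbus, work);

	/* Process context: free to msleep(), call usb_gadget_vbus_connect()
	 * or usb_gadget_vbus_disconnect(), enable regulators, etc. */
	(void)ev;
}

static irqreturn_t example_vbus_irq(int irq, void *data)
{
	struct example_vbus *ev = data;

	schedule_work(&ev->work);	/* never sleep in the handler itself */
	return IRQ_HANDLED;
}

/* In probe: INIT_WORK(&ev->work, example_vbus_work); before request_irq(). */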
[ dbrownell@users.sourceforge.net: more kerneldoc updates ] Signed-off-by: Robert Jarzmik Signed-off-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- drivers/usb/otg/gpio_vbus.c | 42 +++++++++++++++++++++++++++++++----------- include/linux/usb/gadget.h | 6 ++++-- include/linux/usb/otg.h | 4 ++++ 3 files changed, 39 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/otg/gpio_vbus.c b/drivers/usb/otg/gpio_vbus.c index 63a6036f04be..1c26c94513e9 100644 --- a/drivers/usb/otg/gpio_vbus.c +++ b/drivers/usb/otg/gpio_vbus.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -34,6 +35,7 @@ struct gpio_vbus_data { struct regulator *vbus_draw; int vbus_draw_enabled; unsigned mA; + struct work_struct work; }; @@ -76,24 +78,26 @@ static void set_vbus_draw(struct gpio_vbus_data *gpio_vbus, unsigned mA) gpio_vbus->mA = mA; } -/* VBUS change IRQ handler */ -static irqreturn_t gpio_vbus_irq(int irq, void *data) +static int is_vbus_powered(struct gpio_vbus_mach_info *pdata) { - struct platform_device *pdev = data; - struct gpio_vbus_mach_info *pdata = pdev->dev.platform_data; - struct gpio_vbus_data *gpio_vbus = platform_get_drvdata(pdev); - int gpio, vbus; + int vbus; vbus = gpio_get_value(pdata->gpio_vbus); if (pdata->gpio_vbus_inverted) vbus = !vbus; - dev_dbg(&pdev->dev, "VBUS %s (gadget: %s)\n", - vbus ? "supplied" : "inactive", - gpio_vbus->otg.gadget ? gpio_vbus->otg.gadget->name : "none"); + return vbus; +} + +static void gpio_vbus_work(struct work_struct *work) +{ + struct gpio_vbus_data *gpio_vbus = + container_of(work, struct gpio_vbus_data, work); + struct gpio_vbus_mach_info *pdata = gpio_vbus->dev->platform_data; + int gpio; if (!gpio_vbus->otg.gadget) - return IRQ_HANDLED; + return; /* Peripheral controllers which manage the pullup themselves won't have * gpio_pullup configured here. If it's configured here, we'll do what @@ -101,7 +105,7 @@ static irqreturn_t gpio_vbus_irq(int irq, void *data) * that may complicate usb_gadget_{,dis}connect() support. */ gpio = pdata->gpio_pullup; - if (vbus) { + if (is_vbus_powered(pdata)) { gpio_vbus->otg.state = OTG_STATE_B_PERIPHERAL; usb_gadget_vbus_connect(gpio_vbus->otg.gadget); @@ -121,6 +125,21 @@ static irqreturn_t gpio_vbus_irq(int irq, void *data) usb_gadget_vbus_disconnect(gpio_vbus->otg.gadget); gpio_vbus->otg.state = OTG_STATE_B_IDLE; } +} + +/* VBUS change IRQ handler */ +static irqreturn_t gpio_vbus_irq(int irq, void *data) +{ + struct platform_device *pdev = data; + struct gpio_vbus_mach_info *pdata = pdev->dev.platform_data; + struct gpio_vbus_data *gpio_vbus = platform_get_drvdata(pdev); + + dev_dbg(&pdev->dev, "VBUS %s (gadget: %s)\n", + is_vbus_powered(pdata) ? "supplied" : "inactive", + gpio_vbus->otg.gadget ? gpio_vbus->otg.gadget->name : "none"); + + if (gpio_vbus->otg.gadget) + schedule_work(&gpio_vbus->work); return IRQ_HANDLED; } @@ -257,6 +276,7 @@ static int __init gpio_vbus_probe(struct platform_device *pdev) irq, err); goto err_irq; } + INIT_WORK(&gpio_vbus->work, gpio_vbus_work); /* only active when a gadget is registered */ err = otg_set_transceiver(&gpio_vbus->otg); diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 0460a746480c..bbf45d500b6d 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -598,6 +598,7 @@ static inline int usb_gadget_clear_selfpowered(struct usb_gadget *gadget) /** * usb_gadget_vbus_connect - Notify controller that VBUS is powered * @gadget:The device which now has VBUS power. 
+ * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session starting. Common responses include @@ -636,6 +637,7 @@ static inline int usb_gadget_vbus_draw(struct usb_gadget *gadget, unsigned mA) /** * usb_gadget_vbus_disconnect - notify controller about VBUS session end * @gadget:the device whose VBUS supply is being described + * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session ending. Common responses include @@ -792,19 +794,20 @@ struct usb_gadget_driver { /** * usb_gadget_register_driver - register a gadget driver * @driver:the driver being registered + * Context: can sleep * * Call this in your gadget driver's module initialization function, * to tell the underlying usb controller driver about your driver. * The driver's bind() function will be called to bind it to a * gadget before this registration call returns. It's expected that * the bind() functions will be in init sections. - * This function must be called in a context that can sleep. */ int usb_gadget_register_driver(struct usb_gadget_driver *driver); /** * usb_gadget_unregister_driver - unregister a gadget driver * @driver:the driver being unregistered + * Context: can sleep * * Call this in your gadget driver's module cleanup function, * to tell the underlying usb controller that your driver is @@ -813,7 +816,6 @@ int usb_gadget_register_driver(struct usb_gadget_driver *driver); * to unbind() and clean up any device state, before this procedure * finally returns. It's expected that the unbind() functions * will in in exit sections, so may not be linked in some kernels. - * This function must be called in a context that can sleep. */ int usb_gadget_unregister_driver(struct usb_gadget_driver *driver); diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h index 94df4fe6c6c0..60a52576fd5c 100644 --- a/include/linux/usb/otg.h +++ b/include/linux/usb/otg.h @@ -86,6 +86,7 @@ extern int otg_set_transceiver(struct otg_transceiver *); extern struct otg_transceiver *otg_get_transceiver(void); extern void otg_put_transceiver(struct otg_transceiver *); +/* Context: can sleep */ static inline int otg_start_hnp(struct otg_transceiver *otg) { @@ -102,6 +103,8 @@ otg_set_host(struct otg_transceiver *otg, struct usb_bus *host) /* for usb peripheral controller drivers */ + +/* Context: can sleep */ static inline int otg_set_peripheral(struct otg_transceiver *otg, struct usb_gadget *periph) { @@ -114,6 +117,7 @@ otg_set_power(struct otg_transceiver *otg, unsigned mA) return otg->set_power(otg, mA); } +/* Context: can sleep */ static inline int otg_set_suspend(struct otg_transceiver *otg, int suspend) { -- cgit v1.2.3-71-gd317 From 4d6914b72966862f37de634299a80ca2a4b1829f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 29 Dec 2008 22:48:19 +0100 Subject: USB: Move definitions from usb.h to usb/ch9.h The functions: usb_endpoint_dir_in(epd) usb_endpoint_dir_out(epd) usb_endpoint_is_bulk_in(epd) usb_endpoint_is_bulk_out(epd) usb_endpoint_is_int_in(epd) usb_endpoint_is_int_out(epd) usb_endpoint_is_isoc_in(epd) usb_endpoint_is_isoc_out(epd) usb_endpoint_num(epd) usb_endpoint_type(epd) usb_endpoint_xfer_bulk(epd) usb_endpoint_xfer_control(epd) usb_endpoint_xfer_int(epd) usb_endpoint_xfer_isoc(epd) are moved from include/linux/usb.h to include/linux/usb/ch9.h. include/linux/usb/ch9.h makes more sense for these functions because they only depend on constants that are defined in this file. 
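The helpers themselves are untouched, only their home changes; a typical caller (sketched here, not taken from the patch) keeps compiling because <linux/usb.h> already pulls in <linux/usb/ch9.h>:

#include <linux/usb.h>

/* Illustrative scan of an interface's current altsetting for a bulk-IN
 * endpoint; the usb_endpoint_*() helpers are exactly those being moved. */
static int example_find_bulk_in(struct usb_interface *intf)
{
	struct usb_host_interface *alt = intf->cur_altsetting;
	int i;

	for (i = 0; i < alt->desc.bNumEndpoints; i++) {
		const struct usb_endpoint_descriptor *epd =
					&alt->endpoint[i].desc;

		if (usb_endpoint_is_bulk_in(epd))
			return usb_endpoint_num(epd);
	}
	return -1;	/* no bulk-IN endpoint on this altsetting */
}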
Signed-off-by: Julia Lawall Acked-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 180 ------------------------------------------------ include/linux/usb/ch9.h | 179 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 180 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 88079fd60235..0c05ff621192 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -643,186 +643,6 @@ static inline int usb_make_path(struct usb_device *dev, char *buf, size_t size) /*-------------------------------------------------------------------------*/ -/** - * usb_endpoint_num - get the endpoint's number - * @epd: endpoint to be checked - * - * Returns @epd's number: 0 to 15. - */ -static inline int usb_endpoint_num(const struct usb_endpoint_descriptor *epd) -{ - return epd->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; -} - -/** - * usb_endpoint_type - get the endpoint's transfer type - * @epd: endpoint to be checked - * - * Returns one of USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT} according - * to @epd's transfer type. - */ -static inline int usb_endpoint_type(const struct usb_endpoint_descriptor *epd) -{ - return epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK; -} - -/** - * usb_endpoint_dir_in - check if the endpoint has IN direction - * @epd: endpoint to be checked - * - * Returns true if the endpoint is of type IN, otherwise it returns false. - */ -static inline int usb_endpoint_dir_in(const struct usb_endpoint_descriptor *epd) -{ - return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN); -} - -/** - * usb_endpoint_dir_out - check if the endpoint has OUT direction - * @epd: endpoint to be checked - * - * Returns true if the endpoint is of type OUT, otherwise it returns false. - */ -static inline int usb_endpoint_dir_out( - const struct usb_endpoint_descriptor *epd) -{ - return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT); -} - -/** - * usb_endpoint_xfer_bulk - check if the endpoint has bulk transfer type - * @epd: endpoint to be checked - * - * Returns true if the endpoint is of type bulk, otherwise it returns false. - */ -static inline int usb_endpoint_xfer_bulk( - const struct usb_endpoint_descriptor *epd) -{ - return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == - USB_ENDPOINT_XFER_BULK); -} - -/** - * usb_endpoint_xfer_control - check if the endpoint has control transfer type - * @epd: endpoint to be checked - * - * Returns true if the endpoint is of type control, otherwise it returns false. - */ -static inline int usb_endpoint_xfer_control( - const struct usb_endpoint_descriptor *epd) -{ - return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == - USB_ENDPOINT_XFER_CONTROL); -} - -/** - * usb_endpoint_xfer_int - check if the endpoint has interrupt transfer type - * @epd: endpoint to be checked - * - * Returns true if the endpoint is of type interrupt, otherwise it returns - * false. - */ -static inline int usb_endpoint_xfer_int( - const struct usb_endpoint_descriptor *epd) -{ - return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == - USB_ENDPOINT_XFER_INT); -} - -/** - * usb_endpoint_xfer_isoc - check if the endpoint has isochronous transfer type - * @epd: endpoint to be checked - * - * Returns true if the endpoint is of type isochronous, otherwise it returns - * false. 
- */ -static inline int usb_endpoint_xfer_isoc( - const struct usb_endpoint_descriptor *epd) -{ - return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == - USB_ENDPOINT_XFER_ISOC); -} - -/** - * usb_endpoint_is_bulk_in - check if the endpoint is bulk IN - * @epd: endpoint to be checked - * - * Returns true if the endpoint has bulk transfer type and IN direction, - * otherwise it returns false. - */ -static inline int usb_endpoint_is_bulk_in( - const struct usb_endpoint_descriptor *epd) -{ - return (usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_in(epd)); -} - -/** - * usb_endpoint_is_bulk_out - check if the endpoint is bulk OUT - * @epd: endpoint to be checked - * - * Returns true if the endpoint has bulk transfer type and OUT direction, - * otherwise it returns false. - */ -static inline int usb_endpoint_is_bulk_out( - const struct usb_endpoint_descriptor *epd) -{ - return (usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_out(epd)); -} - -/** - * usb_endpoint_is_int_in - check if the endpoint is interrupt IN - * @epd: endpoint to be checked - * - * Returns true if the endpoint has interrupt transfer type and IN direction, - * otherwise it returns false. - */ -static inline int usb_endpoint_is_int_in( - const struct usb_endpoint_descriptor *epd) -{ - return (usb_endpoint_xfer_int(epd) && usb_endpoint_dir_in(epd)); -} - -/** - * usb_endpoint_is_int_out - check if the endpoint is interrupt OUT - * @epd: endpoint to be checked - * - * Returns true if the endpoint has interrupt transfer type and OUT direction, - * otherwise it returns false. - */ -static inline int usb_endpoint_is_int_out( - const struct usb_endpoint_descriptor *epd) -{ - return (usb_endpoint_xfer_int(epd) && usb_endpoint_dir_out(epd)); -} - -/** - * usb_endpoint_is_isoc_in - check if the endpoint is isochronous IN - * @epd: endpoint to be checked - * - * Returns true if the endpoint has isochronous transfer type and IN direction, - * otherwise it returns false. - */ -static inline int usb_endpoint_is_isoc_in( - const struct usb_endpoint_descriptor *epd) -{ - return (usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_in(epd)); -} - -/** - * usb_endpoint_is_isoc_out - check if the endpoint is isochronous OUT - * @epd: endpoint to be checked - * - * Returns true if the endpoint has isochronous transfer type and OUT direction, - * otherwise it returns false. - */ -static inline int usb_endpoint_is_isoc_out( - const struct usb_endpoint_descriptor *epd) -{ - return (usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_out(epd)); -} - -/*-------------------------------------------------------------------------*/ - #define USB_DEVICE_ID_MATCH_DEVICE \ (USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT) #define USB_DEVICE_ID_MATCH_DEV_RANGE \ diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h index 9b42baed3900..fa777db7f7eb 100644 --- a/include/linux/usb/ch9.h +++ b/include/linux/usb/ch9.h @@ -353,6 +353,185 @@ struct usb_endpoint_descriptor { #define USB_ENDPOINT_XFER_INT 3 #define USB_ENDPOINT_MAX_ADJUSTABLE 0x80 +/*-------------------------------------------------------------------------*/ + +/** + * usb_endpoint_num - get the endpoint's number + * @epd: endpoint to be checked + * + * Returns @epd's number: 0 to 15. 
+ */ +static inline int usb_endpoint_num(const struct usb_endpoint_descriptor *epd) +{ + return epd->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; +} + +/** + * usb_endpoint_type - get the endpoint's transfer type + * @epd: endpoint to be checked + * + * Returns one of USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT} according + * to @epd's transfer type. + */ +static inline int usb_endpoint_type(const struct usb_endpoint_descriptor *epd) +{ + return epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK; +} + +/** + * usb_endpoint_dir_in - check if the endpoint has IN direction + * @epd: endpoint to be checked + * + * Returns true if the endpoint is of type IN, otherwise it returns false. + */ +static inline int usb_endpoint_dir_in(const struct usb_endpoint_descriptor *epd) +{ + return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN); +} + +/** + * usb_endpoint_dir_out - check if the endpoint has OUT direction + * @epd: endpoint to be checked + * + * Returns true if the endpoint is of type OUT, otherwise it returns false. + */ +static inline int usb_endpoint_dir_out( + const struct usb_endpoint_descriptor *epd) +{ + return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT); +} + +/** + * usb_endpoint_xfer_bulk - check if the endpoint has bulk transfer type + * @epd: endpoint to be checked + * + * Returns true if the endpoint is of type bulk, otherwise it returns false. + */ +static inline int usb_endpoint_xfer_bulk( + const struct usb_endpoint_descriptor *epd) +{ + return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == + USB_ENDPOINT_XFER_BULK); +} + +/** + * usb_endpoint_xfer_control - check if the endpoint has control transfer type + * @epd: endpoint to be checked + * + * Returns true if the endpoint is of type control, otherwise it returns false. + */ +static inline int usb_endpoint_xfer_control( + const struct usb_endpoint_descriptor *epd) +{ + return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == + USB_ENDPOINT_XFER_CONTROL); +} + +/** + * usb_endpoint_xfer_int - check if the endpoint has interrupt transfer type + * @epd: endpoint to be checked + * + * Returns true if the endpoint is of type interrupt, otherwise it returns + * false. + */ +static inline int usb_endpoint_xfer_int( + const struct usb_endpoint_descriptor *epd) +{ + return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == + USB_ENDPOINT_XFER_INT); +} + +/** + * usb_endpoint_xfer_isoc - check if the endpoint has isochronous transfer type + * @epd: endpoint to be checked + * + * Returns true if the endpoint is of type isochronous, otherwise it returns + * false. + */ +static inline int usb_endpoint_xfer_isoc( + const struct usb_endpoint_descriptor *epd) +{ + return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == + USB_ENDPOINT_XFER_ISOC); +} + +/** + * usb_endpoint_is_bulk_in - check if the endpoint is bulk IN + * @epd: endpoint to be checked + * + * Returns true if the endpoint has bulk transfer type and IN direction, + * otherwise it returns false. + */ +static inline int usb_endpoint_is_bulk_in( + const struct usb_endpoint_descriptor *epd) +{ + return (usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_in(epd)); +} + +/** + * usb_endpoint_is_bulk_out - check if the endpoint is bulk OUT + * @epd: endpoint to be checked + * + * Returns true if the endpoint has bulk transfer type and OUT direction, + * otherwise it returns false. 
+ */ +static inline int usb_endpoint_is_bulk_out( + const struct usb_endpoint_descriptor *epd) +{ + return (usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_out(epd)); +} + +/** + * usb_endpoint_is_int_in - check if the endpoint is interrupt IN + * @epd: endpoint to be checked + * + * Returns true if the endpoint has interrupt transfer type and IN direction, + * otherwise it returns false. + */ +static inline int usb_endpoint_is_int_in( + const struct usb_endpoint_descriptor *epd) +{ + return (usb_endpoint_xfer_int(epd) && usb_endpoint_dir_in(epd)); +} + +/** + * usb_endpoint_is_int_out - check if the endpoint is interrupt OUT + * @epd: endpoint to be checked + * + * Returns true if the endpoint has interrupt transfer type and OUT direction, + * otherwise it returns false. + */ +static inline int usb_endpoint_is_int_out( + const struct usb_endpoint_descriptor *epd) +{ + return (usb_endpoint_xfer_int(epd) && usb_endpoint_dir_out(epd)); +} + +/** + * usb_endpoint_is_isoc_in - check if the endpoint is isochronous IN + * @epd: endpoint to be checked + * + * Returns true if the endpoint has isochronous transfer type and IN direction, + * otherwise it returns false. + */ +static inline int usb_endpoint_is_isoc_in( + const struct usb_endpoint_descriptor *epd) +{ + return (usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_in(epd)); +} + +/** + * usb_endpoint_is_isoc_out - check if the endpoint is isochronous OUT + * @epd: endpoint to be checked + * + * Returns true if the endpoint has isochronous transfer type and OUT direction, + * otherwise it returns false. + */ +static inline int usb_endpoint_is_isoc_out( + const struct usb_endpoint_descriptor *epd) +{ + return (usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_out(epd)); +} /*-------------------------------------------------------------------------*/ -- cgit v1.2.3-71-gd317 From f8bece8d91f9ed9cff3c98920802f1b3046b7560 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 5 Feb 2009 16:54:25 +0100 Subject: USB: serial: introduce a flag into the usb serial layer to tell drivers that their URBs are killed due to suspension This patch introduces a flag into the usb serial layer to tell drivers that their URBs are killed due to suspension. That is necessary to let drivers know whether they should report an error back. Signed-off-by: Oliver Neukum Hi Greg, this is for 2.6.30. Patches to use this in drivers are under development. 
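Those follow-up driver patches are not part of this commit, but the intended use is easy to sketch: a subdriver's completion handler can now tell a suspend-induced unlink apart from a genuine error. Hypothetical example:

/* Hypothetical read-URB completion callback in a usb-serial subdriver. */
static void example_read_bulk_callback(struct urb *urb)
{
	struct usb_serial_port *port = urb->context;
	struct usb_serial *serial = port->serial;

	if (urb->status) {
		if (serial->suspending)
			return;		/* URB killed for suspend: not an error */
		dev_err(&port->dev, "read failed, status %d\n", urb->status);
		return;
	}
	/* ... push urb->transfer_buffer to the tty layer, resubmit ... */
}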
Regards Oliver --- drivers/usb/serial/usb-serial.c | 4 ++++ include/linux/usb/serial.h | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index cfcfd5ab06ce..c6aaa6dc7564 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -1067,6 +1067,8 @@ int usb_serial_suspend(struct usb_interface *intf, pm_message_t message) struct usb_serial_port *port; int i, r = 0; + serial->suspending = 1; + for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; if (port) @@ -1084,8 +1086,10 @@ int usb_serial_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); + serial->suspending = 0; if (serial->type->resume) return serial->type->resume(serial); + return 0; } EXPORT_SYMBOL(usb_serial_resume); diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 0b8617a9176d..b95842542590 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -130,7 +130,8 @@ struct usb_serial { struct usb_device *dev; struct usb_serial_driver *type; struct usb_interface *interface; - unsigned char disconnected; + unsigned char disconnected:1; + unsigned char suspending:1; unsigned char minor; unsigned char num_ports; unsigned char num_port_pointers; -- cgit v1.2.3-71-gd317 From f6d92a05c86754d62eabc84856d2035d0de3ddc3 Mon Sep 17 00:00:00 2001 From: Ajay Kumar Gupta Date: Fri, 6 Feb 2009 17:32:35 +0530 Subject: USB: otg: adding nop usb transceiver NOP transceiver is used by all the usb transceiver which are mostly autonomous and doesn't require any programming or which are built into the usb ip itself.NOP transceiver only allocates the memory for struct xceiv and calls otg_set_transceiver() so function call to otg_get_transceiver() will return a valid transceiver. NOP transceiver device should be registered by calling usb_nop_xceiv_register() from platform files. Signed-off-by: Ajay Kumar Gupta Cc: Felipe Balbi Cc: David Brownell Signed-off-by: Greg Kroah-Hartman --- drivers/usb/otg/Kconfig | 8 ++ drivers/usb/otg/Makefile | 1 + drivers/usb/otg/nop-usb-xceiv.c | 180 ++++++++++++++++++++++++++++++++++++++++ include/linux/usb/otg.h | 4 + 4 files changed, 193 insertions(+) create mode 100644 drivers/usb/otg/nop-usb-xceiv.c (limited to 'include/linux') diff --git a/drivers/usb/otg/Kconfig b/drivers/usb/otg/Kconfig index ee55b449ffde..fc1ca03ce4da 100644 --- a/drivers/usb/otg/Kconfig +++ b/drivers/usb/otg/Kconfig @@ -51,4 +51,12 @@ config TWL4030_USB This transceiver supports high and full speed devices plus, in host mode, low speed. +config NOP_USB_XCEIV + tristate "NOP USB Transceiver Driver" + select USB_OTG_UTILS + help + this driver is to be used by all the usb transceiver which are either + built-in with usb ip or which are autonomous and doesn't require any + phy programming such as ISP1x04 etc. 
+ endif # USB || OTG diff --git a/drivers/usb/otg/Makefile b/drivers/usb/otg/Makefile index d73c7cf5e2f7..208167856529 100644 --- a/drivers/usb/otg/Makefile +++ b/drivers/usb/otg/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_USB_OTG_UTILS) += otg.o obj-$(CONFIG_USB_GPIO_VBUS) += gpio_vbus.o obj-$(CONFIG_ISP1301_OMAP) += isp1301_omap.o obj-$(CONFIG_TWL4030_USB) += twl4030-usb.o +obj-$(CONFIG_NOP_USB_XCEIV) += nop-usb-xceiv.o ccflags-$(CONFIG_USB_DEBUG) += -DDEBUG ccflags-$(CONFIG_USB_GADGET_DEBUG) += -DDEBUG diff --git a/drivers/usb/otg/nop-usb-xceiv.c b/drivers/usb/otg/nop-usb-xceiv.c new file mode 100644 index 000000000000..4b933f646f2e --- /dev/null +++ b/drivers/usb/otg/nop-usb-xceiv.c @@ -0,0 +1,180 @@ +/* + * drivers/usb/otg/nop-usb-xceiv.c + * + * NOP USB transceiver for all USB transceiver which are either built-in + * into USB IP or which are mostly autonomous. + * + * Copyright (C) 2009 Texas Instruments Inc + * Author: Ajay Kumar Gupta + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Current status: + * this is to add "nop" transceiver for all those phy which is + * autonomous such as isp1504 etc. 
+ */ + +#include +#include +#include +#include + +struct nop_usb_xceiv { + struct otg_transceiver otg; + struct device *dev; +}; + +static u64 nop_xceiv_dmamask = DMA_32BIT_MASK; + +static struct platform_device nop_xceiv_device = { + .name = "nop_usb_xceiv", + .id = -1, + .dev = { + .dma_mask = &nop_xceiv_dmamask, + .coherent_dma_mask = DMA_32BIT_MASK, + .platform_data = NULL, + }, +}; + +void usb_nop_xceiv_register(void) +{ + if (platform_device_register(&nop_xceiv_device) < 0) { + printk(KERN_ERR "Unable to register usb nop transceiver\n"); + return; + } +} + +void usb_nop_xceiv_unregister(void) +{ + platform_device_unregister(&nop_xceiv_device); +} + +static inline struct nop_usb_xceiv *xceiv_to_nop(struct otg_transceiver *x) +{ + return container_of(x, struct nop_usb_xceiv, otg); +} + +static int nop_set_suspend(struct otg_transceiver *x, int suspend) +{ + return 0; +} + +static int nop_set_peripheral(struct otg_transceiver *x, + struct usb_gadget *gadget) +{ + struct nop_usb_xceiv *nop; + + if (!x) + return -ENODEV; + + nop = xceiv_to_nop(x); + + if (!gadget) { + nop->otg.gadget = NULL; + return -ENODEV; + } + + nop->otg.gadget = gadget; + nop->otg.state = OTG_STATE_B_IDLE; + return 0; +} + +static int nop_set_host(struct otg_transceiver *x, struct usb_bus *host) +{ + struct nop_usb_xceiv *nop; + + if (!x) + return -ENODEV; + + nop = xceiv_to_nop(x); + + if (!host) { + nop->otg.host = NULL; + return -ENODEV; + } + + nop->otg.host = host; + return 0; +} + +static int __devinit nop_usb_xceiv_probe(struct platform_device *pdev) +{ + struct nop_usb_xceiv *nop; + int err; + + nop = kzalloc(sizeof *nop, GFP_KERNEL); + if (!nop) + return -ENOMEM; + + nop->dev = &pdev->dev; + nop->otg.dev = nop->dev; + nop->otg.label = "nop-xceiv"; + nop->otg.state = OTG_STATE_UNDEFINED; + nop->otg.set_host = nop_set_host; + nop->otg.set_peripheral = nop_set_peripheral; + nop->otg.set_suspend = nop_set_suspend; + + err = otg_set_transceiver(&nop->otg); + if (err) { + dev_err(&pdev->dev, "can't register transceiver, err: %d\n", + err); + goto exit; + } + + platform_set_drvdata(pdev, nop); + + return 0; +exit: + kfree(nop); + return err; +} + +static int __devexit nop_usb_xceiv_remove(struct platform_device *pdev) +{ + struct nop_usb_xceiv *nop = platform_get_drvdata(pdev); + + otg_set_transceiver(NULL); + + platform_set_drvdata(pdev, NULL); + kfree(nop); + + return 0; +} + +static struct platform_driver nop_usb_xceiv_driver = { + .probe = nop_usb_xceiv_probe, + .remove = __devexit_p(nop_usb_xceiv_remove), + .driver = { + .name = "nop_usb_xceiv", + .owner = THIS_MODULE, + }, +}; + +static int __init nop_usb_xceiv_init(void) +{ + return platform_driver_register(&nop_usb_xceiv_driver); +} +subsys_initcall(nop_usb_xceiv_init); + +static void __exit nop_usb_xceiv_exit(void) +{ + platform_driver_unregister(&nop_usb_xceiv_driver); +} +module_exit(nop_usb_xceiv_exit); + +MODULE_ALIAS("platform:nop_usb_xceiv"); +MODULE_AUTHOR("Texas Instruments Inc"); +MODULE_DESCRIPTION("NOP USB Transceiver driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h index 60a52576fd5c..1aaa826396a1 100644 --- a/include/linux/usb/otg.h +++ b/include/linux/usb/otg.h @@ -80,6 +80,10 @@ struct otg_transceiver { /* for board-specific init logic */ extern int otg_set_transceiver(struct otg_transceiver *); +#ifdef CONFIG_NOP_USB_XCEIV +extern void usb_nop_xceiv_register(void); +extern void usb_nop_xceiv_unregister(void); +#endif /* for usb host and peripheral controller drivers */ -- cgit v1.2.3-71-gd317 
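Putting the NOP transceiver above to use involves two pieces outside this patch, both hypothetical sketches here: the board file registers it early, and the controller driver later fetches it via otg_get_transceiver().

#include <linux/init.h>
#include <linux/usb/otg.h>

/* Board/platform init (hypothetical): make a transceiver available before
 * the USB controller probes. */
static void __init example_board_usb_init(void)
{
	usb_nop_xceiv_register();
	/* ... then register the controller's platform device ... */
}

/* Controller driver (hypothetical): pick up whatever transceiver was
 * registered; with the NOP driver loaded this no longer returns NULL. */
static int example_bind_transceiver(struct usb_gadget *gadget)
{
	struct otg_transceiver *x = otg_get_transceiver();

	if (!x)
		return -ENODEV;

	otg_set_peripheral(x, gadget);	/* the NOP driver just records the gadget */
	return 0;
}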
From e6e244b6cb1f70e7109381626293cd40a8334ed3 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 12 Feb 2009 14:47:44 -0500 Subject: usb-storage: prepare for subdriver separation This patch (as1206) is the first step in converting usb-storage's subdrivers into separate modules. It makes the following large-scale changes: Remove a bunch of unnecessary #ifdef's from usb_usual.h. Not truly necessary, but it does clean things up. Move the USB device-ID table (which is duplicated between libusual and usb-storage) into its own source file, usual-tables.c, and arrange for this to be linked with either libusual or usb-storage according to whether USB_LIBUSUAL is configured. Add to usual-tables.c a new usb_usual_ignore_device() function to detect whether a particular device needs to be managed by a subdriver and not by the standard handlers in usb-storage. Export a whole bunch of functions in usb-storage, renaming some of them because their names don't already begin with "usb_stor_". These functions will be needed by the new subdriver modules. Split usb-storage's probe routine into two functions. The subdrivers will call the probe1 routine, then fill in their transport and protocol settings, and then call the probe2 routine. Take the default cases and error checking out of get_transport() and get_protocol(), which run during probe1, and instead put a check for invalid transport or protocol values into the probe2 function. Add a new probe routine to be used for standard devices, i.e., those that don't need a subdriver. This new routine checks whether the device should be ignored (because it should be handled by ub or by a subdriver), and if not, calls the probe1 and probe2 functions. Signed-off-by: Alan Stern CC: Matthew Dharm Signed-off-by: Greg Kroah-Hartman --- drivers/block/ub.c | 2 +- drivers/usb/storage/Makefile | 6 +- drivers/usb/storage/libusual.c | 33 +----- drivers/usb/storage/protocol.c | 3 + drivers/usb/storage/scsiglue.c | 2 +- drivers/usb/storage/transport.c | 10 ++ drivers/usb/storage/usb.c | 209 ++++++++++++++++++++----------------- drivers/usb/storage/usb.h | 21 ++++ drivers/usb/storage/usual-tables.c | 105 +++++++++++++++++++ include/linux/usb_usual.h | 21 +--- 10 files changed, 261 insertions(+), 151 deletions(-) create mode 100644 drivers/usb/storage/usual-tables.c (limited to 'include/linux') diff --git a/drivers/block/ub.c b/drivers/block/ub.c index b36b84fbe390..69b7f8e77596 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -391,7 +391,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum); */ #ifdef CONFIG_USB_LIBUSUAL -#define ub_usb_ids storage_usb_ids +#define ub_usb_ids usb_storage_usb_ids #else static struct usb_device_id ub_usb_ids[] = { diff --git a/drivers/usb/storage/Makefile b/drivers/usb/storage/Makefile index b32069313390..a9e475e127a5 100644 --- a/drivers/usb/storage/Makefile +++ b/drivers/usb/storage/Makefile @@ -25,6 +25,8 @@ usb-storage-obj-$(CONFIG_USB_STORAGE_CYPRESS_ATACB) += cypress_atacb.o usb-storage-objs := scsiglue.o protocol.o transport.o usb.o \ initializers.o sierra_ms.o option_ms.o $(usb-storage-obj-y) -ifneq ($(CONFIG_USB_LIBUSUAL),) - obj-$(CONFIG_USB) += libusual.o +ifeq ($(CONFIG_USB_LIBUSUAL),) + usb-storage-objs += usual-tables.o +else + obj-$(CONFIG_USB) += libusual.o usual-tables.o endif diff --git a/drivers/usb/storage/libusual.c b/drivers/usb/storage/libusual.c index f970b27ba308..fe3ffe1459b2 100644 --- a/drivers/usb/storage/libusual.c +++ b/drivers/usb/storage/libusual.c @@ -37,37 +37,6 @@ static atomic_t total_threads = 
ATOMIC_INIT(0); static int usu_probe_thread(void *arg); -/* - * The table. - */ -#define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ - vendorName, productName,useProtocol, useTransport, \ - initFunction, flags) \ -{ USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin,bcdDeviceMax), \ - .driver_info = (flags)|(USB_US_TYPE_STOR<<24) } - -#define COMPLIANT_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ - vendorName, productName, useProtocol, useTransport, \ - initFunction, flags) \ -{ USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ - .driver_info = (flags) } - -#define USUAL_DEV(useProto, useTrans, useType) \ -{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, useProto, useTrans), \ - .driver_info = ((useType)<<24) } - -struct usb_device_id storage_usb_ids [] = { -# include "unusual_devs.h" - { } /* Terminating entry */ -}; - -#undef USUAL_DEV -#undef UNUSUAL_DEV -#undef COMPLIANT_DEV - -MODULE_DEVICE_TABLE(usb, storage_usb_ids); -EXPORT_SYMBOL_GPL(storage_usb_ids); - /* * @type: the module type as an integer */ @@ -167,7 +136,7 @@ static struct usb_driver usu_driver = { .name = "libusual", .probe = usu_probe, .disconnect = usu_disconnect, - .id_table = storage_usb_ids, + .id_table = usb_storage_usb_ids, }; /* diff --git a/drivers/usb/storage/protocol.c b/drivers/usb/storage/protocol.c index be441d84bc64..fc310f75eada 100644 --- a/drivers/usb/storage/protocol.c +++ b/drivers/usb/storage/protocol.c @@ -121,6 +121,7 @@ void usb_stor_transparent_scsi_command(struct scsi_cmnd *srb, /* send the command to the transport layer */ usb_stor_invoke_transport(srb, us); } +EXPORT_SYMBOL_GPL(usb_stor_transparent_scsi_command); /*********************************************************************** * Scatter-gather transfer buffer access routines @@ -199,6 +200,7 @@ unsigned int usb_stor_access_xfer_buf(unsigned char *buffer, /* Return the amount actually transferred */ return cnt; } +EXPORT_SYMBOL_GPL(usb_stor_access_xfer_buf); /* Store the contents of buffer into srb's transfer buffer and set the * SCSI residue. 
@@ -215,3 +217,4 @@ void usb_stor_set_xfer_buf(unsigned char *buffer, if (buflen < scsi_bufflen(srb)) scsi_set_resid(srb, scsi_bufflen(srb) - buflen); } +EXPORT_SYMBOL_GPL(usb_stor_set_xfer_buf); diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index ed710bcdaab2..4ca3b5860643 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -569,4 +569,4 @@ unsigned char usb_stor_sense_invalidCDB[18] = { [7] = 0x0a, /* additional length */ [12] = 0x24 /* Invalid Field in CDB */ }; - +EXPORT_SYMBOL_GPL(usb_stor_sense_invalidCDB); diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index fb65d221cedf..d48c8553539d 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -220,6 +220,7 @@ int usb_stor_control_msg(struct us_data *us, unsigned int pipe, status = us->current_urb->actual_length; return status; } +EXPORT_SYMBOL_GPL(usb_stor_control_msg); /* This is a version of usb_clear_halt() that allows early termination and * doesn't read the status from the device -- this is because some devices @@ -254,6 +255,7 @@ int usb_stor_clear_halt(struct us_data *us, unsigned int pipe) US_DEBUGP("%s: result = %d\n", __func__, result); return result; } +EXPORT_SYMBOL_GPL(usb_stor_clear_halt); /* @@ -352,6 +354,7 @@ int usb_stor_ctrl_transfer(struct us_data *us, unsigned int pipe, return interpret_urb_result(us, pipe, size, result, us->current_urb->actual_length); } +EXPORT_SYMBOL_GPL(usb_stor_ctrl_transfer); /* * Receive one interrupt buffer, without timeouts, but allowing early @@ -407,6 +410,7 @@ int usb_stor_bulk_transfer_buf(struct us_data *us, unsigned int pipe, return interpret_urb_result(us, pipe, length, result, us->current_urb->actual_length); } +EXPORT_SYMBOL_GPL(usb_stor_bulk_transfer_buf); /* * Transfer a scatter-gather list via bulk transfer @@ -474,6 +478,7 @@ int usb_stor_bulk_srb(struct us_data* us, unsigned int pipe, scsi_set_resid(srb, scsi_bufflen(srb) - partial); return result; } +EXPORT_SYMBOL_GPL(usb_stor_bulk_srb); /* * Transfer an entire SCSI command's worth of data payload over the bulk @@ -509,6 +514,7 @@ int usb_stor_bulk_transfer_sg(struct us_data* us, unsigned int pipe, *residual = length_left; return result; } +EXPORT_SYMBOL_GPL(usb_stor_bulk_transfer_sg); /*********************************************************************** * Transport routines @@ -940,6 +946,7 @@ int usb_stor_CB_transport(struct scsi_cmnd *srb, struct us_data *us) usb_stor_clear_halt(us, pipe); return USB_STOR_TRANSPORT_FAILED; } +EXPORT_SYMBOL_GPL(usb_stor_CB_transport); /* * Bulk only transport @@ -1156,6 +1163,7 @@ int usb_stor_Bulk_transport(struct scsi_cmnd *srb, struct us_data *us) /* we should never get here, but if we do, we're in trouble */ return USB_STOR_TRANSPORT_ERROR; } +EXPORT_SYMBOL_GPL(usb_stor_Bulk_transport); /*********************************************************************** * Reset routines @@ -1230,6 +1238,7 @@ int usb_stor_CB_reset(struct us_data *us) USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, us->ifnum, us->iobuf, CB_RESET_CMD_SIZE); } +EXPORT_SYMBOL_GPL(usb_stor_CB_reset); /* This issues a Bulk-only Reset to the device in question, including * clearing the subsequent endpoint halts that may occur. @@ -1242,6 +1251,7 @@ int usb_stor_Bulk_reset(struct us_data *us) USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, us->ifnum, NULL, 0); } +EXPORT_SYMBOL_GPL(usb_stor_Bulk_reset); /* Issue a USB port reset to the device. The caller must not hold * us->dev_mutex. 
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c index b01dade63cb3..490ea761398c 100644 --- a/drivers/usb/storage/usb.c +++ b/drivers/usb/storage/usb.c @@ -5,7 +5,7 @@ * * Developed with the assistance of: * (c) 2000 David L. Brown, Jr. (usb-storage@davidb.org) - * (c) 2003 Alan Stern (stern@rowland.harvard.edu) + * (c) 2003-2009 Alan Stern (stern@rowland.harvard.edu) * * Initial work by: * (c) 1999 Michael Gee (michael@linuxspecific.com) @@ -118,36 +118,8 @@ MODULE_PARM_DESC(quirks, "supplemental list of device IDs and their quirks"); /* * The entries in this table correspond, line for line, - * with the entries of us_unusual_dev_list[]. + * with the entries in usb_storage_usb_ids[], defined in usual-tables.c. */ -#ifndef CONFIG_USB_LIBUSUAL - -#define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ - vendorName, productName,useProtocol, useTransport, \ - initFunction, flags) \ -{ USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin,bcdDeviceMax), \ - .driver_info = (flags)|(USB_US_TYPE_STOR<<24) } - -#define COMPLIANT_DEV UNUSUAL_DEV - -#define USUAL_DEV(useProto, useTrans, useType) \ -{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, useProto, useTrans), \ - .driver_info = (USB_US_TYPE_STOR<<24) } - -static struct usb_device_id storage_usb_ids [] = { - -# include "unusual_devs.h" -#undef UNUSUAL_DEV -#undef COMPLIANT_DEV -#undef USUAL_DEV - /* Terminating entry */ - { } -}; - -MODULE_DEVICE_TABLE (usb, storage_usb_ids); -#endif /* CONFIG_USB_LIBUSUAL */ - -/* This is the list of devices we recognize, along with their flag data */ /* The vendor name should be kept at eight characters or less, and * the product name should be kept at 16 characters or less. If a device @@ -179,18 +151,17 @@ MODULE_DEVICE_TABLE (usb, storage_usb_ids); static struct us_unusual_dev us_unusual_dev_list[] = { # include "unusual_devs.h" -# undef UNUSUAL_DEV -# undef COMPLIANT_DEV -# undef USUAL_DEV - - /* Terminating entry */ - { NULL } + { } /* Terminating entry */ }; +#undef UNUSUAL_DEV +#undef COMPLIANT_DEV +#undef USUAL_DEV + #ifdef CONFIG_PM /* Minimal support for suspend and resume */ -static int storage_suspend(struct usb_interface *iface, pm_message_t message) +int usb_stor_suspend(struct usb_interface *iface, pm_message_t message) { struct us_data *us = usb_get_intfdata(iface); @@ -207,8 +178,9 @@ static int storage_suspend(struct usb_interface *iface, pm_message_t message) mutex_unlock(&us->dev_mutex); return 0; } +EXPORT_SYMBOL_GPL(usb_stor_suspend); -static int storage_resume(struct usb_interface *iface) +int usb_stor_resume(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); @@ -221,8 +193,9 @@ static int storage_resume(struct usb_interface *iface) mutex_unlock(&us->dev_mutex); return 0; } +EXPORT_SYMBOL_GPL(usb_stor_resume); -static int storage_reset_resume(struct usb_interface *iface) +int usb_stor_reset_resume(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); @@ -235,6 +208,7 @@ static int storage_reset_resume(struct usb_interface *iface) * the device */ return 0; } +EXPORT_SYMBOL_GPL(usb_stor_reset_resume); #endif /* CONFIG_PM */ @@ -243,7 +217,7 @@ static int storage_reset_resume(struct usb_interface *iface) * a USB port reset, whether from this driver or a different one. 
*/ -static int storage_pre_reset(struct usb_interface *iface) +int usb_stor_pre_reset(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); @@ -253,8 +227,9 @@ static int storage_pre_reset(struct usb_interface *iface) mutex_lock(&us->dev_mutex); return 0; } +EXPORT_SYMBOL_GPL(usb_stor_pre_reset); -static int storage_post_reset(struct usb_interface *iface) +int usb_stor_post_reset(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); @@ -269,6 +244,7 @@ static int storage_post_reset(struct usb_interface *iface) mutex_unlock(&us->dev_mutex); return 0; } +EXPORT_SYMBOL_GPL(usb_stor_post_reset); /* * fill_inquiry_response takes an unsigned char array (which must @@ -311,6 +287,7 @@ void fill_inquiry_response(struct us_data *us, unsigned char *data, usb_stor_set_xfer_buf(data, data_len, us->srb); } +EXPORT_SYMBOL_GPL(fill_inquiry_response); static int usb_stor_control_thread(void * __us) { @@ -551,20 +528,13 @@ static void adjust_quirks(struct us_data *us) vid, pid, f); } -/* Find an unusual_dev descriptor (always succeeds in the current code) */ -static struct us_unusual_dev *find_unusual(const struct usb_device_id *id) -{ - const int id_index = id - storage_usb_ids; - return &us_unusual_dev_list[id_index]; -} - /* Get the unusual_devs entries and the string descriptors */ -static int get_device_info(struct us_data *us, const struct usb_device_id *id) +static int get_device_info(struct us_data *us, const struct usb_device_id *id, + struct us_unusual_dev *unusual_dev) { struct usb_device *dev = us->pusb_dev; struct usb_interface_descriptor *idesc = &us->pusb_intf->cur_altsetting->desc; - struct us_unusual_dev *unusual_dev = find_unusual(id); /* Store the entries */ us->unusual_dev = unusual_dev; @@ -629,7 +599,7 @@ static int get_device_info(struct us_data *us, const struct usb_device_id *id) } /* Get the transport settings */ -static int get_transport(struct us_data *us) +static void get_transport(struct us_data *us) { switch (us->protocol) { case US_PR_CB: @@ -732,19 +702,11 @@ static int get_transport(struct us_data *us) break; #endif - default: - return -EIO; } - US_DEBUGP("Transport: %s\n", us->transport_name); - - /* fix for single-lun devices */ - if (us->fflags & US_FL_SINGLE_LUN) - us->max_lun = 0; - return 0; } /* Get the protocol settings */ -static int get_protocol(struct us_data *us) +static void get_protocol(struct us_data *us) { switch (us->subclass) { case US_SC_RBC: @@ -794,11 +756,7 @@ static int get_protocol(struct us_data *us) break; #endif - default: - return -EIO; } - US_DEBUGP("Protocol: %s\n", us->protocol_name); - return 0; } /* Get the pipe settings */ @@ -1012,17 +970,15 @@ static int usb_stor_scan_thread(void * __us) } -/* Probe to see if we can drive a newly-connected USB device */ -static int storage_probe(struct usb_interface *intf, - const struct usb_device_id *id) +/* First part of general USB mass-storage probing */ +int usb_stor_probe1(struct us_data **pus, + struct usb_interface *intf, + const struct usb_device_id *id, + struct us_unusual_dev *unusual_dev) { struct Scsi_Host *host; struct us_data *us; int result; - struct task_struct *th; - - if (usb_usual_check_type(id, USB_US_TYPE_STOR)) - return -ENXIO; US_DEBUGP("USB Mass Storage device detected\n"); @@ -1041,7 +997,7 @@ static int storage_probe(struct usb_interface *intf, * Allow 16-byte CDBs and thus > 2TB */ host->max_cmd_len = 16; - us = host_to_us(host); + *pus = us = host_to_us(host); memset(us, 0, sizeof(struct us_data)); mutex_init(&(us->dev_mutex)); 
init_completion(&us->cmnd_ready); @@ -1054,24 +1010,46 @@ static int storage_probe(struct usb_interface *intf, if (result) goto BadDevice; - /* - * Get the unusual_devs entries and the descriptors - * - * id_index is calculated in the declaration to be the index number - * of the match from the usb_device_id table, so we can find the - * corresponding entry in the private table. - */ - result = get_device_info(us, id); + /* Get the unusual_devs entries and the descriptors */ + result = get_device_info(us, id, unusual_dev); if (result) goto BadDevice; - /* Get the transport, protocol, and pipe settings */ - result = get_transport(us); - if (result) - goto BadDevice; - result = get_protocol(us); - if (result) + /* Get standard transport and protocol settings */ + get_transport(us); + get_protocol(us); + + /* Give the caller a chance to fill in specialized transport + * or protocol settings. + */ + return 0; + +BadDevice: + US_DEBUGP("storage_probe() failed\n"); + release_everything(us); + return result; +} +EXPORT_SYMBOL_GPL(usb_stor_probe1); + +/* Second part of general USB mass-storage probing */ +int usb_stor_probe2(struct us_data *us) +{ + struct task_struct *th; + int result; + + /* Make sure the transport and protocol have both been set */ + if (!us->transport || !us->proto_handler) { + result = -ENXIO; goto BadDevice; + } + US_DEBUGP("Transport: %s\n", us->transport_name); + US_DEBUGP("Protocol: %s\n", us->protocol_name); + + /* fix for single-lun devices */ + if (us->fflags & US_FL_SINGLE_LUN) + us->max_lun = 0; + + /* Find the endpoints and calculate pipe values */ result = get_pipes(us); if (result) goto BadDevice; @@ -1080,7 +1058,7 @@ static int storage_probe(struct usb_interface *intf, result = usb_stor_acquire_resources(us); if (result) goto BadDevice; - result = scsi_add_host(host, &intf->dev); + result = scsi_add_host(us_to_host(us), &us->pusb_intf->dev); if (result) { printk(KERN_WARNING USB_STORAGE "Unable to add the scsi host\n"); @@ -1108,9 +1086,10 @@ BadDevice: release_everything(us); return result; } +EXPORT_SYMBOL_GPL(usb_stor_probe2); -/* Handle a disconnect event from the USB core */ -static void storage_disconnect(struct usb_interface *intf) +/* Handle a USB mass-storage disconnect */ +void usb_stor_disconnect(struct usb_interface *intf) { struct us_data *us = usb_get_intfdata(intf); @@ -1118,6 +1097,42 @@ static void storage_disconnect(struct usb_interface *intf) quiesce_and_remove_host(us); release_everything(us); } +EXPORT_SYMBOL_GPL(usb_stor_disconnect); + +/* The main probe routine for standard devices */ +static int storage_probe(struct usb_interface *intf, + const struct usb_device_id *id) +{ + struct us_data *us; + int result; + + /* + * If libusual is configured, let it decide whether a standard + * device should be handled by usb-storage or by ub. + * If the device isn't standard (is handled by a subdriver + * module) then don't accept it. + */ + if (usb_usual_check_type(id, USB_US_TYPE_STOR) || + usb_usual_ignore_device(intf)) + return -ENXIO; + + /* + * Call the general probe procedures. + * + * The unusual_dev_list array is parallel to the usb_storage_usb_ids + * table, so we use the index of the id entry to find the + * corresponding unusual_devs entry. 
+ */ + result = usb_stor_probe1(&us, intf, id, + (id - usb_storage_usb_ids) + us_unusual_dev_list); + if (result) + return result; + + /* No special transport or protocol settings in the main module */ + + result = usb_stor_probe2(us); + return result; +} /*********************************************************************** * Initialization and registration @@ -1126,15 +1141,13 @@ static void storage_disconnect(struct usb_interface *intf) static struct usb_driver usb_storage_driver = { .name = "usb-storage", .probe = storage_probe, - .disconnect = storage_disconnect, -#ifdef CONFIG_PM - .suspend = storage_suspend, - .resume = storage_resume, - .reset_resume = storage_reset_resume, -#endif - .pre_reset = storage_pre_reset, - .post_reset = storage_post_reset, - .id_table = storage_usb_ids, + .disconnect = usb_stor_disconnect, + .suspend = usb_stor_suspend, + .resume = usb_stor_resume, + .reset_resume = usb_stor_reset_resume, + .pre_reset = usb_stor_pre_reset, + .post_reset = usb_stor_post_reset, + .id_table = usb_storage_usb_ids, .soft_unbind = 1, }; diff --git a/drivers/usb/storage/usb.h b/drivers/usb/storage/usb.h index 65e674e4be99..2609efb2bd7e 100644 --- a/drivers/usb/storage/usb.h +++ b/drivers/usb/storage/usb.h @@ -177,4 +177,25 @@ extern void fill_inquiry_response(struct us_data *us, #define scsi_unlock(host) spin_unlock_irq(host->host_lock) #define scsi_lock(host) spin_lock_irq(host->host_lock) +/* General routines provided by the usb-storage standard core */ +#ifdef CONFIG_PM +extern int usb_stor_suspend(struct usb_interface *iface, pm_message_t message); +extern int usb_stor_resume(struct usb_interface *iface); +extern int usb_stor_reset_resume(struct usb_interface *iface); +#else +#define usb_stor_suspend NULL +#define usb_stor_resume NULL +#define usb_stor_reset_resume NULL +#endif + +extern int usb_stor_pre_reset(struct usb_interface *iface); +extern int usb_stor_post_reset(struct usb_interface *iface); + +extern int usb_stor_probe1(struct us_data **pus, + struct usb_interface *intf, + const struct usb_device_id *id, + struct us_unusual_dev *unusual_dev); +extern int usb_stor_probe2(struct us_data *us); +extern void usb_stor_disconnect(struct usb_interface *intf); + #endif diff --git a/drivers/usb/storage/usual-tables.c b/drivers/usb/storage/usual-tables.c new file mode 100644 index 000000000000..1924e3229409 --- /dev/null +++ b/drivers/usb/storage/usual-tables.c @@ -0,0 +1,105 @@ +/* Driver for USB Mass Storage devices + * Usual Tables File for usb-storage and libusual + * + * Copyright (C) 2009 Alan Stern (stern@rowland.harvard.edu) + * + * Please see http://www.one-eyed-alien.net/~mdharm/linux-usb for more + * information about this driver. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include + + +/* + * The table of devices + */ +#define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ + vendorName, productName, useProtocol, useTransport, \ + initFunction, flags) \ +{ USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ + .driver_info = (flags)|(USB_US_TYPE_STOR<<24) } + +#define COMPLIANT_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ + vendorName, productName, useProtocol, useTransport, \ + initFunction, flags) \ +{ USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ + .driver_info = (flags) } + +#define USUAL_DEV(useProto, useTrans, useType) \ +{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, useProto, useTrans), \ + .driver_info = ((useType)<<24) } + +struct usb_device_id usb_storage_usb_ids[] = { +# include "unusual_devs.h" + { } /* Terminating entry */ +}; +EXPORT_SYMBOL_GPL(usb_storage_usb_ids); + +MODULE_DEVICE_TABLE(usb, usb_storage_usb_ids); + +#undef UNUSUAL_DEV +#undef COMPLIANT_DEV +#undef USUAL_DEV + + +/* + * The table of devices to ignore + */ +struct ignore_entry { + u16 vid, pid, bcdmin, bcdmax; +}; + +#define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ + vendorName, productName, useProtocol, useTransport, \ + initFunction, flags) \ +{ \ + .vid = id_vendor, \ + .pid = id_product, \ + .bcdmin = bcdDeviceMin, \ + .bcdmax = bcdDeviceMax, \ +} + +static struct ignore_entry ignore_ids[] = { + { } /* Terminating entry */ +}; + +#undef UNUSUAL_DEV + + +/* Return an error if a device is in the ignore_ids list */ +int usb_usual_ignore_device(struct usb_interface *intf) +{ + struct usb_device *udev; + unsigned vid, pid, bcd; + struct ignore_entry *p; + + udev = interface_to_usbdev(intf); + vid = le16_to_cpu(udev->descriptor.idVendor); + pid = le16_to_cpu(udev->descriptor.idProduct); + bcd = le16_to_cpu(udev->descriptor.bcdDevice); + + for (p = ignore_ids; p->vid; ++p) { + if (p->vid == vid && p->pid == pid && + p->bcdmin <= bcd && p->bcdmax >= bcd) + return -ENXIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(usb_usual_ignore_device); diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h index 1eea1ab68dc4..3d15fb9bc116 100644 --- a/include/linux/usb_usual.h +++ b/include/linux/usb_usual.h @@ -96,39 +96,26 @@ enum { US_DO_ALL_FLAGS }; #define US_PR_CBI 0x00 /* Control/Bulk/Interrupt */ #define US_PR_CB 0x01 /* Control/Bulk w/o interrupt */ #define US_PR_BULK 0x50 /* bulk only */ -#ifdef CONFIG_USB_STORAGE_USBAT + #define US_PR_USBAT 0x80 /* SCM-ATAPI bridge */ -#endif -#ifdef CONFIG_USB_STORAGE_SDDR09 #define US_PR_EUSB_SDDR09 0x81 /* SCM-SCSI bridge for SDDR-09 */ -#endif -#ifdef CONFIG_USB_STORAGE_SDDR55 #define US_PR_SDDR55 0x82 /* SDDR-55 (made up) */ -#endif #define US_PR_DPCM_USB 0xf0 /* Combination CB/SDDR09 */ -#ifdef CONFIG_USB_STORAGE_FREECOM #define US_PR_FREECOM 0xf1 /* Freecom */ -#endif -#ifdef CONFIG_USB_STORAGE_DATAFAB #define US_PR_DATAFAB 0xf2 /* Datafab chipsets */ -#endif -#ifdef CONFIG_USB_STORAGE_JUMPSHOT #define US_PR_JUMPSHOT 0xf3 /* Lexar Jumpshot */ -#endif -#ifdef CONFIG_USB_STORAGE_ALAUDA #define US_PR_ALAUDA 0xf4 /* Alauda chipsets */ -#endif -#ifdef CONFIG_USB_STORAGE_KARMA #define US_PR_KARMA 0xf5 /* Rio Karma */ -#endif #define US_PR_DEVICE 0xff /* Use device's value */ /* */ +extern int usb_usual_ignore_device(struct usb_interface *intf); +extern struct usb_device_id usb_storage_usb_ids[]; + #ifdef CONFIG_USB_LIBUSUAL -extern struct usb_device_id storage_usb_ids[]; extern void usb_usual_set_present(int type); 
extern void usb_usual_clear_present(int type); extern int usb_usual_check_type(const struct usb_device_id *, int type); -- cgit v1.2.3-71-gd317 From 6da9c99059bf24fb1faae6b9613bae64ea50c05e Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 18 Feb 2009 14:43:47 +0000 Subject: USB: allow libusb to talk to unauthenticated WUSB devices To permit a userspace application to associate with WUSB devices using numeric association, control transfers to unauthenticated WUSB devices must be allowed. This requires that wusbcore correctly sets the device state to UNAUTHENTICATED, DEFAULT and ADDRESS and that control transfers can be performed to UNAUTHENTICATED devices. Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 3 ++- drivers/usb/core/hub.c | 1 + drivers/usb/core/urb.c | 2 +- drivers/usb/wusbcore/devconnect.c | 2 ++ drivers/usb/wusbcore/security.c | 2 ++ include/linux/usb/ch9.h | 2 +- 6 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 6585f527e381..8f022af2fd7a 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -525,7 +525,8 @@ static int check_ctrlrecip(struct dev_state *ps, unsigned int requesttype, { int ret = 0; - if (ps->dev->state != USB_STATE_ADDRESS + if (ps->dev->state != USB_STATE_UNAUTHENTICATED + && ps->dev->state != USB_STATE_ADDRESS && ps->dev->state != USB_STATE_CONFIGURED) return -EHOSTUNREACH; if (USB_TYPE_VENDOR == (USB_TYPE_MASK & requesttype)) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 7e33d63ab92f..f17d9ebc44af 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1305,6 +1305,7 @@ void usb_set_device_state(struct usb_device *udev, recursively_mark_NOTATTACHED(udev); spin_unlock_irqrestore(&device_state_lock, flags); } +EXPORT_SYMBOL_GPL(usb_set_device_state); /* * WUSB devices are simple: they have no hubs behind, so the mapping diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index 58bc5e3c2560..7025d801f23a 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -295,7 +295,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) if (!urb || urb->hcpriv || !urb->complete) return -EINVAL; dev = urb->dev; - if ((!dev) || (dev->state < USB_STATE_DEFAULT)) + if ((!dev) || (dev->state < USB_STATE_UNAUTHENTICATED)) return -ENODEV; /* For now, get the endpoint from the pipe. Eventually drivers diff --git a/drivers/usb/wusbcore/devconnect.c b/drivers/usb/wusbcore/devconnect.c index 8e18141bb2e0..f0aac0cf315a 100644 --- a/drivers/usb/wusbcore/devconnect.c +++ b/drivers/usb/wusbcore/devconnect.c @@ -889,6 +889,8 @@ static void wusb_dev_add_ncb(struct usb_device *usb_dev) if (usb_dev->wusb == 0 || usb_dev->devnum == 1) return; /* skip non wusb and wusb RHs */ + usb_set_device_state(usb_dev, USB_STATE_UNAUTHENTICATED); + wusbhc = wusbhc_get_by_usb_dev(usb_dev); if (wusbhc == NULL) goto error_nodev; diff --git a/drivers/usb/wusbcore/security.c b/drivers/usb/wusbcore/security.c index f4aa28eca70d..8118db7f1d8d 100644 --- a/drivers/usb/wusbcore/security.c +++ b/drivers/usb/wusbcore/security.c @@ -312,6 +312,7 @@ int wusb_dev_update_address(struct wusbhc *wusbhc, struct wusb_dev *wusb_dev) result = wusb_set_dev_addr(wusbhc, wusb_dev, 0); if (result < 0) goto error_addr0; + usb_set_device_state(usb_dev, USB_STATE_DEFAULT); usb_ep0_reinit(usb_dev); /* Set new (authenticated) address. 
*/ @@ -327,6 +328,7 @@ int wusb_dev_update_address(struct wusbhc *wusbhc, struct wusb_dev *wusb_dev) result = wusb_set_dev_addr(wusbhc, wusb_dev, new_address); if (result < 0) goto error_addr; + usb_set_device_state(usb_dev, USB_STATE_ADDRESS); usb_ep0_reinit(usb_dev); usb_dev->authenticated = 1; error_addr: diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h index fa777db7f7eb..d9d54803dbcb 100644 --- a/include/linux/usb/ch9.h +++ b/include/linux/usb/ch9.h @@ -763,8 +763,8 @@ enum usb_device_state { /* chapter 9 and authentication (wireless) device states */ USB_STATE_ATTACHED, USB_STATE_POWERED, /* wired */ - USB_STATE_UNAUTHENTICATED, /* auth */ USB_STATE_RECONNECTING, /* auth */ + USB_STATE_UNAUTHENTICATED, /* auth */ USB_STATE_DEFAULT, /* limited function */ USB_STATE_ADDRESS, USB_STATE_CONFIGURED, /* most functions */ -- cgit v1.2.3-71-gd317 From 16e2e5f634f86ccda18366967c4e592eb61bc9cc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 3 Mar 2009 16:44:13 -0800 Subject: USB: make transfer_buffer_lengths in struct urb field u32 Roel Kluin pointed out that transfer_buffer_lengths in struct urb was declared as an 'int'. This patch changes this field to be 'u32' to prevent any potential negative conversion and comparison errors. This triggered a few compiler warning messages when these fields were being used with the min macro, so they have also been fixed up in this patch. Cc: Roel Kluin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/dummy_hcd.c | 2 +- drivers/usb/host/isp116x-hcd.c | 2 +- drivers/usb/host/r8a66597-hcd.c | 2 +- drivers/usb/host/sl811-hcd.c | 4 ++-- drivers/usb/misc/ftdi-elan.c | 6 +++--- include/linux/usb.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/dummy_hcd.c b/drivers/usb/gadget/dummy_hcd.c index 3b42888b72f8..a56b24d305f8 100644 --- a/drivers/usb/gadget/dummy_hcd.c +++ b/drivers/usb/gadget/dummy_hcd.c @@ -1437,7 +1437,7 @@ restart: } if (urb->transfer_buffer_length > 1) buf [1] = 0; - urb->actual_length = min (2, + urb->actual_length = min_t(u32, 2, urb->transfer_buffer_length); value = 0; status = 0; diff --git a/drivers/usb/host/isp116x-hcd.c b/drivers/usb/host/isp116x-hcd.c index 4dda31b26892..a2b305477afe 100644 --- a/drivers/usb/host/isp116x-hcd.c +++ b/drivers/usb/host/isp116x-hcd.c @@ -772,7 +772,7 @@ static int isp116x_urb_enqueue(struct usb_hcd *hcd, break; case PIPE_INTERRUPT: urb->interval = ep->period; - ep->length = min((int)ep->maxpacket, + ep->length = min_t(u32, ep->maxpacket, urb->transfer_buffer_length); /* urb submitted for already existing endpoint */ diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c index 5e942d94aebe..713f4cf0b0dd 100644 --- a/drivers/usb/host/r8a66597-hcd.c +++ b/drivers/usb/host/r8a66597-hcd.c @@ -1394,7 +1394,7 @@ static void packet_write(struct r8a66597 *r8a66597, u16 pipenum) (int)urb->iso_frame_desc[td->iso_cnt].length); } else { buf = (u16 *)(urb->transfer_buffer + urb->actual_length); - size = min((int)bufsize, + size = min_t(u32, bufsize, urb->transfer_buffer_length - urb->actual_length); } diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c index e106e9d48d4a..a949259f18b9 100644 --- a/drivers/usb/host/sl811-hcd.c +++ b/drivers/usb/host/sl811-hcd.c @@ -230,7 +230,7 @@ static void in_packet( writeb(usb_pipedevice(urb->pipe), data_reg); sl811_write(sl811, bank + SL11H_HOSTCTLREG, control); - ep->length = min((int)len, + ep->length = min_t(u32, len, 
urb->transfer_buffer_length - urb->actual_length); PACKET("IN%s/%d qh%p len%d\n", ep->nak_count ? "/retry" : "", !!usb_gettoggle(urb->dev, ep->epnum, 0), ep, len); @@ -255,7 +255,7 @@ static void out_packet( buf = urb->transfer_buffer + urb->actual_length; prefetch(buf); - len = min((int)ep->maxpacket, + len = min_t(u32, ep->maxpacket, urb->transfer_buffer_length - urb->actual_length); if (!(control & SL11H_HCTLMASK_ISOCH) diff --git a/drivers/usb/misc/ftdi-elan.c b/drivers/usb/misc/ftdi-elan.c index 79a7668ef264..9d0675ed0d4c 100644 --- a/drivers/usb/misc/ftdi-elan.c +++ b/drivers/usb/misc/ftdi-elan.c @@ -1568,7 +1568,7 @@ static int ftdi_elan_edset_input(struct usb_ftdi *ftdi, u8 ed_number, struct u132_target *target = &ftdi->target[ed]; struct u132_command *command = &ftdi->command[ COMMAND_MASK & ftdi->command_next]; - int remaining_length = urb->transfer_buffer_length - + u32 remaining_length = urb->transfer_buffer_length - urb->actual_length; command->header = 0x82 | (ed << 5); if (remaining_length == 0) { @@ -1702,7 +1702,7 @@ static int ftdi_elan_edset_output(struct usb_ftdi *ftdi, u8 ed_number, | (address << 0); command->width = usb_maxpacket(urb->dev, urb->pipe, usb_pipeout(urb->pipe)); - command->follows = min(1024, + command->follows = min_t(u32, 1024, urb->transfer_buffer_length - urb->actual_length); command->value = 0; @@ -1766,7 +1766,7 @@ static int ftdi_elan_edset_single(struct usb_ftdi *ftdi, u8 ed_number, mutex_lock(&ftdi->u132_lock); command_size = ftdi->command_next - ftdi->command_head; if (command_size < COMMAND_SIZE) { - int remaining_length = urb->transfer_buffer_length - + u32 remaining_length = urb->transfer_buffer_length - urb->actual_length; struct u132_target *target = &ftdi->target[ed]; struct u132_command *command = &ftdi->command[ diff --git a/include/linux/usb.h b/include/linux/usb.h index 0c05ff621192..db8808e05a2a 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1177,7 +1177,7 @@ struct urb { unsigned int transfer_flags; /* (in) URB_SHORT_NOT_OK | ...*/ void *transfer_buffer; /* (in) associated data buffer */ dma_addr_t transfer_dma; /* (in) dma addr for transfer_buffer */ - int transfer_buffer_length; /* (in) data buffer length */ + u32 transfer_buffer_length; /* (in) data buffer length */ int actual_length; /* (return) actual transfer length */ unsigned char *setup_packet; /* (in) setup packet (control only) */ dma_addr_t setup_dma; /* (in) dma addr for setup_packet */ -- cgit v1.2.3-71-gd317 From 8c209e6782ca0e3046803fc04a5ac01c8c10437a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 6 Mar 2009 21:31:03 -0800 Subject: USB: make actual_length in struct urb field u32 actual_length should also be a u32 and not a signed value. This patch changes this field to be 'u32' to prevent any potential negative conversion and comparison errors. This triggered a few compiler warning messages when these fields were being used with the min macro, so they have also been fixed up in this patch. 
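As a standalone illustration (a small userspace sketch, not part of the patch), this is the class of signed/unsigned mix-up that the u32 fields and the switch from min() to min_t() are meant to keep out of the USB core:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t actual_length = 512;	/* URB lengths are now u32 */
	int remaining = -4;		/* a buggy, negative byte count */

	/* The usual arithmetic conversions promote 'remaining' to unsigned,
	 * so -4 compares as a huge value and the test that reads as true is
	 * actually false; min_t(u32, ...) makes the intended type explicit. */
	if (remaining < actual_length)
		printf("check passed as written\n");
	else
		printf("after promotion -4 becomes %u, check fails\n",
		       (unsigned int)remaining);
	return 0;
}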
Cc: Roel Kluin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ftdi_sio.c | 2 +- include/linux/usb.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index adeb23fb8003..dcc87aaa8628 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1947,7 +1947,7 @@ static void ftdi_process_read(struct work_struct *work) priv->prev_status = new_status; } - length = min(PKTSZ, urb->actual_length-packet_offset)-2; + length = min_t(u32, PKTSZ, urb->actual_length-packet_offset)-2; if (length < 0) { dev_err(&port->dev, "%s - bad packet length: %d\n", __func__, length+2); diff --git a/include/linux/usb.h b/include/linux/usb.h index db8808e05a2a..c6b2ab41b908 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1178,7 +1178,7 @@ struct urb { void *transfer_buffer; /* (in) associated data buffer */ dma_addr_t transfer_dma; /* (in) dma addr for transfer_buffer */ u32 transfer_buffer_length; /* (in) data buffer length */ - int actual_length; /* (return) actual transfer length */ + u32 actual_length; /* (return) actual transfer length */ unsigned char *setup_packet; /* (in) setup packet (control only) */ dma_addr_t setup_dma; /* (in) dma addr for setup_packet */ int start_frame; /* (modify) start frame (ISO) */ -- cgit v1.2.3-71-gd317 From 64a3a25f440c65510cb0d15080dcd2f0032d6051 Mon Sep 17 00:00:00 2001 From: "D.J. Capelis" Date: Wed, 4 Mar 2009 10:27:52 -0800 Subject: USB: pedantic: spelling correction in comment for ch9.h Just noticed this during a grep, figured I might as well send it in. From: D.J. Capelis Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/ch9.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h index d9d54803dbcb..b145119a90da 100644 --- a/include/linux/usb/ch9.h +++ b/include/linux/usb/ch9.h @@ -102,7 +102,7 @@ #define USB_REQ_LOOPBACK_DATA_READ 0x16 #define USB_REQ_SET_INTERFACE_DS 0x17 -/* The Link Power Mangement (LPM) ECN defines USB_REQ_TEST_AND_SET command, +/* The Link Power Management (LPM) ECN defines USB_REQ_TEST_AND_SET command, * used by hubs to put ports into a new L1 suspend state, except that it * forgot to define its number ... */ -- cgit v1.2.3-71-gd317 From 8942939a6c83f34615de5ae041cc9ca846923f94 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 19 Mar 2009 14:14:17 -0700 Subject: USB: gadget: composite device-level suspend/resume hooks Address one open question in the composite gadget framework: Yes, we should have device-level suspend/resume callbacks in addition to the function-level ones. We have at least one scenario (with gadget zero in OTG test mode) that's awkward to handle without it. Signed-off-by: David Brownell Cc: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/composite.c | 8 ++++++-- include/linux/usb/composite.h | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 40f1da77a006..59e85234fa0a 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1014,7 +1014,7 @@ composite_suspend(struct usb_gadget *gadget) struct usb_composite_dev *cdev = get_gadget_data(gadget); struct usb_function *f; - /* REVISIT: should we have config and device level + /* REVISIT: should we have config level * suspend/resume callbacks? 
*/ DBG(cdev, "suspend\n"); @@ -1024,6 +1024,8 @@ composite_suspend(struct usb_gadget *gadget) f->suspend(f); } } + if (composite->suspend) + composite->suspend(cdev); } static void @@ -1032,10 +1034,12 @@ composite_resume(struct usb_gadget *gadget) struct usb_composite_dev *cdev = get_gadget_data(gadget); struct usb_function *f; - /* REVISIT: should we have config and device level + /* REVISIT: should we have config level * suspend/resume callbacks? */ DBG(cdev, "resume\n"); + if (composite->resume) + composite->resume(cdev); if (cdev->config) { list_for_each_entry(f, &cdev->config->functions, list) { if (f->resume) diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 935c380ffe47..acd7b0f06c8a 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -244,6 +244,10 @@ int usb_add_config(struct usb_composite_dev *, * value; it should return zero on successful initialization. * @unbind: Reverses @bind(); called as a side effect of unregistering * this driver. + * @suspend: Notifies when the host stops sending USB traffic, + * after function notifications + * @resume: Notifies configuration when the host restarts USB traffic, + * before function notifications * * Devices default to reporting self powered operation. Devices which rely * on bus powered operation should report this in their @bind() method. @@ -268,6 +272,10 @@ struct usb_composite_driver { int (*bind)(struct usb_composite_dev *); int (*unbind)(struct usb_composite_dev *); + + /* global suspend hooks */ + void (*suspend)(struct usb_composite_dev *); + void (*resume)(struct usb_composite_dev *); }; extern int usb_composite_register(struct usb_composite_driver *); -- cgit v1.2.3-71-gd317 From 38938bfe3489394e2eed5e40c9bb8f66a2ce1405 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Mar 2009 16:37:55 -0700 Subject: netlink: add NETLINK_NO_ENOBUFS socket flag This patch adds the NETLINK_NO_ENOBUFS socket flag. This flag can be used by unicast and broadcast listeners to avoid receiving ENOBUFS errors. Generally speaking, ENOBUFS errors are useful to notify two things to the listener: a) You may increase the receiver buffer size via setsockopt(). b) You have lost messages, you may be out of sync. In some cases, ignoring ENOBUFS errors can be useful. For example: a) nfnetlink_queue: this subsystem does not have any sort of resync method and you can decide to ignore ENOBUFS once you have set a given buffer size. b) ctnetlink: you can use this together with the socket flag NETLINK_BROADCAST_SEND_ERROR to stop getting ENOBUFS errors as you do not need to resync (packets whose event are not delivered are drop to provide reliable logging and state-synchronization). Moreover, the use of NETLINK_NO_ENOBUFS also reduces a "go up, go down" effect in terms of performance which is due to the netlink congestion control when the listener cannot back off. The effect is the following: 1) throughput rate goes up and netlink messages are inserted in the receiver buffer. 2) Then, netlink buffer fills and overruns (set on nlk->state bit 0). 3) While the listener empties the receiver buffer, netlink keeps dropping messages. Thus, throughput goes dramatically down. 4) Then, once the listener has emptied the buffer (nlk->state bit 0 is set off), goto step 1. This effect is easy to trigger with netlink broadcast under heavy load, and it is more noticeable when using a big receiver buffer. You can find some results in [1] that show this problem. 
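A hypothetical userspace sketch (not part of the patch) of a listener that prefers silently dropped messages over ENOBUFS errors, enabling the new option on a netlink socket:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif
#ifndef NETLINK_NO_ENOBUFS
#define NETLINK_NO_ENOBUFS 5		/* value introduced by this patch */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return 1;
	/* Receive-buffer overruns no longer raise ENOBUFS on this socket;
	 * dropped messages are only counted in sk_drops (/proc/net/netlink). */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_NO_ENOBUFS,
		       &one, sizeof(one)) < 0)
		perror("setsockopt(NETLINK_NO_ENOBUFS)");
	return 0;
}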
[1] http://1984.lsi.us.es/linux/netlink/ This patch also includes the use of sk_drop to account the number of netlink messages drop due to overrun. This value is shown in /proc/net/netlink. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netlink.h | 1 + net/netlink/af_netlink.c | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1e6bf995435c..5ba398e90304 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -104,6 +104,7 @@ struct nlmsgerr #define NETLINK_DROP_MEMBERSHIP 2 #define NETLINK_PKTINFO 3 #define NETLINK_BROADCAST_ERROR 4 +#define NETLINK_NO_ENOBUFS 5 struct nl_pktinfo { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b73d4e61c5ac..8b6bbb3032b0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -86,6 +86,7 @@ struct netlink_sock { #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 #define NETLINK_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_RECV_NO_ENOBUFS 0x8 static inline struct netlink_sock *nlk_sk(struct sock *sk) { @@ -717,10 +718,15 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, static void netlink_overrun(struct sock *sk) { - if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); + struct netlink_sock *nlk = nlk_sk(sk); + + if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } } + atomic_inc(&sk->sk_drops); } static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) @@ -1182,6 +1188,15 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; err = 0; break; + case NETLINK_NO_ENOBUFS: + if (val) { + nlk->flags |= NETLINK_RECV_NO_ENOBUFS; + clear_bit(0, &nlk->state); + wake_up_interruptible(&nlk->wait); + } else + nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1224,6 +1239,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_NO_ENOBUFS: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 
1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1879,12 +1904,12 @@ static int netlink_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "sk Eth Pid Groups " - "Rmem Wmem Dump Locks\n"); + "Rmem Wmem Dump Locks Drops\n"); else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %d\n", + seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %-8d %-8d\n", s, s->sk_protocol, nlk->pid, @@ -1892,7 +1917,8 @@ static int netlink_seq_show(struct seq_file *seq, void *v) atomic_read(&s->sk_rmem_alloc), atomic_read(&s->sk_wmem_alloc), nlk->cb, - atomic_read(&s->sk_refcnt) + atomic_read(&s->sk_refcnt), + atomic_read(&s->sk_drops) ); } -- cgit v1.2.3-71-gd317 From 2c0f3e96f3fc7bbd1cb3caa601f19cf030c2b958 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 24 Mar 2009 16:38:23 -0700 Subject: wimax: struct device - replace bus_id with dev_name(), dev_set_name() Cc: inaky.perez-gonzalez@intel.com Cc: linux-wimax@intel.com Acked-by: Greg Kroah-Hartman Signed-off-by: Kay Sievers --- drivers/net/wimax/i2400m/driver.c | 2 +- drivers/net/wimax/i2400m/usb-notif.c | 2 +- include/linux/wimax/debug.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index e80a0b65a754..d58b971faa64 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c @@ -613,7 +613,7 @@ int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags) d_fnstart(3, dev, "(i2400m %p)\n", i2400m); snprintf(wimax_dev->name, sizeof(wimax_dev->name), - "i2400m-%s:%s", dev->bus->name, dev->bus_id); + "i2400m-%s:%s", dev->bus->name, dev_name(dev)); i2400m->bm_cmd_buf = kzalloc(I2400M_BM_CMD_BUF_SIZE, GFP_KERNEL); if (i2400m->bm_cmd_buf == NULL) { diff --git a/drivers/net/wimax/i2400m/usb-notif.c b/drivers/net/wimax/i2400m/usb-notif.c index 9702c22b2497..0528879f6d39 100644 --- a/drivers/net/wimax/i2400m/usb-notif.c +++ b/drivers/net/wimax/i2400m/usb-notif.c @@ -102,7 +102,7 @@ int i2400mu_notification_grok(struct i2400mu *i2400mu, const void *buf, dev_err(dev, "HW BUG? Unknown/unexpected data in notification " "message (%zu bytes)\n", buf_len); snprintf(prefix, sizeof(prefix), "%s %s: ", - dev_driver_string(dev) , dev->bus_id); + dev_driver_string(dev) , dev_name(dev)); if (buf_len > 64) { print_hex_dump(KERN_ERR, prefix, DUMP_PREFIX_OFFSET, 8, 4, buf, 64, 0); diff --git a/include/linux/wimax/debug.h b/include/linux/wimax/debug.h index ba0c49399a83..c703e0340423 100644 --- a/include/linux/wimax/debug.h +++ b/include/linux/wimax/debug.h @@ -178,7 +178,7 @@ void __d_head(char *head, size_t head_size, WARN_ON(1); } else snprintf(head, head_size, "%s %s: ", - dev_driver_string(dev), dev->bus_id); + dev_driver_string(dev), dev_name(dev)); } -- cgit v1.2.3-71-gd317 From 1fa5ae857bb14f6046205171d98506d8112dd74e Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Sun, 25 Jan 2009 15:17:37 +0100 Subject: driver core: get rid of struct device's bus_id string array Now that all users of bus_id is gone, we can remove it from struct device. 
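A minimal kernel-style sketch (not taken from this patch; the function and name format are made up) of driver code that used to touch dev->bus_id directly and now goes through the accessors:

#include <linux/device.h>
#include <linux/kernel.h>

static int example_register(struct device *dev, int index)
{
	int err;

	err = dev_set_name(dev, "example%d", index);	/* was: sprintf(dev->bus_id, ...) */
	if (err)
		return err;

	err = device_register(dev);
	if (err)
		return err;

	dev_info(dev, "registered as %s\n", dev_name(dev));	/* was: dev->bus_id */
	return 0;
}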
Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 39 +++++++++++++++++++-------------------- include/linux/device.h | 4 +--- include/linux/kobject.h | 2 ++ lib/kobject.c | 2 +- 4 files changed, 23 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index f3eae630e589..059966b617f6 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -777,17 +777,12 @@ static void device_remove_class_symlinks(struct device *dev) int dev_set_name(struct device *dev, const char *fmt, ...) { va_list vargs; - char *s; + int err; va_start(vargs, fmt); - vsnprintf(dev->bus_id, sizeof(dev->bus_id), fmt, vargs); + err = kobject_set_name_vargs(&dev->kobj, fmt, vargs); va_end(vargs); - - /* ewww... some of these buggers have / in the name... */ - while ((s = strchr(dev->bus_id, '/'))) - *s = '!'; - - return 0; + return err; } EXPORT_SYMBOL_GPL(dev_set_name); @@ -864,12 +859,17 @@ int device_add(struct device *dev) if (!dev) goto done; - /* Temporarily support init_name if it is set. - * It will override bus_id for now */ - if (dev->init_name) - dev_set_name(dev, "%s", dev->init_name); + /* + * for statically allocated devices, which should all be converted + * some day, we need to initialize the name. We prevent reading back + * the name, and force the use of dev_name() + */ + if (dev->init_name) { + dev_set_name(dev, dev->init_name); + dev->init_name = NULL; + } - if (!strlen(dev->bus_id)) + if (!dev_name(dev)) goto done; pr_debug("device: '%s': %s\n", dev_name(dev), __func__); @@ -1348,7 +1348,10 @@ struct device *device_create_vargs(struct class *class, struct device *parent, dev->release = device_create_release; dev_set_drvdata(dev, drvdata); - vsnprintf(dev->bus_id, BUS_ID_SIZE, fmt, args); + retval = kobject_set_name_vargs(&dev->kobj, fmt, args); + if (retval) + goto error; + retval = device_register(dev); if (retval) goto error; @@ -1452,19 +1455,15 @@ int device_rename(struct device *dev, char *new_name) old_class_name = make_class_name(dev->class->name, &dev->kobj); #endif - old_device_name = kmalloc(BUS_ID_SIZE, GFP_KERNEL); + old_device_name = kstrdup(dev_name(dev), GFP_KERNEL); if (!old_device_name) { error = -ENOMEM; goto out; } - strlcpy(old_device_name, dev->bus_id, BUS_ID_SIZE); - strlcpy(dev->bus_id, new_name, BUS_ID_SIZE); error = kobject_rename(&dev->kobj, new_name); - if (error) { - strlcpy(dev->bus_id, old_device_name, BUS_ID_SIZE); + if (error) goto out; - } #ifdef CONFIG_SYSFS_DEPRECATED if (old_class_name) { diff --git a/include/linux/device.h b/include/linux/device.h index 47f343c7bdda..d5706c448bcb 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -374,7 +374,6 @@ struct device { struct device *parent; struct kobject kobj; - char bus_id[BUS_ID_SIZE]; /* position on parent bus */ unsigned uevent_suppress:1; const char *init_name; /* initial name of the device */ struct device_type *type; @@ -427,8 +426,7 @@ struct device { static inline const char *dev_name(const struct device *dev) { - /* will be changed into kobject_name(&dev->kobj) in the near future */ - return dev->bus_id; + return kobject_name(&dev->kobj); } extern int dev_set_name(struct device *dev, const char *name, ...) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 5437ac0276e2..c9c214d7bba2 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -72,6 +72,8 @@ struct kobject { extern int kobject_set_name(struct kobject *kobj, const char *name, ...) 
__attribute__((format(printf, 2, 3))); +extern int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, + va_list vargs); static inline const char *kobject_name(const struct kobject *kobj) { diff --git a/lib/kobject.c b/lib/kobject.c index 0487d1f64806..a6dec32f2ddd 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -212,7 +212,7 @@ static int kobject_add_internal(struct kobject *kobj) * @fmt: format string used to build the name * @vargs: vargs to format the string. */ -static int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, +int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs) { const char *old_name = kobj->name; -- cgit v1.2.3-71-gd317 From 57fee4a58fe802272742caae248872c392a60670 Mon Sep 17 00:00:00 2001 From: Eric Miao Date: Wed, 4 Feb 2009 11:52:40 +0800 Subject: platform: introduce module id table for platform devices Now platform_device is being widely used on SoC processors where the peripherals are attached to the system bus, which is simple enough. However, silicon IPs for these SoCs are usually shared heavily across a family of processors, even products from different companies. This makes the original simple driver name based matching insufficient, or simply not straight-forward. Introduce a module id table for platform devices, and makes it clear that a platform driver is able to support some shared IP and handle slight differences across different platforms (by 'driver_data'). Module alias is handled automatically when a MODULE_DEVICE_TABLE() is defined. To not disturb the current platform drivers too much, the matched id entry is recorded and can be retrieved by platform_get_device_id(). Signed-off-by: Eric Miao Cc: Kay Sievers Cc: Ben Dooks Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 23 ++++++++++++++++++++++- include/linux/mod_devicetable.h | 9 +++++++++ include/linux/platform_device.h | 6 ++++++ scripts/mod/file2alias.c | 12 ++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 62a8768d96b3..ec993aa6a2ca 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -584,10 +584,25 @@ static int platform_uevent(struct device *dev, struct kobj_uevent_env *env) { struct platform_device *pdev = to_platform_device(dev); - add_uevent_var(env, "MODALIAS=platform:%s", pdev->name); + add_uevent_var(env, "MODALIAS=%s%s", PLATFORM_MODULE_PREFIX, + (pdev->id_entry) ? pdev->id_entry->name : pdev->name); return 0; } +static const struct platform_device_id *platform_match_id( + struct platform_device_id *id, + struct platform_device *pdev) +{ + while (id->name[0]) { + if (strcmp(pdev->name, id->name) == 0) { + pdev->id_entry = id; + return id; + } + id++; + } + return NULL; +} + /** * platform_match - bind platform device to platform driver. * @dev: device. 
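For illustration, a rough driver sketch (not part of the patch; the device names and driver_data values are invented, and registration via platform_driver_register() is omitted) showing how one driver would cover two variants of the same IP with the new id table:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/platform_device.h>

static struct platform_device_id example_uart_ids[] = {
	{ "soca-uart", 0 },	/* driver_data selects the variant */
	{ "socb-uart", 1 },
	{ }			/* terminating entry */
};
MODULE_DEVICE_TABLE(platform, example_uart_ids);

static int example_uart_probe(struct platform_device *pdev)
{
	const struct platform_device_id *id = platform_get_device_id(pdev);

	pr_info("%s: variant %lu\n", pdev->name,
		(unsigned long)id->driver_data);
	return 0;
}

static struct platform_driver example_uart_driver = {
	.probe		= example_uart_probe,
	.id_table	= example_uart_ids,
	.driver		= {
		.name	= "soca-uart",	/* name match remains the fall-back */
	},
};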
@@ -604,7 +619,13 @@ static int platform_uevent(struct device *dev, struct kobj_uevent_env *env) static int platform_match(struct device *dev, struct device_driver *drv) { struct platform_device *pdev = to_platform_device(dev); + struct platform_driver *pdrv = to_platform_driver(drv); + + /* match against the id table first */ + if (pdrv->id_table) + return platform_match_id(pdrv->id_table, pdev) != NULL; + /* fall-back to driver name match */ return (strcmp(pdev->name, drv->name) == 0); } diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index fde86671f48f..1bf5900ffe43 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -454,4 +454,13 @@ struct dmi_system_id { #define DMI_MATCH(a, b) { a, b } +#define PLATFORM_NAME_SIZE 20 +#define PLATFORM_MODULE_PREFIX "platform:" + +struct platform_device_id { + char name[PLATFORM_NAME_SIZE]; + kernel_ulong_t driver_data + __attribute__((aligned(sizeof(kernel_ulong_t)))); +}; + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 9a342699c607..76aef7be32ab 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -12,6 +12,7 @@ #define _PLATFORM_DEVICE_H_ #include +#include struct platform_device { const char * name; @@ -19,8 +20,12 @@ struct platform_device { struct device dev; u32 num_resources; struct resource * resource; + + struct platform_device_id *id_entry; }; +#define platform_get_device_id(pdev) ((pdev)->id_entry) + #define to_platform_device(x) container_of((x), struct platform_device, dev) extern int platform_device_register(struct platform_device *); @@ -56,6 +61,7 @@ struct platform_driver { int (*resume_early)(struct platform_device *); int (*resume)(struct platform_device *); struct device_driver driver; + struct platform_device_id *id_table; }; extern int platform_driver_register(struct platform_driver *); diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 4eea60b1693e..a3344285ccf4 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -710,6 +710,14 @@ static int do_dmi_entry(const char *filename, struct dmi_system_id *id, strcat(alias, ":"); return 1; } + +static int do_platform_entry(const char *filename, + struct platform_device_id *id, char *alias) +{ + sprintf(alias, PLATFORM_MODULE_PREFIX "%s", id->name); + return 1; +} + /* Ignore any prefix, eg. some architectures prepend _ */ static inline int sym_is(const char *symbol, const char *name) { @@ -849,6 +857,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, do_table(symval, sym->st_size, sizeof(struct dmi_system_id), "dmi", do_dmi_entry, mod); + else if (sym_is(symname, "__mod_platform_device_table")) + do_table(symval, sym->st_size, + sizeof(struct platform_device_id), "platform", + do_platform_entry, mod); free(zeros); } -- cgit v1.2.3-71-gd317 From 8205779114e8f612549d191f8e151526a74ab9f2 Mon Sep 17 00:00:00 2001 From: "Hans J. Koch" Date: Wed, 7 Jan 2009 00:15:39 +0100 Subject: UIO: Add name attributes for mappings and port regions If a UIO device has several memory mappings, it can be difficult for userspace to find the right one. The situation becomes even worse if the UIO driver can handle different versions of a card that have different numbers of mappings. Benedikt Spranger has such cards and pointed this out to me. Thanks, Bene! To address this problem, this patch adds "name" sysfs attributes for each mapping. 
Userspace can use these to clearly identify each mapping. The name string is optional. If a driver doesn't set it, an empty string will be returned, so this patch won't break existing drivers. The same problem exists for port region information, so a "name" attribute is added there, too. Signed-off-by: Hans J. Koch Signed-off-by: Greg Kroah-Hartman --- Documentation/DocBook/uio-howto.tmpl | 29 +++++++++++++++++++++++++---- drivers/uio/uio.c | 22 ++++++++++++++++++++++ include/linux/uio_driver.h | 4 ++++ 3 files changed, 51 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl index 52e1b79ce0e6..8f6e3b2403c7 100644 --- a/Documentation/DocBook/uio-howto.tmpl +++ b/Documentation/DocBook/uio-howto.tmpl @@ -41,6 +41,13 @@ GPL version 2. + + 0.8 + 2008-12-24 + hjk + Added name attributes in mem and portio sysfs directories. + + 0.7 2008-12-23 @@ -303,10 +310,17 @@ interested in translating it, please email me appear if the size of the mapping is not 0. - Each mapX/ directory contains two read-only files - that show start address and size of the memory: + Each mapX/ directory contains four read-only files + that show attributes of the memory: + + + name: A string identifier for this mapping. This + is optional, the string can be empty. Drivers can set this to make it + easier for userspace to find the correct mapping. + + addr: The address of memory that can be mapped. @@ -366,10 +380,17 @@ offset = N * getpagesize(); /sys/class/uio/uioX/portio/. - Each portX/ directory contains three read-only - files that show start, size, and type of the port region: + Each portX/ directory contains four read-only + files that show name, start, size, and type of the port region: + + + name: A string identifier for this port region. + The string is optional and can be empty. Drivers can set it to make it + easier for userspace to find a certain port region. + + start: The first port of this region. 
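A hypothetical driver fragment (not part of the patch; addresses, names and sizes are invented) showing how a UIO driver would label its regions so userspace can tell them apart by reading the new name files:

#include <linux/platform_device.h>
#include <linux/uio_driver.h>

static struct uio_info example_info = {
	.name    = "example-uio",
	.version = "0.1",
};

static int example_uio_probe(struct platform_device *pdev)
{
	example_info.mem[0].name    = "registers";	/* maps/map0/name */
	example_info.mem[0].addr    = 0xfe000000;
	example_info.mem[0].size    = 0x1000;
	example_info.mem[0].memtype = UIO_MEM_PHYS;

	example_info.port[0].name     = "control-ports";	/* portio/port0/name */
	example_info.port[0].start    = 0x3f8;
	example_info.port[0].size     = 8;
	example_info.port[0].porttype = UIO_PORT_X86;

	return uio_register_device(&pdev->dev, &example_info);
}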
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 4ca85a113aa2..68a496557788 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -61,6 +61,14 @@ struct uio_map { }; #define to_map(map) container_of(map, struct uio_map, kobj) +static ssize_t map_name_show(struct uio_mem *mem, char *buf) +{ + if (unlikely(!mem->name)) + mem->name = ""; + + return sprintf(buf, "%s\n", mem->name); +} + static ssize_t map_addr_show(struct uio_mem *mem, char *buf) { return sprintf(buf, "0x%lx\n", mem->addr); @@ -82,6 +90,8 @@ struct map_sysfs_entry { ssize_t (*store)(struct uio_mem *, const char *, size_t); }; +static struct map_sysfs_entry name_attribute = + __ATTR(name, S_IRUGO, map_name_show, NULL); static struct map_sysfs_entry addr_attribute = __ATTR(addr, S_IRUGO, map_addr_show, NULL); static struct map_sysfs_entry size_attribute = @@ -90,6 +100,7 @@ static struct map_sysfs_entry offset_attribute = __ATTR(offset, S_IRUGO, map_offset_show, NULL); static struct attribute *attrs[] = { + &name_attribute.attr, &addr_attribute.attr, &size_attribute.attr, &offset_attribute.attr, @@ -133,6 +144,14 @@ struct uio_portio { }; #define to_portio(portio) container_of(portio, struct uio_portio, kobj) +static ssize_t portio_name_show(struct uio_port *port, char *buf) +{ + if (unlikely(!port->name)) + port->name = ""; + + return sprintf(buf, "%s\n", port->name); +} + static ssize_t portio_start_show(struct uio_port *port, char *buf) { return sprintf(buf, "0x%lx\n", port->start); @@ -159,6 +178,8 @@ struct portio_sysfs_entry { ssize_t (*store)(struct uio_port *, const char *, size_t); }; +static struct portio_sysfs_entry portio_name_attribute = + __ATTR(name, S_IRUGO, portio_name_show, NULL); static struct portio_sysfs_entry portio_start_attribute = __ATTR(start, S_IRUGO, portio_start_show, NULL); static struct portio_sysfs_entry portio_size_attribute = @@ -167,6 +188,7 @@ static struct portio_sysfs_entry portio_porttype_attribute = __ATTR(porttype, S_IRUGO, portio_porttype_show, NULL); static struct attribute *portio_attrs[] = { + &portio_name_attribute.attr, &portio_start_attribute.attr, &portio_size_attribute.attr, &portio_porttype_attribute.attr, diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index a0bb6bd2e5c1..5dcc9ff72f69 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -22,6 +22,7 @@ struct uio_map; /** * struct uio_mem - description of a UIO memory region + * @name: name of the memory region for identification * @addr: address of the device's memory * @size: size of IO * @memtype: type of memory addr points to @@ -29,6 +30,7 @@ struct uio_map; * @map: for use by the UIO core only. */ struct uio_mem { + const char *name; unsigned long addr; unsigned long size; int memtype; @@ -42,12 +44,14 @@ struct uio_portio; /** * struct uio_port - description of a UIO port region + * @name: name of the port region for identification * @start: start of port region * @size: size of port region * @porttype: type of port (see UIO_PORT_* below) * @portio: for use by the UIO core only. */ struct uio_port { + const char *name; unsigned long start; unsigned long size; int porttype; -- cgit v1.2.3-71-gd317 From b23530ebc339c4092ae2c9f37341a5398fea8b89 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 21 Feb 2009 16:45:07 +0800 Subject: driver core: remove polling for driver_probe_done(v5) This patch removes 100ms polling for driver_probe_done in wait_for_device_probe(), and uses wait_event() instead. Removing polling in fs initialization may lead to a faster boot. 
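(For illustration, a hypothetical late-boot caller, not part of this patch: a typical call site stays the same, it simply sleeps on the probe waitqueue now instead of polling.)

#include <linux/device.h>
#include <linux/init.h>

static int __init example_late_setup(void)
{
	/* Blocks until probe_count drops to zero, then waits for async probes. */
	wait_for_device_probe();
	/* ... safe to look up devices created during boot-time probing ... */
	return 0;
}
late_initcall(example_late_setup);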
This patch also changes the return type of wait_for_device_done() from int to void. This patch is against Arjan's patch in linux-next tree. Signed-off-by: Ming Lei Acked-by: Cornelia Huck Reviewed-by: Arjan van de Ven Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 8 ++------ include/linux/device.h | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 3f32df7ed373..0dfd08c15921 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -172,16 +172,12 @@ int driver_probe_done(void) /** * wait_for_device_probe * Wait for device probing to be completed. - * - * Note: this function polls at 100 msec intervals. */ -int wait_for_device_probe(void) +void wait_for_device_probe(void) { /* wait for the known devices to complete their probing */ - while (driver_probe_done() != 0) - msleep(100); + wait_event(probe_waitqueue, atomic_read(&probe_count) == 0); async_synchronize_full(); - return 0; } /** diff --git a/include/linux/device.h b/include/linux/device.h index d5706c448bcb..c56b154a0bf4 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -147,7 +147,7 @@ extern void put_driver(struct device_driver *drv); extern struct device_driver *driver_find(const char *name, struct bus_type *bus); extern int driver_probe_done(void); -extern int wait_for_device_probe(void); +extern void wait_for_device_probe(void); /* sysfs interface for exporting driver attributes */ -- cgit v1.2.3-71-gd317 From fb069a5d132fb926ed17af3211a114ac7cf27d7a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Dec 2008 12:23:36 -0800 Subject: driver core: create a private portion of struct device This is to be used to move things out of struct device that no code outside of the driver core should ever touch. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 12 ++++++++++++ drivers/base/core.c | 9 +++++++++ include/linux/device.h | 3 +++ 3 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index ca2b0376685b..62a2cb5e1780 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -63,6 +63,18 @@ struct class_private { #define to_class(obj) \ container_of(obj, struct class_private, class_subsys.kobj) +/** + * struct device_private - structure to hold the private to the driver core portions of the device structure. + * + * @device - pointer back to the struct class that this structure is + * associated with. + * + * Nothing outside of the driver core should ever touch these fields. 
+ */ +struct device_private { + struct device *device; +}; + /* initialisation functions */ extern int devices_init(void); extern int buses_init(void); diff --git a/drivers/base/core.c b/drivers/base/core.c index 059966b617f6..16d859910104 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -109,6 +109,7 @@ static struct sysfs_ops dev_sysfs_ops = { static void device_release(struct kobject *kobj) { struct device *dev = to_dev(kobj); + struct device_private *p = dev->p; if (dev->release) dev->release(dev); @@ -120,6 +121,7 @@ static void device_release(struct kobject *kobj) WARN(1, KERN_ERR "Device '%s' does not have a release() " "function, it is broken and must be fixed.\n", dev_name(dev)); + kfree(p); } static struct kobj_type device_ktype = { @@ -859,6 +861,13 @@ int device_add(struct device *dev) if (!dev) goto done; + dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL); + if (!dev->p) { + error = -ENOMEM; + goto done; + } + dev->p->device = dev; + /* * for statically allocated devices, which should all be converted * some day, we need to initialize the name. We prevent reading back diff --git a/include/linux/device.h b/include/linux/device.h index c56b154a0bf4..4cf063fea2a9 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -28,6 +28,7 @@ #define BUS_ID_SIZE 20 struct device; +struct device_private; struct device_driver; struct driver_private; struct class; @@ -373,6 +374,8 @@ struct device { struct klist_node knode_bus; struct device *parent; + struct device_private *p; + struct kobject kobj; unsigned uevent_suppress:1; const char *init_name; /* initial name of the device */ -- cgit v1.2.3-71-gd317 From f791b8c836307b58cbf62133a6a772ed1a92fb33 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Dec 2008 12:24:56 -0800 Subject: driver core: move klist_children into private structure Nothing outside of the driver core should ever touch klist_children, or knode_parent, so move them out of the public eye. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 6 ++++++ drivers/base/core.c | 39 +++++++++++++++++++++++++-------------- include/linux/device.h | 2 -- 3 files changed, 31 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index 62a2cb5e1780..7c4fafc314c4 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -66,14 +66,20 @@ struct class_private { /** * struct device_private - structure to hold the private to the driver core portions of the device structure. * + * @klist_children - klist containing all children of this device + * @knode_parent - node in sibling list * @device - pointer back to the struct class that this structure is * associated with. * * Nothing outside of the driver core should ever touch these fields. 
*/ struct device_private { + struct klist klist_children; + struct klist_node knode_parent; struct device *device; }; +#define to_device_private_parent(obj) \ + container_of(obj, struct device_private, knode_parent) /* initialisation functions */ extern int devices_init(void); diff --git a/drivers/base/core.c b/drivers/base/core.c index 16d859910104..a90f56f64d6f 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -509,14 +509,16 @@ EXPORT_SYMBOL_GPL(device_schedule_callback_owner); static void klist_children_get(struct klist_node *n) { - struct device *dev = container_of(n, struct device, knode_parent); + struct device_private *p = to_device_private_parent(n); + struct device *dev = p->device; get_device(dev); } static void klist_children_put(struct klist_node *n) { - struct device *dev = container_of(n, struct device, knode_parent); + struct device_private *p = to_device_private_parent(n); + struct device *dev = p->device; put_device(dev); } @@ -540,8 +542,6 @@ void device_initialize(struct device *dev) { dev->kobj.kset = devices_kset; kobject_init(&dev->kobj, &device_ktype); - klist_init(&dev->klist_children, klist_children_get, - klist_children_put); INIT_LIST_HEAD(&dev->dma_pools); init_MUTEX(&dev->sem); spin_lock_init(&dev->devres_lock); @@ -867,6 +867,8 @@ int device_add(struct device *dev) goto done; } dev->p->device = dev; + klist_init(&dev->p->klist_children, klist_children_get, + klist_children_put); /* * for statically allocated devices, which should all be converted @@ -937,7 +939,8 @@ int device_add(struct device *dev) kobject_uevent(&dev->kobj, KOBJ_ADD); bus_attach_device(dev); if (parent) - klist_add_tail(&dev->knode_parent, &parent->klist_children); + klist_add_tail(&dev->p->knode_parent, + &parent->p->klist_children); if (dev->class) { mutex_lock(&dev->class->p->class_mutex); @@ -1051,7 +1054,7 @@ void device_del(struct device *dev) device_pm_remove(dev); dpm_sysfs_remove(dev); if (parent) - klist_del(&dev->knode_parent); + klist_del(&dev->p->knode_parent); if (MAJOR(dev->devt)) { device_remove_sys_dev_entry(dev); device_remove_file(dev, &devt_attr); @@ -1112,7 +1115,14 @@ void device_unregister(struct device *dev) static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); - return n ? 
container_of(n, struct device, knode_parent) : NULL; + struct device *dev = NULL; + struct device_private *p; + + if (n) { + p = to_device_private_parent(n); + dev = p->device; + } + return dev; } /** @@ -1134,7 +1144,7 @@ int device_for_each_child(struct device *parent, void *data, struct device *child; int error = 0; - klist_iter_init(&parent->klist_children, &i); + klist_iter_init(&parent->p->klist_children, &i); while ((child = next_device(&i)) && !error) error = fn(child, data); klist_iter_exit(&i); @@ -1165,7 +1175,7 @@ struct device *device_find_child(struct device *parent, void *data, if (!parent) return NULL; - klist_iter_init(&parent->klist_children, &i); + klist_iter_init(&parent->p->klist_children, &i); while ((child = next_device(&i))) if (match(child, data) && get_device(child)) break; @@ -1578,9 +1588,10 @@ int device_move(struct device *dev, struct device *new_parent) old_parent = dev->parent; dev->parent = new_parent; if (old_parent) - klist_remove(&dev->knode_parent); + klist_remove(&dev->p->knode_parent); if (new_parent) { - klist_add_tail(&dev->knode_parent, &new_parent->klist_children); + klist_add_tail(&dev->p->knode_parent, + &new_parent->p->klist_children); set_dev_node(dev, dev_to_node(new_parent)); } @@ -1592,11 +1603,11 @@ int device_move(struct device *dev, struct device *new_parent) device_move_class_links(dev, new_parent, old_parent); if (!kobject_move(&dev->kobj, &old_parent->kobj)) { if (new_parent) - klist_remove(&dev->knode_parent); + klist_remove(&dev->p->knode_parent); dev->parent = old_parent; if (old_parent) { - klist_add_tail(&dev->knode_parent, - &old_parent->klist_children); + klist_add_tail(&dev->p->knode_parent, + &old_parent->p->klist_children); set_dev_node(dev, dev_to_node(old_parent)); } } diff --git a/include/linux/device.h b/include/linux/device.h index 4cf063fea2a9..808d808ec696 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -368,8 +368,6 @@ struct device_dma_parameters { }; struct device { - struct klist klist_children; - struct klist_node knode_parent; /* node in sibling list */ struct klist_node knode_driver; struct klist_node knode_bus; struct device *parent; -- cgit v1.2.3-71-gd317 From 8940b4f312dced51b45004819b776ec3aa7fcd5d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Dec 2008 12:25:49 -0800 Subject: driver core: move knode_driver into private structure Nothing outside of the driver core should ever touch knode_driver, so move it out of the public eye. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 4 ++++ drivers/base/dd.c | 13 ++++++++----- drivers/base/driver.c | 13 ++++++++++--- include/linux/device.h | 1 - 4 files changed, 22 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index 7c4fafc314c4..4fc5fd3984cc 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -68,6 +68,7 @@ struct class_private { * * @klist_children - klist containing all children of this device * @knode_parent - node in sibling list + * @knode_driver - node in driver list * @device - pointer back to the struct class that this structure is * associated with. 
* @@ -76,10 +77,13 @@ struct class_private { struct device_private { struct klist klist_children; struct klist_node knode_parent; + struct klist_node knode_driver; struct device *device; }; #define to_device_private_parent(obj) \ container_of(obj, struct device_private, knode_parent) +#define to_device_private_driver(obj) \ + container_of(obj, struct device_private, knode_driver) /* initialisation functions */ extern int devices_init(void); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 0dfd08c15921..f17c3266a0e0 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -30,7 +30,7 @@ static void driver_bound(struct device *dev) { - if (klist_node_attached(&dev->knode_driver)) { + if (klist_node_attached(&dev->p->knode_driver)) { printk(KERN_WARNING "%s: device %s already bound\n", __func__, kobject_name(&dev->kobj)); return; @@ -43,7 +43,7 @@ static void driver_bound(struct device *dev) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_BOUND_DRIVER, dev); - klist_add_tail(&dev->knode_driver, &dev->driver->p->klist_devices); + klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); } static int driver_sysfs_add(struct device *dev) @@ -318,7 +318,7 @@ static void __device_release_driver(struct device *dev) drv->remove(dev); devres_release_all(dev); dev->driver = NULL; - klist_remove(&dev->knode_driver); + klist_remove(&dev->p->knode_driver); } } @@ -348,6 +348,7 @@ EXPORT_SYMBOL_GPL(device_release_driver); */ void driver_detach(struct device_driver *drv) { + struct device_private *dev_prv; struct device *dev; for (;;) { @@ -356,8 +357,10 @@ void driver_detach(struct device_driver *drv) spin_unlock(&drv->p->klist_devices.k_lock); break; } - dev = list_entry(drv->p->klist_devices.k_list.prev, - struct device, knode_driver.n_node); + dev_prv = list_entry(drv->p->klist_devices.k_list.prev, + struct device_private, + knode_driver.n_node); + dev = dev_prv->device; get_device(dev); spin_unlock(&drv->p->klist_devices.k_lock); diff --git a/drivers/base/driver.c b/drivers/base/driver.c index 2889ad57e48b..c51f11bb29ae 100644 --- a/drivers/base/driver.c +++ b/drivers/base/driver.c @@ -19,7 +19,14 @@ static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); - return n ? container_of(n, struct device, knode_driver) : NULL; + struct device *dev = NULL; + struct device_private *dev_prv; + + if (n) { + dev_prv = to_device_private_driver(n); + dev = dev_prv->device; + } + return dev; } /** @@ -42,7 +49,7 @@ int driver_for_each_device(struct device_driver *drv, struct device *start, return -EINVAL; klist_iter_init_node(&drv->p->klist_devices, &i, - start ? &start->knode_driver : NULL); + start ? &start->p->knode_driver : NULL); while ((dev = next_device(&i)) && !error) error = fn(dev, data); klist_iter_exit(&i); @@ -76,7 +83,7 @@ struct device *driver_find_device(struct device_driver *drv, return NULL; klist_iter_init_node(&drv->p->klist_devices, &i, - (start ? &start->knode_driver : NULL)); + (start ? 
&start->p->knode_driver : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; diff --git a/include/linux/device.h b/include/linux/device.h index 808d808ec696..83e241f407be 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -368,7 +368,6 @@ struct device_dma_parameters { }; struct device { - struct klist_node knode_driver; struct klist_node knode_bus; struct device *parent; -- cgit v1.2.3-71-gd317 From ae1b41715ee2aae356fbcca032838b71d70b855f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Dec 2008 12:26:21 -0800 Subject: driver core: move knode_bus into private structure Nothing outside of the driver core should ever touch knode_bus, so move it out of the public eye. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 4 ++++ drivers/base/bus.c | 40 +++++++++++++++++++++++++++------------- include/linux/device.h | 1 - 3 files changed, 31 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index 4fc5fd3984cc..ddc97496db4a 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -69,6 +69,7 @@ struct class_private { * @klist_children - klist containing all children of this device * @knode_parent - node in sibling list * @knode_driver - node in driver list + * @knode_bus - node in bus list * @device - pointer back to the struct class that this structure is * associated with. * @@ -78,12 +79,15 @@ struct device_private { struct klist klist_children; struct klist_node knode_parent; struct klist_node knode_driver; + struct klist_node knode_bus; struct device *device; }; #define to_device_private_parent(obj) \ container_of(obj, struct device_private, knode_parent) #define to_device_private_driver(obj) \ container_of(obj, struct device_private, knode_driver) +#define to_device_private_bus(obj) \ + container_of(obj, struct device_private, knode_bus) /* initialisation functions */ extern int devices_init(void); diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 11463c00451e..dc030f1f00f1 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -253,7 +253,14 @@ static ssize_t store_drivers_probe(struct bus_type *bus, static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); - return n ? container_of(n, struct device, knode_bus) : NULL; + struct device *dev = NULL; + struct device_private *dev_prv; + + if (n) { + dev_prv = to_device_private_bus(n); + dev = dev_prv->device; + } + return dev; } /** @@ -286,7 +293,7 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, return -EINVAL; klist_iter_init_node(&bus->p->klist_devices, &i, - (start ? &start->knode_bus : NULL)); + (start ? &start->p->knode_bus : NULL)); while ((dev = next_device(&i)) && !error) error = fn(dev, data); klist_iter_exit(&i); @@ -320,7 +327,7 @@ struct device *bus_find_device(struct bus_type *bus, return NULL; klist_iter_init_node(&bus->p->klist_devices, &i, - (start ? &start->knode_bus : NULL)); + (start ? 
&start->p->knode_bus : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; @@ -507,7 +514,8 @@ void bus_attach_device(struct device *dev) ret = device_attach(dev); WARN_ON(ret < 0); if (ret >= 0) - klist_add_tail(&dev->knode_bus, &bus->p->klist_devices); + klist_add_tail(&dev->p->knode_bus, + &bus->p->klist_devices); } } @@ -528,8 +536,8 @@ void bus_remove_device(struct device *dev) sysfs_remove_link(&dev->bus->p->devices_kset->kobj, dev_name(dev)); device_remove_attrs(dev->bus, dev); - if (klist_node_attached(&dev->knode_bus)) - klist_del(&dev->knode_bus); + if (klist_node_attached(&dev->p->knode_bus)) + klist_del(&dev->p->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); @@ -831,14 +839,16 @@ static void bus_remove_attrs(struct bus_type *bus) static void klist_devices_get(struct klist_node *n) { - struct device *dev = container_of(n, struct device, knode_bus); + struct device_private *dev_prv = to_device_private_bus(n); + struct device *dev = dev_prv->device; get_device(dev); } static void klist_devices_put(struct klist_node *n) { - struct device *dev = container_of(n, struct device, knode_bus); + struct device_private *dev_prv = to_device_private_bus(n); + struct device *dev = dev_prv->device; put_device(dev); } @@ -995,18 +1005,20 @@ static void device_insertion_sort_klist(struct device *a, struct list_head *list { struct list_head *pos; struct klist_node *n; + struct device_private *dev_prv; struct device *b; list_for_each(pos, list) { n = container_of(pos, struct klist_node, n_node); - b = container_of(n, struct device, knode_bus); + dev_prv = to_device_private_bus(n); + b = dev_prv->device; if (compare(a, b) <= 0) { - list_move_tail(&a->knode_bus.n_node, - &b->knode_bus.n_node); + list_move_tail(&a->p->knode_bus.n_node, + &b->p->knode_bus.n_node); return; } } - list_move_tail(&a->knode_bus.n_node, list); + list_move_tail(&a->p->knode_bus.n_node, list); } void bus_sort_breadthfirst(struct bus_type *bus, @@ -1016,6 +1028,7 @@ void bus_sort_breadthfirst(struct bus_type *bus, LIST_HEAD(sorted_devices); struct list_head *pos, *tmp; struct klist_node *n; + struct device_private *dev_prv; struct device *dev; struct klist *device_klist; @@ -1024,7 +1037,8 @@ void bus_sort_breadthfirst(struct bus_type *bus, spin_lock(&device_klist->k_lock); list_for_each_safe(pos, tmp, &device_klist->k_list) { n = container_of(pos, struct klist_node, n_node); - dev = container_of(n, struct device, knode_bus); + dev_prv = to_device_private_bus(n); + dev = dev_prv->device; device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); diff --git a/include/linux/device.h b/include/linux/device.h index 83e241f407be..5a64775e68e4 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -368,7 +368,6 @@ struct device_dma_parameters { }; struct device { - struct klist_node knode_bus; struct device *parent; struct device_private *p; -- cgit v1.2.3-71-gd317 From 006f4571a15fae3a0575f2a0f9e9b63b3d1012f8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 8 Mar 2009 23:13:32 +0800 Subject: driver core: move platform_data into platform_device This patch moves platform_data from struct device into struct platform_device, based on the two ideas: 1. Now all platform_driver is registered by platform_driver_register, which makes probe()/release()/... of platform_driver passed parameter of platform_device *, so platform driver can get platform_data from platform_device; 2. 
Other kind of devices do not need to use platform_data, we can decrease size of device if moving it to platform_device. Taking into consideration of thousands of files to be fixed and they can't be finished in one night(maybe it will take a long time), so we keep platform_data in device to allow two kind of cases coexist until all platform devices pass its platfrom data from platform_device->platform_data. All patches to do this kind of conversion are welcome. Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 3 +++ include/linux/device.h | 9 +++++++-- include/linux/platform_device.h | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index ec993aa6a2ca..c5ac81d22303 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -217,6 +217,7 @@ int platform_device_add_data(struct platform_device *pdev, const void *data, if (d) { memcpy(d, data, size); pdev->dev.platform_data = d; + pdev->platform_data = d; } return d ? 0 : -ENOMEM; } @@ -246,6 +247,8 @@ int platform_device_add(struct platform_device *pdev) else dev_set_name(&pdev->dev, pdev->name); + pdev->platform_data = pdev->dev.platform_data; + for (i = 0; i < pdev->num_resources; i++) { struct resource *p, *r = &pdev->resource[i]; diff --git a/include/linux/device.h b/include/linux/device.h index 5a64775e68e4..4bea53fe8f4c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -385,8 +385,13 @@ struct device { struct device_driver *driver; /* which driver has allocated this device */ void *driver_data; /* data private to the driver */ - void *platform_data; /* Platform specific data, device - core doesn't touch it */ + + void *platform_data; /* We will remove platform_data + field if all platform devices + pass its platform specific data + from platform_device->platform_data, + other kind of devices should not + use platform_data. */ struct dev_pm_info power; #ifdef CONFIG_NUMA diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 76aef7be32ab..76e470a299bf 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -20,6 +20,7 @@ struct platform_device { struct device dev; u32 num_resources; struct resource * resource; + void *platform_data; struct platform_device_id *id_entry; }; -- cgit v1.2.3-71-gd317 From 4995f8ef9d3aac72745e12419d7fbaa8d01b1d81 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 9 Mar 2009 14:18:52 +0100 Subject: vcs: hook sysfs devices into object lifetime instead of "binding" During bootup performance tracing I noticed many occurrences of vca* device creation and removal, leading to the usual userspace uevent processing, which are, in this case, rather pointless. 
A simple test showing the kernel timing (not including all the work userspace has to do), gives us these numbers: $ time for i in `seq 1000`; do echo a > /dev/tty2; done real 0m1.142s user 0m0.015s sys 0m0.540s If we move the hook for the vcs* driver core devices from the tty "binding" to the vc allocation/deallocation, which is what the vcs* devices represent, we get the following numbers: $ time for i in `seq 1000`; do echo a > /dev/tty2; done real 0m0.152s user 0m0.030s sys 0m0.072s Cc: Alan Cox Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/char/vc_screen.c | 16 ++++++++-------- drivers/char/vt.c | 5 +++-- include/linux/console.h | 4 ++-- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/vc_screen.c b/drivers/char/vc_screen.c index 4f3b3f95fc42..d94d25c12aa8 100644 --- a/drivers/char/vc_screen.c +++ b/drivers/char/vc_screen.c @@ -479,18 +479,18 @@ static const struct file_operations vcs_fops = { static struct class *vc_class; -void vcs_make_sysfs(struct tty_struct *tty) +void vcs_make_sysfs(int index) { - device_create(vc_class, NULL, MKDEV(VCS_MAJOR, tty->index + 1), NULL, - "vcs%u", tty->index + 1); - device_create(vc_class, NULL, MKDEV(VCS_MAJOR, tty->index + 129), NULL, - "vcsa%u", tty->index + 1); + device_create(vc_class, NULL, MKDEV(VCS_MAJOR, index + 1), NULL, + "vcs%u", index + 1); + device_create(vc_class, NULL, MKDEV(VCS_MAJOR, index + 129), NULL, + "vcsa%u", index + 1); } -void vcs_remove_sysfs(struct tty_struct *tty) +void vcs_remove_sysfs(int index) { - device_destroy(vc_class, MKDEV(VCS_MAJOR, tty->index + 1)); - device_destroy(vc_class, MKDEV(VCS_MAJOR, tty->index + 129)); + device_destroy(vc_class, MKDEV(VCS_MAJOR, index + 1)); + device_destroy(vc_class, MKDEV(VCS_MAJOR, index + 129)); } int __init vcs_init(void) diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 7900bd63b36d..2c1d133819b5 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -778,6 +778,7 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ } vc->vc_kmalloced = 1; vc_init(vc, vc->vc_rows, vc->vc_cols, 1); + vcs_make_sysfs(currcons); atomic_notifier_call_chain(&vt_notifier_list, VT_ALLOCATE, ¶m); } return 0; @@ -987,7 +988,9 @@ void vc_deallocate(unsigned int currcons) if (vc_cons_allocated(currcons)) { struct vc_data *vc = vc_cons[currcons].d; struct vt_notifier_param param = { .vc = vc }; + atomic_notifier_call_chain(&vt_notifier_list, VT_DEALLOCATE, ¶m); + vcs_remove_sysfs(currcons); vc->vc_sw->con_deinit(vc); put_pid(vc->vt_pid); module_put(vc->vc_sw->owner); @@ -2775,7 +2778,6 @@ static int con_open(struct tty_struct *tty, struct file *filp) tty->termios->c_iflag |= IUTF8; else tty->termios->c_iflag &= ~IUTF8; - vcs_make_sysfs(tty); release_console_sem(); return ret; } @@ -2795,7 +2797,6 @@ static void con_shutdown(struct tty_struct *tty) BUG_ON(vc == NULL); acquire_console_sem(); vc->vc_tty = NULL; - vcs_remove_sysfs(tty); release_console_sem(); tty_shutdown(tty); } diff --git a/include/linux/console.h b/include/linux/console.h index a67a90cf8268..dcca5339ceb3 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -137,8 +137,8 @@ extern void resume_console(void); int mda_console_init(void); void prom_con_init(void); -void vcs_make_sysfs(struct tty_struct *tty); -void vcs_remove_sysfs(struct tty_struct *tty); +void vcs_make_sysfs(int index); +void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ #if 1 -- cgit v1.2.3-71-gd317 
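(A minimal sketch of the resulting call pattern, for illustration only; the helpers example_vc_create()/example_vc_destroy() are hypothetical, while vcs_make_sysfs()/vcs_remove_sysfs() are the interfaces changed above. The vcs/vcsa class devices now follow the lifetime of the virtual console itself, keyed by its index, instead of being created and destroyed on every tty open/close.)

#include <linux/console.h>

/* Illustrative only: sysfs nodes follow vc allocation, not tty binding. */
static int example_vc_create(unsigned int index)
{
	/* ... allocate and set up the vc_data for this index ... */
	vcs_make_sysfs(index);		/* creates the vcs<index+1>/vcsa<index+1> class devices */
	return 0;
}

static void example_vc_destroy(unsigned int index)
{
	vcs_remove_sysfs(index);	/* removed only when the console is deallocated */
	/* ... free the vc_data ... */
}
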
From f67f129e519fa87f8ebd236b6336fe43f31ee141 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Mar 2009 21:10:49 +0800 Subject: Driver core: implement uevent suppress in kobject This patch implements uevent suppress in kobject and removes it from struct device, based on the following ideas: 1,Uevent sending should be one attribute of kobject, so suppressing it in kobject layer is more natural than in device layer. By this way, we can do it for other objects embedded with kobject. 2,It may save several bytes for each instance of struct device.(On my omap3(32bit ARM) based box, can save 8bytes per device object) This patch also introduces dev_set|get_uevent_suppress() helpers to set and query uevent_suppress attribute in case to help kobject as private part of struct device in future. [This version is against the latest driver-core patch set of Greg,please ignore the last version.] Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/dock.c | 2 +- drivers/base/core.c | 2 -- drivers/base/firmware_class.c | 4 ++-- drivers/i2c/i2c-core.c | 2 +- drivers/s390/cio/chsc_sch.c | 4 ++-- drivers/s390/cio/css.c | 4 ++-- drivers/s390/cio/device.c | 4 ++-- fs/partitions/check.c | 10 +++++----- include/linux/device.h | 11 ++++++++++- include/linux/kobject.h | 1 + lib/kobject_uevent.c | 7 +++++++ 11 files changed, 33 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c index 35094f230b1e..7af7db1ba8c4 100644 --- a/drivers/acpi/dock.c +++ b/drivers/acpi/dock.c @@ -977,7 +977,7 @@ static int dock_add(acpi_handle handle) sizeof(struct dock_station *)); /* we want the dock device to send uevents */ - dock_device->dev.uevent_suppress = 0; + dev_set_uevent_suppress(&dock_device->dev, 0); if (is_dock(handle)) dock_station->flags |= DOCK_IS_DOCK; diff --git a/drivers/base/core.c b/drivers/base/core.c index a90f56f64d6f..95c67ffd71da 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -136,8 +136,6 @@ static int dev_uevent_filter(struct kset *kset, struct kobject *kobj) if (ktype == &device_ktype) { struct device *dev = to_dev(kobj); - if (dev->uevent_suppress) - return 0; if (dev->bus) return 1; if (dev->class) diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 44699d9dd85c..d3a59c688fe4 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -319,7 +319,7 @@ static int fw_register_device(struct device **dev_p, const char *fw_name, f_dev->parent = device; f_dev->class = &firmware_class; dev_set_drvdata(f_dev, fw_priv); - f_dev->uevent_suppress = 1; + dev_set_uevent_suppress(f_dev, 1); retval = device_register(f_dev); if (retval) { dev_err(device, "%s: device_register failed\n", __func__); @@ -366,7 +366,7 @@ static int fw_setup_device(struct firmware *fw, struct device **dev_p, } if (uevent) - f_dev->uevent_suppress = 0; + dev_set_uevent_suppress(f_dev, 0); *dev_p = f_dev; goto out; diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index e7d984866de0..fbb9030b68a5 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -841,7 +841,7 @@ int i2c_attach_client(struct i2c_client *client) if (client->driver && !is_newstyle_driver(client->driver)) { client->dev.release = i2c_client_release; - client->dev.uevent_suppress = 1; + dev_set_uevent_suppress(&client->dev, 1); } else client->dev.release = i2c_client_dev_release; diff --git a/drivers/s390/cio/chsc_sch.c b/drivers/s390/cio/chsc_sch.c index 0a2f2edafc03..93eca1731b81 100644 --- 
a/drivers/s390/cio/chsc_sch.c +++ b/drivers/s390/cio/chsc_sch.c @@ -84,8 +84,8 @@ static int chsc_subchannel_probe(struct subchannel *sch) kfree(private); } else { sch->private = private; - if (sch->dev.uevent_suppress) { - sch->dev.uevent_suppress = 0; + if (dev_get_uevent_suppress(&sch->dev)) { + dev_set_uevent_suppress(&sch->dev, 0); kobject_uevent(&sch->dev.kobj, KOBJ_ADD); } } diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 8019288bc6de..427d11d88069 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -272,7 +272,7 @@ static int css_register_subchannel(struct subchannel *sch) * the subchannel driver can decide itself when it wants to inform * userspace of its existence. */ - sch->dev.uevent_suppress = 1; + dev_set_uevent_suppress(&sch->dev, 1); css_update_ssd_info(sch); /* make it known to the system */ ret = css_sch_device_register(sch); @@ -287,7 +287,7 @@ static int css_register_subchannel(struct subchannel *sch) * a fitting driver module may be loaded based on the * modalias. */ - sch->dev.uevent_suppress = 0; + dev_set_uevent_suppress(&sch->dev, 0); kobject_uevent(&sch->dev.kobj, KOBJ_ADD); } return ret; diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 23d5752349b5..611d2e001dd5 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -981,7 +981,7 @@ io_subchannel_register(struct work_struct *work) * Now we know this subchannel will stay, we can throw * our delayed uevent. */ - sch->dev.uevent_suppress = 0; + dev_set_uevent_suppress(&sch->dev, 0); kobject_uevent(&sch->dev.kobj, KOBJ_ADD); /* make it known to the system */ ret = ccw_device_register(cdev); @@ -1243,7 +1243,7 @@ static int io_subchannel_probe(struct subchannel *sch) * the ccw_device and exit. This happens for all early * devices, e.g. the console. 
*/ - sch->dev.uevent_suppress = 0; + dev_set_uevent_suppress(&sch->dev, 0); kobject_uevent(&sch->dev.kobj, KOBJ_ADD); cdev->dev.groups = ccwdev_attr_groups; device_initialize(&cdev->dev); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6d720243f5f4..38e337d51ced 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -400,7 +400,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ - pdev->uevent_suppress = 1; + dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); if (err) goto out_put; @@ -410,7 +410,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p->holder_dir) goto out_del; - pdev->uevent_suppress = 0; + dev_set_uevent_suppress(pdev, 0); if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(pdev, &dev_attr_whole_disk); if (err) @@ -422,7 +422,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk supresses it */ - if (!ddev->uevent_suppress) + if (!dev_get_uevent_suppress(pdev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); return p; @@ -455,7 +455,7 @@ void register_disk(struct gendisk *disk) dev_set_name(ddev, disk->disk_name); /* delay uevents, until we scanned partition table */ - ddev->uevent_suppress = 1; + dev_set_uevent_suppress(ddev, 1); if (device_add(ddev)) return; @@ -490,7 +490,7 @@ void register_disk(struct gendisk *disk) exit: /* announce disk after possible partitions are created */ - ddev->uevent_suppress = 0; + dev_set_uevent_suppress(ddev, 0); kobject_uevent(&ddev->kobj, KOBJ_ADD); /* announce possible partitions */ diff --git a/include/linux/device.h b/include/linux/device.h index 4bea53fe8f4c..914c1016dd8f 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -373,7 +373,6 @@ struct device { struct device_private *p; struct kobject kobj; - unsigned uevent_suppress:1; const char *init_name; /* initial name of the device */ struct device_type *type; @@ -465,6 +464,16 @@ static inline void dev_set_drvdata(struct device *dev, void *data) dev->driver_data = data; } +static inline unsigned int dev_get_uevent_suppress(const struct device *dev) +{ + return dev->kobj.uevent_suppress; +} + +static inline void dev_set_uevent_suppress(struct device *dev, int val) +{ + dev->kobj.uevent_suppress = val; +} + static inline int device_is_registered(struct device *dev) { return dev->kobj.state_in_sysfs; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c9c214d7bba2..58ae8e00fcdd 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -68,6 +68,7 @@ struct kobject { unsigned int state_in_sysfs:1; unsigned int state_add_uevent_sent:1; unsigned int state_remove_uevent_sent:1; + unsigned int uevent_suppress:1; }; extern int kobject_set_name(struct kobject *kobj, const char *name, ...) diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 318328ddbd1c..b2181cc8e4d8 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -118,6 +118,13 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, kset = top_kobj->kset; uevent_ops = kset->uevent_ops; + /* skip the event, if uevent_suppress is set*/ + if (kobj->uevent_suppress) { + pr_debug("kobject: '%s' (%p): %s: uevent_suppress " + "caused the event to drop!\n", + kobject_name(kobj), kobj, __func__); + return 0; + } /* skip the event, if the filter returns zero. 
*/ if (uevent_ops && uevent_ops->filter) if (!uevent_ops->filter(kset, kobj)) { -- cgit v1.2.3-71-gd317 From ffa6a7054d172a2f57248dff2de600ca795c5656 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 4 Mar 2009 12:44:00 +0100 Subject: Driver core: Fix device_move() vs. dpm list ordering, v2 dpm_list currently relies on the fact that child devices will be registered after their parents to get a correct suspend order. Using device_move() however destroys this assumption, as an already registered device may be moved under a newly registered one. This patch adds a new argument to device_move(), allowing callers to specify how dpm_list should be adapted. Signed-off-by: Cornelia Huck Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 19 ++++++++++++++++++- drivers/base/power/main.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ drivers/base/power/power.h | 8 ++++++++ drivers/s390/cio/device.c | 9 +++++---- include/linux/device.h | 3 ++- include/linux/pm.h | 11 +++++++++++ net/bluetooth/hci_sysfs.c | 2 +- net/bluetooth/rfcomm/tty.c | 5 +++-- 8 files changed, 92 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 95c67ffd71da..e73c92d13a23 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1561,8 +1561,10 @@ out: * device_move - moves a device to a new parent * @dev: the pointer to the struct device to be moved * @new_parent: the new parent of the device (can by NULL) + * @dpm_order: how to reorder the dpm_list */ -int device_move(struct device *dev, struct device *new_parent) +int device_move(struct device *dev, struct device *new_parent, + enum dpm_order dpm_order) { int error; struct device *old_parent; @@ -1572,6 +1574,7 @@ int device_move(struct device *dev, struct device *new_parent) if (!dev) return -EINVAL; + device_pm_lock(); new_parent = get_device(new_parent); new_parent_kobj = get_device_parent(dev, new_parent); @@ -1613,9 +1616,23 @@ int device_move(struct device *dev, struct device *new_parent) put_device(new_parent); goto out; } + switch (dpm_order) { + case DPM_ORDER_NONE: + break; + case DPM_ORDER_DEV_AFTER_PARENT: + device_pm_move_after(dev, new_parent); + break; + case DPM_ORDER_PARENT_BEFORE_DEV: + device_pm_move_before(new_parent, dev); + break; + case DPM_ORDER_DEV_LAST: + device_pm_move_last(dev); + break; + } out_put: put_device(old_parent); out: + device_pm_unlock(); put_device(dev); return error; } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2d14f4ae6c01..e255341682c8 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -106,6 +106,50 @@ void device_pm_remove(struct device *dev) mutex_unlock(&dpm_list_mtx); } +/** + * device_pm_move_before - move device in dpm_list + * @deva: Device to move in dpm_list + * @devb: Device @deva should come before + */ +void device_pm_move_before(struct device *deva, struct device *devb) +{ + pr_debug("PM: Moving %s:%s before %s:%s\n", + deva->bus ? deva->bus->name : "No Bus", + kobject_name(&deva->kobj), + devb->bus ? devb->bus->name : "No Bus", + kobject_name(&devb->kobj)); + /* Delete deva from dpm_list and reinsert before devb. */ + list_move_tail(&deva->power.entry, &devb->power.entry); +} + +/** + * device_pm_move_after - move device in dpm_list + * @deva: Device to move in dpm_list + * @devb: Device @deva should come after + */ +void device_pm_move_after(struct device *deva, struct device *devb) +{ + pr_debug("PM: Moving %s:%s after %s:%s\n", + deva->bus ? 
deva->bus->name : "No Bus", + kobject_name(&deva->kobj), + devb->bus ? devb->bus->name : "No Bus", + kobject_name(&devb->kobj)); + /* Delete deva from dpm_list and reinsert after devb. */ + list_move(&deva->power.entry, &devb->power.entry); +} + +/** + * device_pm_move_last - move device to end of dpm_list + * @dev: Device to move in dpm_list + */ +void device_pm_move_last(struct device *dev) +{ + pr_debug("PM: Moving %s:%s to end of list\n", + dev->bus ? dev->bus->name : "No Bus", + kobject_name(&dev->kobj)); + list_move_tail(&dev->power.entry, &dpm_list); +} + /** * pm_op - execute the PM operation appropiate for given PM event * @dev: Device. diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 41f51fae042f..c7cb4fc3735c 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -18,11 +18,19 @@ static inline struct device *to_device(struct list_head *entry) extern void device_pm_add(struct device *); extern void device_pm_remove(struct device *); +extern void device_pm_move_before(struct device *, struct device *); +extern void device_pm_move_after(struct device *, struct device *); +extern void device_pm_move_last(struct device *); #else /* CONFIG_PM_SLEEP */ static inline void device_pm_add(struct device *dev) {} static inline void device_pm_remove(struct device *dev) {} +static inline void device_pm_move_before(struct device *deva, + struct device *devb) {} +static inline void device_pm_move_after(struct device *deva, + struct device *devb) {} +static inline void device_pm_move_last(struct device *dev) {} #endif diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 611d2e001dd5..e28f8ae53453 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -799,7 +799,7 @@ static void sch_attach_disconnected_device(struct subchannel *sch, return; other_sch = to_subchannel(cdev->dev.parent); /* Note: device_move() changes cdev->dev.parent */ - ret = device_move(&cdev->dev, &sch->dev); + ret = device_move(&cdev->dev, &sch->dev, DPM_ORDER_PARENT_BEFORE_DEV); if (ret) { CIO_MSG_EVENT(0, "Moving disconnected device 0.%x.%04x failed " "(ret=%d)!\n", cdev->private->dev_id.ssid, @@ -830,7 +830,7 @@ static void sch_attach_orphaned_device(struct subchannel *sch, * Try to move the ccw device to its new subchannel. * Note: device_move() changes cdev->dev.parent */ - ret = device_move(&cdev->dev, &sch->dev); + ret = device_move(&cdev->dev, &sch->dev, DPM_ORDER_PARENT_BEFORE_DEV); if (ret) { CIO_MSG_EVENT(0, "Moving device 0.%x.%04x from orphanage " "failed (ret=%d)!\n", @@ -897,7 +897,8 @@ void ccw_device_move_to_orphanage(struct work_struct *work) * ccw device can take its place on the subchannel. * Note: device_move() changes cdev->dev.parent */ - ret = device_move(&cdev->dev, &css->pseudo_subchannel->dev); + ret = device_move(&cdev->dev, &css->pseudo_subchannel->dev, + DPM_ORDER_NONE); if (ret) { CIO_MSG_EVENT(0, "Moving device 0.%x.%04x to orphanage failed " "(ret=%d)!\n", cdev->private->dev_id.ssid, @@ -1129,7 +1130,7 @@ static void ccw_device_move_to_sch(struct work_struct *work) * Try to move the ccw device to its new subchannel. 
* Note: device_move() changes cdev->dev.parent */ - rc = device_move(&cdev->dev, &sch->dev); + rc = device_move(&cdev->dev, &sch->dev, DPM_ORDER_PARENT_BEFORE_DEV); mutex_unlock(&sch->reg_mutex); if (rc) { CIO_MSG_EVENT(0, "Moving device 0.%x.%04x to subchannel " diff --git a/include/linux/device.h b/include/linux/device.h index 914c1016dd8f..f98d0cfb4f81 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -494,7 +494,8 @@ extern int device_for_each_child(struct device *dev, void *data, extern struct device *device_find_child(struct device *dev, void *data, int (*match)(struct device *dev, void *data)); extern int device_rename(struct device *dev, char *new_name); -extern int device_move(struct device *dev, struct device *new_parent); +extern int device_move(struct device *dev, struct device *new_parent, + enum dpm_order dpm_order); /* * Root device objects for grouping under /sys/devices diff --git a/include/linux/pm.h b/include/linux/pm.h index 24ba5f67b3a3..1d4e2d289821 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -400,6 +400,9 @@ extern void __suspend_report_result(const char *function, void *fn, int ret); #else /* !CONFIG_PM_SLEEP */ +#define device_pm_lock() do {} while (0) +#define device_pm_unlock() do {} while (0) + static inline int device_suspend(pm_message_t state) { return 0; @@ -409,6 +412,14 @@ static inline int device_suspend(pm_message_t state) #endif /* !CONFIG_PM_SLEEP */ +/* How to reorder dpm_list after device_move() */ +enum dpm_order { + DPM_ORDER_NONE, + DPM_ORDER_DEV_AFTER_PARENT, + DPM_ORDER_PARENT_BEFORE_DEV, + DPM_ORDER_DEV_LAST, +}; + /* * Global Power Management flags * Used to keep APM and ACPI from both being active diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 1a1f916be44e..ed82796d4a0f 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -140,7 +140,7 @@ static void del_conn(struct work_struct *work) dev = device_find_child(&conn->dev, NULL, __match_tty); if (!dev) break; - device_move(dev, NULL); + device_move(dev, NULL, DPM_ORDER_DEV_LAST); put_device(dev); } diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index d030c69cb5a3..abdc703a11d2 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -731,7 +731,8 @@ static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) remove_wait_queue(&dev->wait, &wait); if (err == 0) - device_move(dev->tty_dev, rfcomm_get_device(dev)); + device_move(dev->tty_dev, rfcomm_get_device(dev), + DPM_ORDER_DEV_AFTER_PARENT); rfcomm_tty_copy_pending(dev); @@ -751,7 +752,7 @@ static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp) if (atomic_dec_and_test(&dev->opened)) { if (dev->tty_dev->parent) - device_move(dev->tty_dev, NULL); + device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST); /* Close DLC and dettach TTY */ rfcomm_dlc_close(dev->dlc, 0); -- cgit v1.2.3-71-gd317 From e9d376f0fa66bd630fe27403669c6ae6c22a868f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 5 Feb 2009 11:51:38 -0500 Subject: dynamic debug: combine dprintk and dynamic printk This patch combines Greg Bank's dprintk() work with the existing dynamic printk patchset, we are now calling it 'dynamic debug'. The new feature of this patchset is a richer /debugfs control file interface, (an example output from my system is at the bottom), which allows fined grained control over the the debug output. The output can be controlled by function, file, module, format string, and line number. 
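(A rough sketch of the callsite side, assuming a function inside the nf_conntrack module; the function name and format string below are made up. Nothing special is needed at the callsite: with CONFIG_DYNAMIC_DEBUG enabled, each pr_debug()/dev_dbg() expands to a static struct _ddebug descriptor placed in the __verbose section, and printing is gated on that descriptor's flags at runtime.)

#include <linux/kernel.h>

static void example_track_packet(unsigned int id)
{
	/* silent by default; emitted only once this callsite is enabled via the control file */
	pr_debug("tracking packet %u\n", id);
}
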
for example, enabled all debug messages in module 'nf_conntrack': echo -n 'module nf_conntrack +p' > /mnt/debugfs/dynamic_debug/control to disable them: echo -n 'module nf_conntrack -p' > /mnt/debugfs/dynamic_debug/control A further explanation can be found in the documentation patch. Signed-off-by: Greg Banks Signed-off-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- Documentation/kernel-parameters.txt | 5 - include/asm-generic/vmlinux.lds.h | 15 +- include/linux/device.h | 2 +- include/linux/dynamic_debug.h | 88 +++++ include/linux/dynamic_printk.h | 93 ----- include/linux/kernel.h | 4 +- kernel/module.c | 25 +- lib/Kconfig.debug | 2 +- lib/Makefile | 2 +- lib/dynamic_debug.c | 756 ++++++++++++++++++++++++++++++++++++ lib/dynamic_printk.c | 414 -------------------- net/netfilter/nf_conntrack_pptp.c | 2 +- scripts/Makefile.lib | 2 +- 13 files changed, 867 insertions(+), 543 deletions(-) create mode 100644 include/linux/dynamic_debug.h delete mode 100644 include/linux/dynamic_printk.h create mode 100644 lib/dynamic_debug.c delete mode 100644 lib/dynamic_printk.c (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 54f21a5c262b..3a1aa8a4affc 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1816,11 +1816,6 @@ and is between 256 and 4096 characters. It is defined in the file autoconfiguration. Ranges are in pairs (memory base and size). - dynamic_printk Enables pr_debug()/dev_dbg() calls if - CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled. - These can also be switched on/off via - /dynamic_printk/modules - print-fatal-signals= [KNL] debug: print fatal signals print-fatal-signals=1: print segfault info to diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c61fab1dd2f8..aca40b93bd28 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -80,6 +80,11 @@ VMLINUX_SYMBOL(__start___tracepoints) = .; \ *(__tracepoints) \ VMLINUX_SYMBOL(__stop___tracepoints) = .; \ + /* implement dynamic printk debug */ \ + . = ALIGN(8); \ + VMLINUX_SYMBOL(__start___verbose) = .; \ + *(__verbose) \ + VMLINUX_SYMBOL(__stop___verbose) = .; \ LIKELY_PROFILE() \ BRANCH_PROFILE() @@ -309,15 +314,7 @@ CPU_DISCARD(init.data) \ CPU_DISCARD(init.rodata) \ MEM_DISCARD(init.data) \ - MEM_DISCARD(init.rodata) \ - /* implement dynamic printk debug */ \ - VMLINUX_SYMBOL(__start___verbose_strings) = .; \ - *(__verbose_strings) \ - VMLINUX_SYMBOL(__stop___verbose_strings) = .; \ - . = ALIGN(8); \ - VMLINUX_SYMBOL(__start___verbose) = .; \ - *(__verbose) \ - VMLINUX_SYMBOL(__stop___verbose) = .; + MEM_DISCARD(init.rodata) #define INIT_TEXT \ *(.init.text) \ diff --git a/include/linux/device.h b/include/linux/device.h index f98d0cfb4f81..2918c0e8fdfd 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -582,7 +582,7 @@ extern const char *dev_driver_string(const struct device *dev); #if defined(DEBUG) #define dev_dbg(dev, format, arg...) \ dev_printk(KERN_DEBUG , dev , format , ## arg) -#elif defined(CONFIG_DYNAMIC_PRINTK_DEBUG) +#elif defined(CONFIG_DYNAMIC_DEBUG) #define dev_dbg(dev, format, ...) 
do { \ dynamic_dev_dbg(dev, format, ##__VA_ARGS__); \ } while (0) diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h new file mode 100644 index 000000000000..07781aaa1164 --- /dev/null +++ b/include/linux/dynamic_debug.h @@ -0,0 +1,88 @@ +#ifndef _DYNAMIC_DEBUG_H +#define _DYNAMIC_DEBUG_H + +/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which + * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They + * use independent hash functions, to reduce the chance of false positives. + */ +extern long long dynamic_debug_enabled; +extern long long dynamic_debug_enabled2; + +/* + * An instance of this structure is created in a special + * ELF section at every dynamic debug callsite. At runtime, + * the special section is treated as an array of these. + */ +struct _ddebug { + /* + * These fields are used to drive the user interface + * for selecting and displaying debug callsites. + */ + const char *modname; + const char *function; + const char *filename; + const char *format; + char primary_hash; + char secondary_hash; + unsigned int lineno:24; + /* + * The flags field controls the behaviour at the callsite. + * The bits here are changed dynamically when the user + * writes commands to /dynamic_debug/ddebug + */ +#define _DPRINTK_FLAGS_PRINT (1<<0) /* printk() a message using the format */ +#define _DPRINTK_FLAGS_DEFAULT 0 + unsigned int flags:8; +} __attribute__((aligned(8))); + + +int ddebug_add_module(struct _ddebug *tab, unsigned int n, + const char *modname); + +#if defined(CONFIG_DYNAMIC_DEBUG) +extern int ddebug_remove_module(char *mod_name); + +#define __dynamic_dbg_enabled(dd) ({ \ + int __ret = 0; \ + if (unlikely((dynamic_debug_enabled & (1LL << DEBUG_HASH)) && \ + (dynamic_debug_enabled2 & (1LL << DEBUG_HASH2)))) \ + if (unlikely(dd.flags)) \ + __ret = 1; \ + __ret; }) + +#define dynamic_pr_debug(fmt, ...) do { \ + static struct _ddebug descriptor \ + __used \ + __attribute__((section("__verbose"), aligned(8))) = \ + { KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH, \ + DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT }; \ + if (__dynamic_dbg_enabled(descriptor)) \ + printk(KERN_DEBUG KBUILD_MODNAME ":" fmt, \ + ##__VA_ARGS__); \ + } while (0) + + +#define dynamic_dev_dbg(dev, fmt, ...) do { \ + static struct _ddebug descriptor \ + __used \ + __attribute__((section("__verbose"), aligned(8))) = \ + { KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH, \ + DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT }; \ + if (__dynamic_dbg_enabled(descriptor)) \ + dev_printk(KERN_DEBUG, dev, \ + KBUILD_MODNAME ": " fmt, \ + ##__VA_ARGS__); \ + } while (0) + +#else + +static inline int ddebug_remove_module(char *mod) +{ + return 0; +} + +#define dynamic_pr_debug(fmt, ...) do { } while (0) +#define dynamic_dev_dbg(dev, format, ...) do { } while (0) +#endif + +#endif diff --git a/include/linux/dynamic_printk.h b/include/linux/dynamic_printk.h deleted file mode 100644 index 2d528d009074..000000000000 --- a/include/linux/dynamic_printk.h +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef _DYNAMIC_PRINTK_H -#define _DYNAMIC_PRINTK_H - -#define DYNAMIC_DEBUG_HASH_BITS 6 -#define DEBUG_HASH_TABLE_SIZE (1 << DYNAMIC_DEBUG_HASH_BITS) - -#define TYPE_BOOLEAN 1 - -#define DYNAMIC_ENABLED_ALL 0 -#define DYNAMIC_ENABLED_NONE 1 -#define DYNAMIC_ENABLED_SOME 2 - -extern int dynamic_enabled; - -/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which - * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. 
They - * use independent hash functions, to reduce the chance of false positives. - */ -extern long long dynamic_printk_enabled; -extern long long dynamic_printk_enabled2; - -struct mod_debug { - char *modname; - char *logical_modname; - char *flag_names; - int type; - int hash; - int hash2; -} __attribute__((aligned(8))); - -int register_dynamic_debug_module(char *mod_name, int type, char *share_name, - char *flags, int hash, int hash2); - -#if defined(CONFIG_DYNAMIC_PRINTK_DEBUG) -extern int unregister_dynamic_debug_module(char *mod_name); -extern int __dynamic_dbg_enabled_helper(char *modname, int type, - int value, int hash); - -#define __dynamic_dbg_enabled(module, type, value, level, hash) ({ \ - int __ret = 0; \ - if (unlikely((dynamic_printk_enabled & (1LL << DEBUG_HASH)) && \ - (dynamic_printk_enabled2 & (1LL << DEBUG_HASH2)))) \ - __ret = __dynamic_dbg_enabled_helper(module, type, \ - value, hash);\ - __ret; }) - -#define dynamic_pr_debug(fmt, ...) do { \ - static char mod_name[] \ - __attribute__((section("__verbose_strings"))) \ - = KBUILD_MODNAME; \ - static struct mod_debug descriptor \ - __used \ - __attribute__((section("__verbose"), aligned(8))) = \ - { mod_name, mod_name, NULL, TYPE_BOOLEAN, DEBUG_HASH, DEBUG_HASH2 };\ - if (__dynamic_dbg_enabled(KBUILD_MODNAME, TYPE_BOOLEAN, \ - 0, 0, DEBUG_HASH)) \ - printk(KERN_DEBUG KBUILD_MODNAME ":" fmt, \ - ##__VA_ARGS__); \ - } while (0) - -#define dynamic_dev_dbg(dev, format, ...) do { \ - static char mod_name[] \ - __attribute__((section("__verbose_strings"))) \ - = KBUILD_MODNAME; \ - static struct mod_debug descriptor \ - __used \ - __attribute__((section("__verbose"), aligned(8))) = \ - { mod_name, mod_name, NULL, TYPE_BOOLEAN, DEBUG_HASH, DEBUG_HASH2 };\ - if (__dynamic_dbg_enabled(KBUILD_MODNAME, TYPE_BOOLEAN, \ - 0, 0, DEBUG_HASH)) \ - dev_printk(KERN_DEBUG, dev, \ - KBUILD_MODNAME ": " format, \ - ##__VA_ARGS__); \ - } while (0) - -#else - -static inline int unregister_dynamic_debug_module(const char *mod_name) -{ - return 0; -} -static inline int __dynamic_dbg_enabled_helper(char *modname, int type, - int value, int hash) -{ - return 0; -} - -#define __dynamic_dbg_enabled(module, type, value, level, hash) ({ 0; }) -#define dynamic_pr_debug(fmt, ...) do { } while (0) -#define dynamic_dev_dbg(dev, format, ...) do { } while (0) -#endif - -#endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7fa371898e3e..b5496ecbec71 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -358,7 +358,7 @@ static inline char *pack_hex_byte(char *buf, u8 byte) #if defined(DEBUG) #define pr_debug(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) -#elif defined(CONFIG_DYNAMIC_PRINTK_DEBUG) +#elif defined(CONFIG_DYNAMIC_DEBUG) #define pr_debug(fmt, ...) 
do { \ dynamic_pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) diff --git a/kernel/module.c b/kernel/module.c index 1196f5d11700..77672233387f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -822,7 +822,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - unregister_dynamic_debug_module(mod->name); + ddebug_remove_module(mod->name); free_module(mod); out: @@ -1827,19 +1827,13 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ -static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) +static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) { -#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG - unsigned int i; - - for (i = 0; i < num; i++) { - register_dynamic_debug_module(debug[i].modname, - debug[i].type, - debug[i].logical_modname, - debug[i].flag_names, - debug[i].hash, debug[i].hash2); - } -#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ +#ifdef CONFIG_DYNAMIC_DEBUG + if (ddebug_add_module(debug, num, debug->modname)) + printk(KERN_ERR "dynamic debug error adding module: %s\n", + debug->modname); +#endif } static void *module_alloc_update_bounds(unsigned long size) @@ -2213,12 +2207,13 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); if (!mod->taints) { - struct mod_debug *debug; + struct _ddebug *debug; unsigned int num_debug; debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); - dynamic_printk_setup(debug, num_debug); + if (debug) + dynamic_debug_setup(debug, num_debug); } /* sechdrs[0].sh_size is always zero */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1bcf9cd4baa0..0dd1c04c7323 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -847,7 +847,7 @@ config BUILD_DOCSRC Say N if you are unsure. -config DYNAMIC_PRINTK_DEBUG +config DYNAMIC_DEBUG bool "Enable dynamic printk() call support" default n depends on PRINTK diff --git a/lib/Makefile b/lib/Makefile index 32b0e64ded27..8633d6be9d21 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_HAVE_LMB) += lmb.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o +obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c new file mode 100644 index 000000000000..9e123ae326bc --- /dev/null +++ b/lib/dynamic_debug.c @@ -0,0 +1,756 @@ +/* + * lib/dynamic_debug.c + * + * make pr_debug()/dev_dbg() calls runtime configurable based upon their + * source module. + * + * Copyright (C) 2008 Jason Baron + * By Greg Banks + * Copyright (c) 2008 Silicon Graphics Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern struct _ddebug __start___verbose[]; +extern struct _ddebug __stop___verbose[]; + +/* dynamic_debug_enabled, and dynamic_debug_enabled2 are bitmasks in which + * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They + * use independent hash functions, to reduce the chance of false positives. 
+ */ +long long dynamic_debug_enabled; +EXPORT_SYMBOL_GPL(dynamic_debug_enabled); +long long dynamic_debug_enabled2; +EXPORT_SYMBOL_GPL(dynamic_debug_enabled2); + +struct ddebug_table { + struct list_head link; + char *mod_name; + unsigned int num_ddebugs; + unsigned int num_enabled; + struct _ddebug *ddebugs; +}; + +struct ddebug_query { + const char *filename; + const char *module; + const char *function; + const char *format; + unsigned int first_lineno, last_lineno; +}; + +struct ddebug_iter { + struct ddebug_table *table; + unsigned int idx; +}; + +static DEFINE_MUTEX(ddebug_lock); +static LIST_HEAD(ddebug_tables); +static int verbose = 0; + +/* Return the last part of a pathname */ +static inline const char *basename(const char *path) +{ + const char *tail = strrchr(path, '/'); + return tail ? tail+1 : path; +} + +/* format a string into buf[] which describes the _ddebug's flags */ +static char *ddebug_describe_flags(struct _ddebug *dp, char *buf, + size_t maxlen) +{ + char *p = buf; + + BUG_ON(maxlen < 4); + if (dp->flags & _DPRINTK_FLAGS_PRINT) + *p++ = 'p'; + if (p == buf) + *p++ = '-'; + *p = '\0'; + + return buf; +} + +/* + * must be called with ddebug_lock held + */ + +static int disabled_hash(char hash, bool first_table) +{ + struct ddebug_table *dt; + char table_hash_value; + + list_for_each_entry(dt, &ddebug_tables, link) { + if (first_table) + table_hash_value = dt->ddebugs->primary_hash; + else + table_hash_value = dt->ddebugs->secondary_hash; + if (dt->num_enabled && (hash == table_hash_value)) + return 0; + } + return 1; +} + +/* + * Search the tables for _ddebug's which match the given + * `query' and apply the `flags' and `mask' to them. Tells + * the user which ddebug's were changed, or whether none + * were matched. + */ +static void ddebug_change(const struct ddebug_query *query, + unsigned int flags, unsigned int mask) +{ + int i; + struct ddebug_table *dt; + unsigned int newflags; + unsigned int nfound = 0; + char flagbuf[8]; + + /* search for matching ddebugs */ + mutex_lock(&ddebug_lock); + list_for_each_entry(dt, &ddebug_tables, link) { + + /* match against the module name */ + if (query->module != NULL && + strcmp(query->module, dt->mod_name)) + continue; + + for (i = 0 ; i < dt->num_ddebugs ; i++) { + struct _ddebug *dp = &dt->ddebugs[i]; + + /* match against the source filename */ + if (query->filename != NULL && + strcmp(query->filename, dp->filename) && + strcmp(query->filename, basename(dp->filename))) + continue; + + /* match against the function */ + if (query->function != NULL && + strcmp(query->function, dp->function)) + continue; + + /* match against the format */ + if (query->format != NULL && + strstr(dp->format, query->format) == NULL) + continue; + + /* match against the line number range */ + if (query->first_lineno && + dp->lineno < query->first_lineno) + continue; + if (query->last_lineno && + dp->lineno > query->last_lineno) + continue; + + nfound++; + + newflags = (dp->flags & mask) | flags; + if (newflags == dp->flags) + continue; + + if (!newflags) + dt->num_enabled--; + else if (!dp-flags) + dt->num_enabled++; + dp->flags = newflags; + if (newflags) { + dynamic_debug_enabled |= + (1LL << dp->primary_hash); + dynamic_debug_enabled2 |= + (1LL << dp->secondary_hash); + } else { + if (disabled_hash(dp->primary_hash, true)) + dynamic_debug_enabled &= + ~(1LL << dp->primary_hash); + if (disabled_hash(dp->secondary_hash, false)) + dynamic_debug_enabled2 &= + ~(1LL << dp->secondary_hash); + } + if (verbose) + printk(KERN_INFO + "ddebug: changed 
%s:%d [%s]%s %s\n", + dp->filename, dp->lineno, + dt->mod_name, dp->function, + ddebug_describe_flags(dp, flagbuf, + sizeof(flagbuf))); + } + } + mutex_unlock(&ddebug_lock); + + if (!nfound && verbose) + printk(KERN_INFO "ddebug: no matches for query\n"); +} + +/* + * Wrapper around strsep() to collapse the multiple empty tokens + * that it returns when fed sequences of separator characters. + * Now, if we had strtok_r()... + */ +static inline char *nearly_strtok_r(char **p, const char *sep) +{ + char *r; + + while ((r = strsep(p, sep)) != NULL && *r == '\0') + ; + return r; +} + +/* + * Split the buffer `buf' into space-separated words. + * Return the number of such words or <0 on error. + */ +static int ddebug_tokenize(char *buf, char *words[], int maxwords) +{ + int nwords = 0; + + while (nwords < maxwords && + (words[nwords] = nearly_strtok_r(&buf, " \t\r\n")) != NULL) + nwords++; + if (buf) + return -EINVAL; /* ran out of words[] before bytes */ + + if (verbose) { + int i; + printk(KERN_INFO "%s: split into words:", __func__); + for (i = 0 ; i < nwords ; i++) + printk(" \"%s\"", words[i]); + printk("\n"); + } + + return nwords; +} + +/* + * Parse a single line number. Note that the empty string "" + * is treated as a special case and converted to zero, which + * is later treated as a "don't care" value. + */ +static inline int parse_lineno(const char *str, unsigned int *val) +{ + char *end = NULL; + BUG_ON(str == NULL); + if (*str == '\0') { + *val = 0; + return 0; + } + *val = simple_strtoul(str, &end, 10); + return end == NULL || end == str || *end != '\0' ? -EINVAL : 0; +} + +/* + * Undo octal escaping in a string, inplace. This is useful to + * allow the user to express a query which matches a format + * containing embedded spaces. + */ +#define isodigit(c) ((c) >= '0' && (c) <= '7') +static char *unescape(char *str) +{ + char *in = str; + char *out = str; + + while (*in) { + if (*in == '\\') { + if (in[1] == '\\') { + *out++ = '\\'; + in += 2; + continue; + } else if (in[1] == 't') { + *out++ = '\t'; + in += 2; + continue; + } else if (in[1] == 'n') { + *out++ = '\n'; + in += 2; + continue; + } else if (isodigit(in[1]) && + isodigit(in[2]) && + isodigit(in[3])) { + *out++ = ((in[1] - '0')<<6) | + ((in[2] - '0')<<3) | + (in[3] - '0'); + in += 4; + continue; + } + } + *out++ = *in++; + } + *out = '\0'; + + return str; +} + +/* + * Parse words[] as a ddebug query specification, which is a series + * of (keyword, value) pairs chosen from these possibilities: + * + * func + * file + * file + * module + * format + * line + * line - // where either may be empty + */ +static int ddebug_parse_query(char *words[], int nwords, + struct ddebug_query *query) +{ + unsigned int i; + + /* check we have an even number of words */ + if (nwords % 2 != 0) + return -EINVAL; + memset(query, 0, sizeof(*query)); + + for (i = 0 ; i < nwords ; i += 2) { + if (!strcmp(words[i], "func")) + query->function = words[i+1]; + else if (!strcmp(words[i], "file")) + query->filename = words[i+1]; + else if (!strcmp(words[i], "module")) + query->module = words[i+1]; + else if (!strcmp(words[i], "format")) + query->format = unescape(words[i+1]); + else if (!strcmp(words[i], "line")) { + char *first = words[i+1]; + char *last = strchr(first, '-'); + if (last) + *last++ = '\0'; + if (parse_lineno(first, &query->first_lineno) < 0) + return -EINVAL; + if (last != NULL) { + /* range - */ + if (parse_lineno(last, &query->last_lineno) < 0) + return -EINVAL; + } else { + query->last_lineno = query->first_lineno; + } + } else 
{ + if (verbose) + printk(KERN_ERR "%s: unknown keyword \"%s\"\n", + __func__, words[i]); + return -EINVAL; + } + } + + if (verbose) + printk(KERN_INFO "%s: q->function=\"%s\" q->filename=\"%s\" " + "q->module=\"%s\" q->format=\"%s\" q->lineno=%u-%u\n", + __func__, query->function, query->filename, + query->module, query->format, query->first_lineno, + query->last_lineno); + + return 0; +} + +/* + * Parse `str' as a flags specification, format [-+=][p]+. + * Sets up *maskp and *flagsp to be used when changing the + * flags fields of matched _ddebug's. Returns 0 on success + * or <0 on error. + */ +static int ddebug_parse_flags(const char *str, unsigned int *flagsp, + unsigned int *maskp) +{ + unsigned flags = 0; + int op = '='; + + switch (*str) { + case '+': + case '-': + case '=': + op = *str++; + break; + default: + return -EINVAL; + } + if (verbose) + printk(KERN_INFO "%s: op='%c'\n", __func__, op); + + for ( ; *str ; ++str) { + switch (*str) { + case 'p': + flags |= _DPRINTK_FLAGS_PRINT; + break; + default: + return -EINVAL; + } + } + if (flags == 0) + return -EINVAL; + if (verbose) + printk(KERN_INFO "%s: flags=0x%x\n", __func__, flags); + + /* calculate final *flagsp, *maskp according to mask and op */ + switch (op) { + case '=': + *maskp = 0; + *flagsp = flags; + break; + case '+': + *maskp = ~0U; + *flagsp = flags; + break; + case '-': + *maskp = ~flags; + *flagsp = 0; + break; + } + if (verbose) + printk(KERN_INFO "%s: *flagsp=0x%x *maskp=0x%x\n", + __func__, *flagsp, *maskp); + return 0; +} + +/* + * File_ops->write method for /dynamic_debug/conrol. Gathers the + * command text from userspace, parses and executes it. + */ +static ssize_t ddebug_proc_write(struct file *file, const char __user *ubuf, + size_t len, loff_t *offp) +{ + unsigned int flags = 0, mask = 0; + struct ddebug_query query; +#define MAXWORDS 9 + int nwords; + char *words[MAXWORDS]; + char tmpbuf[256]; + + if (len == 0) + return 0; + /* we don't check *offp -- multiple writes() are allowed */ + if (len > sizeof(tmpbuf)-1) + return -E2BIG; + if (copy_from_user(tmpbuf, ubuf, len)) + return -EFAULT; + tmpbuf[len] = '\0'; + if (verbose) + printk(KERN_INFO "%s: read %d bytes from userspace\n", + __func__, (int)len); + + nwords = ddebug_tokenize(tmpbuf, words, MAXWORDS); + if (nwords < 0) + return -EINVAL; + if (ddebug_parse_query(words, nwords-1, &query)) + return -EINVAL; + if (ddebug_parse_flags(words[nwords-1], &flags, &mask)) + return -EINVAL; + + /* actually go and implement the change */ + ddebug_change(&query, flags, mask); + + *offp += len; + return len; +} + +/* + * Set the iterator to point to the first _ddebug object + * and return a pointer to that first object. Returns + * NULL if there are no _ddebugs at all. + */ +static struct _ddebug *ddebug_iter_first(struct ddebug_iter *iter) +{ + if (list_empty(&ddebug_tables)) { + iter->table = NULL; + iter->idx = 0; + return NULL; + } + iter->table = list_entry(ddebug_tables.next, + struct ddebug_table, link); + iter->idx = 0; + return &iter->table->ddebugs[iter->idx]; +} + +/* + * Advance the iterator to point to the next _ddebug + * object from the one the iterator currently points at, + * and returns a pointer to the new _ddebug. Returns + * NULL if the iterator has seen all the _ddebugs. 
+ */ +static struct _ddebug *ddebug_iter_next(struct ddebug_iter *iter) +{ + if (iter->table == NULL) + return NULL; + if (++iter->idx == iter->table->num_ddebugs) { + /* iterate to next table */ + iter->idx = 0; + if (list_is_last(&iter->table->link, &ddebug_tables)) { + iter->table = NULL; + return NULL; + } + iter->table = list_entry(iter->table->link.next, + struct ddebug_table, link); + } + return &iter->table->ddebugs[iter->idx]; +} + +/* + * Seq_ops start method. Called at the start of every + * read() call from userspace. Takes the ddebug_lock and + * seeks the seq_file's iterator to the given position. + */ +static void *ddebug_proc_start(struct seq_file *m, loff_t *pos) +{ + struct ddebug_iter *iter = m->private; + struct _ddebug *dp; + int n = *pos; + + if (verbose) + printk(KERN_INFO "%s: called m=%p *pos=%lld\n", + __func__, m, (unsigned long long)*pos); + + mutex_lock(&ddebug_lock); + + if (!n) + return SEQ_START_TOKEN; + if (n < 0) + return NULL; + dp = ddebug_iter_first(iter); + while (dp != NULL && --n > 0) + dp = ddebug_iter_next(iter); + return dp; +} + +/* + * Seq_ops next method. Called several times within a read() + * call from userspace, with ddebug_lock held. Walks to the + * next _ddebug object with a special case for the header line. + */ +static void *ddebug_proc_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct ddebug_iter *iter = m->private; + struct _ddebug *dp; + + if (verbose) + printk(KERN_INFO "%s: called m=%p p=%p *pos=%lld\n", + __func__, m, p, (unsigned long long)*pos); + + if (p == SEQ_START_TOKEN) + dp = ddebug_iter_first(iter); + else + dp = ddebug_iter_next(iter); + ++*pos; + return dp; +} + +/* + * Seq_ops show method. Called several times within a read() + * call from userspace, with ddebug_lock held. Formats the + * current _ddebug as a single human-readable line, with a + * special case for the header line. + */ +static int ddebug_proc_show(struct seq_file *m, void *p) +{ + struct ddebug_iter *iter = m->private; + struct _ddebug *dp = p; + char flagsbuf[8]; + + if (verbose) + printk(KERN_INFO "%s: called m=%p p=%p\n", + __func__, m, p); + + if (p == SEQ_START_TOKEN) { + seq_puts(m, + "# filename:lineno [module]function flags format\n"); + return 0; + } + + seq_printf(m, "%s:%u [%s]%s %s \"", + dp->filename, dp->lineno, + iter->table->mod_name, dp->function, + ddebug_describe_flags(dp, flagsbuf, sizeof(flagsbuf))); + seq_escape(m, dp->format, "\t\r\n\""); + seq_puts(m, "\"\n"); + + return 0; +} + +/* + * Seq_ops stop method. Called at the end of each read() + * call from userspace. Drops ddebug_lock. + */ +static void ddebug_proc_stop(struct seq_file *m, void *p) +{ + if (verbose) + printk(KERN_INFO "%s: called m=%p p=%p\n", + __func__, m, p); + mutex_unlock(&ddebug_lock); +} + +static const struct seq_operations ddebug_proc_seqops = { + .start = ddebug_proc_start, + .next = ddebug_proc_next, + .show = ddebug_proc_show, + .stop = ddebug_proc_stop +}; + +/* + * File_ops->open method for /dynamic_debug/control. Does the seq_file + * setup dance, and also creates an iterator to walk the _ddebugs. + * Note that we create a seq_file always, even for O_WRONLY files + * where it's not needed, as doing so simplifies the ->release method. 
+ */ +static int ddebug_proc_open(struct inode *inode, struct file *file) +{ + struct ddebug_iter *iter; + int err; + + if (verbose) + printk(KERN_INFO "%s: called\n", __func__); + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (iter == NULL) + return -ENOMEM; + + err = seq_open(file, &ddebug_proc_seqops); + if (err) { + kfree(iter); + return err; + } + ((struct seq_file *) file->private_data)->private = iter; + return 0; +} + +static const struct file_operations ddebug_proc_fops = { + .owner = THIS_MODULE, + .open = ddebug_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, + .write = ddebug_proc_write +}; + +/* + * Allocate a new ddebug_table for the given module + * and add it to the global list. + */ +int ddebug_add_module(struct _ddebug *tab, unsigned int n, + const char *name) +{ + struct ddebug_table *dt; + char *new_name; + + dt = kzalloc(sizeof(*dt), GFP_KERNEL); + if (dt == NULL) + return -ENOMEM; + new_name = kstrdup(name, GFP_KERNEL); + if (new_name == NULL) { + kfree(dt); + return -ENOMEM; + } + dt->mod_name = new_name; + dt->num_ddebugs = n; + dt->num_enabled = 0; + dt->ddebugs = tab; + + mutex_lock(&ddebug_lock); + list_add_tail(&dt->link, &ddebug_tables); + mutex_unlock(&ddebug_lock); + + if (verbose) + printk(KERN_INFO "%u debug prints in module %s\n", + n, dt->mod_name); + return 0; +} +EXPORT_SYMBOL_GPL(ddebug_add_module); + +static void ddebug_table_free(struct ddebug_table *dt) +{ + list_del_init(&dt->link); + kfree(dt->mod_name); + kfree(dt); +} + +/* + * Called in response to a module being unloaded. Removes + * any ddebug_table's which point at the module. + */ +int ddebug_remove_module(char *mod_name) +{ + struct ddebug_table *dt, *nextdt; + int ret = -ENOENT; + + if (verbose) + printk(KERN_INFO "%s: removing module \"%s\"\n", + __func__, mod_name); + + mutex_lock(&ddebug_lock); + list_for_each_entry_safe(dt, nextdt, &ddebug_tables, link) { + if (!strcmp(dt->mod_name, mod_name)) { + ddebug_table_free(dt); + ret = 0; + } + } + mutex_unlock(&ddebug_lock); + return ret; +} +EXPORT_SYMBOL_GPL(ddebug_remove_module); + +static void ddebug_remove_all_tables(void) +{ + mutex_lock(&ddebug_lock); + while (!list_empty(&ddebug_tables)) { + struct ddebug_table *dt = list_entry(ddebug_tables.next, + struct ddebug_table, + link); + ddebug_table_free(dt); + } + mutex_unlock(&ddebug_lock); +} + +static int __init dynamic_debug_init(void) +{ + struct dentry *dir, *file; + struct _ddebug *iter, *iter_start; + const char *modname = NULL; + int ret = 0; + int n = 0; + + dir = debugfs_create_dir("dynamic_debug", NULL); + if (!dir) + return -ENOMEM; + file = debugfs_create_file("control", 0644, dir, NULL, + &ddebug_proc_fops); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + if (__start___verbose != __stop___verbose) { + iter = __start___verbose; + modname = iter->modname; + iter_start = iter; + for (; iter < __stop___verbose; iter++) { + if (strcmp(modname, iter->modname)) { + ret = ddebug_add_module(iter_start, n, modname); + if (ret) + goto out_free; + n = 0; + modname = iter->modname; + iter_start = iter; + } + n++; + } + ret = ddebug_add_module(iter_start, n, modname); + } +out_free: + if (ret) { + ddebug_remove_all_tables(); + debugfs_remove(dir); + debugfs_remove(file); + } + return 0; +} +module_init(dynamic_debug_init); diff --git a/lib/dynamic_printk.c b/lib/dynamic_printk.c deleted file mode 100644 index 165a19763dc9..000000000000 --- a/lib/dynamic_printk.c +++ /dev/null @@ -1,414 +0,0 @@ -/* - * lib/dynamic_printk.c - * - * 
make pr_debug()/dev_dbg() calls runtime configurable based upon their - * their source module. - * - * Copyright (C) 2008 Red Hat, Inc., Jason Baron - */ - -#include -#include -#include -#include -#include -#include - -extern struct mod_debug __start___verbose[]; -extern struct mod_debug __stop___verbose[]; - -struct debug_name { - struct hlist_node hlist; - struct hlist_node hlist2; - int hash1; - int hash2; - char *name; - int enable; - int type; -}; - -static int nr_entries; -static int num_enabled; -int dynamic_enabled = DYNAMIC_ENABLED_NONE; -static struct hlist_head module_table[DEBUG_HASH_TABLE_SIZE] = - { [0 ... DEBUG_HASH_TABLE_SIZE-1] = HLIST_HEAD_INIT }; -static struct hlist_head module_table2[DEBUG_HASH_TABLE_SIZE] = - { [0 ... DEBUG_HASH_TABLE_SIZE-1] = HLIST_HEAD_INIT }; -static DECLARE_MUTEX(debug_list_mutex); - -/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which - * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They - * use independent hash functions, to reduce the chance of false positives. - */ -long long dynamic_printk_enabled; -EXPORT_SYMBOL_GPL(dynamic_printk_enabled); -long long dynamic_printk_enabled2; -EXPORT_SYMBOL_GPL(dynamic_printk_enabled2); - -/* returns the debug module pointer. */ -static struct debug_name *find_debug_module(char *module_name) -{ - int i; - struct hlist_head *head; - struct hlist_node *node; - struct debug_name *element; - - element = NULL; - for (i = 0; i < DEBUG_HASH_TABLE_SIZE; i++) { - head = &module_table[i]; - hlist_for_each_entry_rcu(element, node, head, hlist) - if (!strcmp(element->name, module_name)) - return element; - } - return NULL; -} - -/* returns the debug module pointer. */ -static struct debug_name *find_debug_module_hash(char *module_name, int hash) -{ - struct hlist_head *head; - struct hlist_node *node; - struct debug_name *element; - - element = NULL; - head = &module_table[hash]; - hlist_for_each_entry_rcu(element, node, head, hlist) - if (!strcmp(element->name, module_name)) - return element; - return NULL; -} - -/* caller must hold mutex*/ -static int __add_debug_module(char *mod_name, int hash, int hash2) -{ - struct debug_name *new; - char *module_name; - int ret = 0; - - if (find_debug_module(mod_name)) { - ret = -EINVAL; - goto out; - } - module_name = kmalloc(strlen(mod_name) + 1, GFP_KERNEL); - if (!module_name) { - ret = -ENOMEM; - goto out; - } - module_name = strcpy(module_name, mod_name); - module_name[strlen(mod_name)] = '\0'; - new = kzalloc(sizeof(struct debug_name), GFP_KERNEL); - if (!new) { - kfree(module_name); - ret = -ENOMEM; - goto out; - } - INIT_HLIST_NODE(&new->hlist); - INIT_HLIST_NODE(&new->hlist2); - new->name = module_name; - new->hash1 = hash; - new->hash2 = hash2; - hlist_add_head_rcu(&new->hlist, &module_table[hash]); - hlist_add_head_rcu(&new->hlist2, &module_table2[hash2]); - nr_entries++; -out: - return ret; -} - -int unregister_dynamic_debug_module(char *mod_name) -{ - struct debug_name *element; - int ret = 0; - - down(&debug_list_mutex); - element = find_debug_module(mod_name); - if (!element) { - ret = -EINVAL; - goto out; - } - hlist_del_rcu(&element->hlist); - hlist_del_rcu(&element->hlist2); - synchronize_rcu(); - kfree(element->name); - if (element->enable) - num_enabled--; - kfree(element); - nr_entries--; -out: - up(&debug_list_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(unregister_dynamic_debug_module); - -int register_dynamic_debug_module(char *mod_name, int type, char *share_name, - char *flags, int hash, int hash2) -{ - 
struct debug_name *elem; - int ret = 0; - - down(&debug_list_mutex); - elem = find_debug_module(mod_name); - if (!elem) { - if (__add_debug_module(mod_name, hash, hash2)) - goto out; - elem = find_debug_module(mod_name); - if (dynamic_enabled == DYNAMIC_ENABLED_ALL && - !strcmp(mod_name, share_name)) { - elem->enable = true; - num_enabled++; - } - } - elem->type |= type; -out: - up(&debug_list_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(register_dynamic_debug_module); - -int __dynamic_dbg_enabled_helper(char *mod_name, int type, int value, int hash) -{ - struct debug_name *elem; - int ret = 0; - - if (dynamic_enabled == DYNAMIC_ENABLED_ALL) - return 1; - rcu_read_lock(); - elem = find_debug_module_hash(mod_name, hash); - if (elem && elem->enable) - ret = 1; - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(__dynamic_dbg_enabled_helper); - -static void set_all(bool enable) -{ - struct debug_name *e; - struct hlist_node *node; - int i; - long long enable_mask; - - for (i = 0; i < DEBUG_HASH_TABLE_SIZE; i++) { - if (module_table[i].first != NULL) { - hlist_for_each_entry(e, node, &module_table[i], hlist) { - e->enable = enable; - } - } - } - if (enable) - enable_mask = ULLONG_MAX; - else - enable_mask = 0; - dynamic_printk_enabled = enable_mask; - dynamic_printk_enabled2 = enable_mask; -} - -static int disabled_hash(int i, bool first_table) -{ - struct debug_name *e; - struct hlist_node *node; - - if (first_table) { - hlist_for_each_entry(e, node, &module_table[i], hlist) { - if (e->enable) - return 0; - } - } else { - hlist_for_each_entry(e, node, &module_table2[i], hlist2) { - if (e->enable) - return 0; - } - } - return 1; -} - -static ssize_t pr_debug_write(struct file *file, const char __user *buf, - size_t length, loff_t *ppos) -{ - char *buffer, *s, *value_str, *setting_str; - int err, value; - struct debug_name *elem = NULL; - int all = 0; - - if (length > PAGE_SIZE || length < 0) - return -EINVAL; - - buffer = (char *)__get_free_page(GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - err = -EFAULT; - if (copy_from_user(buffer, buf, length)) - goto out; - - err = -EINVAL; - if (length < PAGE_SIZE) - buffer[length] = '\0'; - else if (buffer[PAGE_SIZE-1]) - goto out; - - err = -EINVAL; - down(&debug_list_mutex); - - if (strncmp("set", buffer, 3)) - goto out_up; - s = buffer + 3; - setting_str = strsep(&s, "="); - if (s == NULL) - goto out_up; - setting_str = strstrip(setting_str); - value_str = strsep(&s, " "); - if (s == NULL) - goto out_up; - s = strstrip(s); - if (!strncmp(s, "all", 3)) - all = 1; - else - elem = find_debug_module(s); - if (!strncmp(setting_str, "enable", 6)) { - value = !!simple_strtol(value_str, NULL, 10); - if (all) { - if (value) { - set_all(true); - num_enabled = nr_entries; - dynamic_enabled = DYNAMIC_ENABLED_ALL; - } else { - set_all(false); - num_enabled = 0; - dynamic_enabled = DYNAMIC_ENABLED_NONE; - } - err = 0; - } else if (elem) { - if (value && (elem->enable == 0)) { - dynamic_printk_enabled |= (1LL << elem->hash1); - dynamic_printk_enabled2 |= (1LL << elem->hash2); - elem->enable = 1; - num_enabled++; - dynamic_enabled = DYNAMIC_ENABLED_SOME; - err = 0; - printk(KERN_DEBUG - "debugging enabled for module %s\n", - elem->name); - } else if (!value && (elem->enable == 1)) { - elem->enable = 0; - num_enabled--; - if (disabled_hash(elem->hash1, true)) - dynamic_printk_enabled &= - ~(1LL << elem->hash1); - if (disabled_hash(elem->hash2, false)) - dynamic_printk_enabled2 &= - ~(1LL << elem->hash2); - if (num_enabled) - dynamic_enabled = 
DYNAMIC_ENABLED_SOME; - else - dynamic_enabled = DYNAMIC_ENABLED_NONE; - err = 0; - printk(KERN_DEBUG - "debugging disabled for module %s\n", - elem->name); - } - } - } - if (!err) - err = length; -out_up: - up(&debug_list_mutex); -out: - free_page((unsigned long)buffer); - return err; -} - -static void *pr_debug_seq_start(struct seq_file *f, loff_t *pos) -{ - return (*pos < DEBUG_HASH_TABLE_SIZE) ? pos : NULL; -} - -static void *pr_debug_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - if (*pos >= DEBUG_HASH_TABLE_SIZE) - return NULL; - return pos; -} - -static void pr_debug_seq_stop(struct seq_file *s, void *v) -{ - /* Nothing to do */ -} - -static int pr_debug_seq_show(struct seq_file *s, void *v) -{ - struct hlist_head *head; - struct hlist_node *node; - struct debug_name *elem; - unsigned int i = *(loff_t *) v; - - rcu_read_lock(); - head = &module_table[i]; - hlist_for_each_entry_rcu(elem, node, head, hlist) { - seq_printf(s, "%s enabled=%d", elem->name, elem->enable); - seq_printf(s, "\n"); - } - rcu_read_unlock(); - return 0; -} - -static struct seq_operations pr_debug_seq_ops = { - .start = pr_debug_seq_start, - .next = pr_debug_seq_next, - .stop = pr_debug_seq_stop, - .show = pr_debug_seq_show -}; - -static int pr_debug_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &pr_debug_seq_ops); -} - -static const struct file_operations pr_debug_operations = { - .open = pr_debug_open, - .read = seq_read, - .write = pr_debug_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init dynamic_printk_init(void) -{ - struct dentry *dir, *file; - struct mod_debug *iter; - unsigned long value; - - dir = debugfs_create_dir("dynamic_printk", NULL); - if (!dir) - return -ENOMEM; - file = debugfs_create_file("modules", 0644, dir, NULL, - &pr_debug_operations); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } - for (value = (unsigned long)__start___verbose; - value < (unsigned long)__stop___verbose; - value += sizeof(struct mod_debug)) { - iter = (struct mod_debug *)value; - register_dynamic_debug_module(iter->modname, - iter->type, - iter->logical_modname, - iter->flag_names, iter->hash, iter->hash2); - } - if (dynamic_enabled == DYNAMIC_ENABLED_ALL) - set_all(true); - return 0; -} -module_init(dynamic_printk_init); -/* may want to move this earlier so we can get traces as early as possible */ - -static int __init dynamic_printk_setup(char *str) -{ - if (str) - return -ENOENT; - dynamic_enabled = DYNAMIC_ENABLED_ALL; - return 0; -} -/* Use early_param(), so we can get debug output as early as possible */ -early_param("dynamic_printk", dynamic_printk_setup); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 9e169ef2e854..12bd09dbd36c 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -66,7 +66,7 @@ void struct nf_conntrack_expect *exp) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); -#if defined(DEBUG) || defined(CONFIG_DYNAMIC_PRINTK_DEBUG) +#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) /* PptpControlMessageType names */ const char *const pptp_msg_name[] = { "UNKNOWN_MESSAGE", diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index e06365775bdf..c18fa150b6fe 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -97,7 +97,7 @@ modname_flags = $(if $(filter 1,$(words $(modname))),\ -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))") #hash values -ifdef CONFIG_DYNAMIC_PRINTK_DEBUG +ifdef CONFIG_DYNAMIC_DEBUG 
debug_flags = -D"DEBUG_HASH=$(shell ./scripts/basic/hash djb2 $(@D)$(modname))"\ -D"DEBUG_HASH2=$(shell ./scripts/basic/hash r5 $(@D)$(modname))" else -- cgit v1.2.3-71-gd317 From e6e66b02e11563abdb7f69dcb7a2efbd8d577e77 Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Wed, 11 Mar 2009 21:07:28 +1100 Subject: Dynamic debug: fix pr_fmt() build error When CONFIG_DYNAMIC_DEBUG is enabled, allow callers of pr_debug() to provide their own definition of pr_fmt() even if that definition uses tricks like #define pr_fmt(fmt) "%s:" fmt, __func__ Signed-off-by: Greg Banks Cc: Jason Baron Acked-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- include/linux/dynamic_debug.h | 4 ++-- include/linux/kernel.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 07781aaa1164..baabf33be244 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -57,7 +57,7 @@ extern int ddebug_remove_module(char *mod_name); { KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH, \ DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT }; \ if (__dynamic_dbg_enabled(descriptor)) \ - printk(KERN_DEBUG KBUILD_MODNAME ":" fmt, \ + printk(KERN_DEBUG KBUILD_MODNAME ":" pr_fmt(fmt), \ ##__VA_ARGS__); \ } while (0) @@ -70,7 +70,7 @@ extern int ddebug_remove_module(char *mod_name); DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT }; \ if (__dynamic_dbg_enabled(descriptor)) \ dev_printk(KERN_DEBUG, dev, \ - KBUILD_MODNAME ": " fmt, \ + KBUILD_MODNAME ": " pr_fmt(fmt),\ ##__VA_ARGS__); \ } while (0) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b5496ecbec71..914918abfdd1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -359,8 +359,9 @@ static inline char *pack_hex_byte(char *buf, u8 byte) #define pr_debug(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #elif defined(CONFIG_DYNAMIC_DEBUG) +/* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */ #define pr_debug(fmt, ...) do { \ - dynamic_pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ + dynamic_pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #else #define pr_debug(fmt, ...) \ -- cgit v1.2.3-71-gd317 From 1a660164c291f41b2aa853a7269b310933574ef9 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Wed, 25 Feb 2009 15:18:32 -0500 Subject: [libata] Export ata_pio_queue_task() so that it can be used from sata_mv. 
Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 1 + drivers/ata/libata.h | 2 -- include/linux/libata.h | 3 +++ 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 060bcd601f57..d4a7b8a96ecd 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -6709,6 +6709,7 @@ EXPORT_SYMBOL_GPL(ata_id_c_string); EXPORT_SYMBOL_GPL(ata_do_dev_read_id); EXPORT_SYMBOL_GPL(ata_scsi_simulate); +EXPORT_SYMBOL_GPL(ata_pio_queue_task); EXPORT_SYMBOL_GPL(ata_pio_need_iordy); EXPORT_SYMBOL_GPL(ata_timing_find_mode); EXPORT_SYMBOL_GPL(ata_timing_compute); diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index cea8014cd87e..89a1e0018e71 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -79,8 +79,6 @@ extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev, u64 block, u32 n_block, unsigned int tf_flags, unsigned int tag); extern u64 ata_tf_read_block(struct ata_taskfile *tf, struct ata_device *dev); -extern void ata_pio_queue_task(struct ata_port *ap, void *data, - unsigned long delay); extern void ata_port_flush_task(struct ata_port *ap); extern unsigned ata_exec_internal(struct ata_device *dev, struct ata_taskfile *tf, const u8 *cdb, diff --git a/include/linux/libata.h b/include/linux/libata.h index dc18b87ed722..19af7d22a7f8 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1008,6 +1008,9 @@ extern int ata_cable_sata(struct ata_port *ap); extern int ata_cable_ignore(struct ata_port *ap); extern int ata_cable_unknown(struct ata_port *ap); +extern void ata_pio_queue_task(struct ata_port *ap, void *data, + unsigned long delay); + /* Timing helpers */ extern unsigned int ata_pio_need_iordy(const struct ata_device *); extern const struct ata_timing *ata_timing_find_mode(u8 xfer_mode); -- cgit v1.2.3-71-gd317 From 22ddbd1e036ce035c1cccb2496aefafac79aba2c Mon Sep 17 00:00:00 2001 From: Erik Inge Bolsø Date: Sat, 14 Mar 2009 21:37:48 +0100 Subject: include/linux/ata.h: add some more transfer masks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Erik Inge Bolsø Signed-off-by: Jeff Garzik --- include/linux/ata.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 9a061accd8b8..3901b0022cda 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -108,6 +108,8 @@ enum { ATA_PIO5 = ATA_PIO4 | (1 << 5), ATA_PIO6 = ATA_PIO5 | (1 << 6), + ATA_PIO4_ONLY = (1 << 4), + ATA_SWDMA0 = (1 << 0), ATA_SWDMA1 = ATA_SWDMA0 | (1 << 1), ATA_SWDMA2 = ATA_SWDMA1 | (1 << 2), @@ -117,6 +119,8 @@ enum { ATA_MWDMA0 = (1 << 0), ATA_MWDMA1 = ATA_MWDMA0 | (1 << 1), ATA_MWDMA2 = ATA_MWDMA1 | (1 << 2), + ATA_MWDMA3 = ATA_MWDMA2 | (1 << 3), + ATA_MWDMA4 = ATA_MWDMA3 | (1 << 4), ATA_MWDMA12_ONLY = (1 << 1) | (1 << 2), ATA_MWDMA2_ONLY = (1 << 2), @@ -131,6 +135,8 @@ enum { ATA_UDMA7 = ATA_UDMA6 | (1 << 7), /* ATA_UDMA7 is just for completeness... doesn't exist (yet?). */ + ATA_UDMA24_ONLY = (1 << 2) | (1 << 4), + ATA_UDMA_MASK_40C = ATA_UDMA2, /* udma0-2 */ /* DMA-related */ -- cgit v1.2.3-71-gd317 From 3d47aa8e7e7b2aa09256590388aa8dddc79280f9 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 24 Mar 2009 10:23:19 +0000 Subject: [libata] Drain data on errors If the device is signalling that there is data to drain after an error we should read the bytes out and throw them away. 
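A hedged sketch of how the new hook is used (mirroring what this patch does
for the stock SFF ops and for pata_pcmcia; the "my_*" names are illustrative
only): drivers that inherit ata_sff_port_ops pick up the generic drain
automatically, while a controller with an unusual data-port width supplies
its own routine, as the 8-bit PCMCIA driver does below.

	/* hypothetical driver-specific drain, e.g. byte-wide data register reads */
	static void my_8bit_drain_fifo(struct ata_queued_cmd *qc)
	{
		/* read and discard pending data, as the generic helper does */
	}

	static struct ata_port_operations my_8bit_port_ops = {
		.inherits	= &ata_sff_port_ops,	/* default .drain_fifo lives here */
		.drain_fifo	= my_8bit_drain_fifo,	/* override for this hardware */
	};
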
Without this some devices and controllers get wedged and don't recover. Based on earlier work by Mark Lord Signed-off-by: Alan Cox Signed-off-by: Jeff Garzik --- drivers/ata/libata-sff.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- drivers/ata/pata_pcmcia.c | 34 +++++++++++++++++++++++++++++++++- include/linux/libata.h | 3 +++ 3 files changed, 79 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index f93dc029dfde..9a10cb055ac2 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -52,6 +52,7 @@ const struct ata_port_operations ata_sff_port_ops = { .softreset = ata_sff_softreset, .hardreset = sata_sff_hardreset, .postreset = ata_sff_postreset, + .drain_fifo = ata_sff_drain_fifo, .error_handler = ata_sff_error_handler, .post_internal_cmd = ata_sff_post_internal_cmd, @@ -2198,6 +2199,39 @@ void ata_sff_postreset(struct ata_link *link, unsigned int *classes) } EXPORT_SYMBOL_GPL(ata_sff_postreset); +/** + * ata_sff_drain_fifo - Stock FIFO drain logic for SFF controllers + * @qc: command + * + * Drain the FIFO and device of any stuck data following a command + * failing to complete. In some cases this is neccessary before a + * reset will recover the device. + * + */ + +void ata_sff_drain_fifo(struct ata_queued_cmd *qc) +{ + int count; + struct ata_port *ap; + + /* We only need to flush incoming data when a command was running */ + if (qc == NULL || qc->dma_dir == DMA_TO_DEVICE) + return; + + ap = qc->ap; + /* Drain up to 64K of data before we give up this recovery method */ + for (count = 0; (ap->ops->sff_check_status(ap) & ATA_DRQ) + && count < 32768; count++) + ioread16(ap->ioaddr.data_addr); + + /* Can become DEBUG later */ + if (count) + ata_port_printk(ap, KERN_DEBUG, + "drained %d bytes to clear DRQ.\n", count); + +} +EXPORT_SYMBOL_GPL(ata_sff_drain_fifo); + /** * ata_sff_error_handler - Stock error handler for BMDMA controller * @ap: port to handle error for @@ -2239,7 +2273,8 @@ void ata_sff_error_handler(struct ata_port *ap) * really a timeout event, adjust error mask and * cancel frozen state. */ - if (qc->err_mask == AC_ERR_TIMEOUT && (host_stat & ATA_DMA_ERR)) { + if (qc->err_mask == AC_ERR_TIMEOUT + && (host_stat & ATA_DMA_ERR)) { qc->err_mask = AC_ERR_HOST_BUS; thaw = 1; } @@ -2250,6 +2285,13 @@ void ata_sff_error_handler(struct ata_port *ap) ata_sff_sync(ap); /* FIXME: We don't need this */ ap->ops->sff_check_status(ap); ap->ops->sff_irq_clear(ap); + /* We *MUST* do FIFO draining before we issue a reset as several + * devices helpfully clear their internal state and will lock solid + * if we touch the data port post reset. 
Pass qc in case anyone wants + * to do different PIO/DMA recovery or has per command fixups + */ + if (ap->ops->drain_fifo) + ap->ops->drain_fifo(qc); spin_unlock_irqrestore(ap->lock, flags); @@ -2959,4 +3001,3 @@ out: EXPORT_SYMBOL_GPL(ata_pci_sff_init_one); #endif /* CONFIG_PCI */ - diff --git a/drivers/ata/pata_pcmcia.c b/drivers/ata/pata_pcmcia.c index a5cbcc280b23..f4d009ed50ac 100644 --- a/drivers/ata/pata_pcmcia.c +++ b/drivers/ata/pata_pcmcia.c @@ -42,7 +42,7 @@ #define DRV_NAME "pata_pcmcia" -#define DRV_VERSION "0.3.3" +#define DRV_VERSION "0.3.5" /* * Private data structure to glue stuff together @@ -126,6 +126,37 @@ static unsigned int ata_data_xfer_8bit(struct ata_device *dev, return buflen; } +/** + * pcmcia_8bit_drain_fifo - Stock FIFO drain logic for SFF controllers + * @qc: command + * + * Drain the FIFO and device of any stuck data following a command + * failing to complete. In some cases this is neccessary before a + * reset will recover the device. + * + */ + +void pcmcia_8bit_drain_fifo(struct ata_queued_cmd *qc) +{ + int count; + struct ata_port *ap; + + /* We only need to flush incoming data when a command was running */ + if (qc == NULL || qc->dma_dir == DMA_TO_DEVICE) + return; + + ap = qc->ap; + + /* Drain up to 64K of data before we give up this recovery method */ + for (count = 0; (ap->ops->sff_check_status(ap) & ATA_DRQ) + && count++ < 65536;) + ioread8(ap->ioaddr.data_addr); + + if (count) + ata_port_printk(ap, KERN_WARNING, "drained %d bytes to clear DRQ.\n", + count); + +} static struct scsi_host_template pcmcia_sht = { ATA_PIO_SHT(DRV_NAME), @@ -143,6 +174,7 @@ static struct ata_port_operations pcmcia_8bit_port_ops = { .sff_data_xfer = ata_data_xfer_8bit, .cable_detect = ata_cable_40wire, .set_mode = pcmcia_set_mode_8bit, + .drain_fifo = pcmcia_8bit_drain_fifo, }; #define CS_CHECK(fn, ret) \ diff --git a/include/linux/libata.h b/include/linux/libata.h index 19af7d22a7f8..3a07a32dfc2e 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -836,6 +836,8 @@ struct ata_port_operations { void (*bmdma_start)(struct ata_queued_cmd *qc); void (*bmdma_stop)(struct ata_queued_cmd *qc); u8 (*bmdma_status)(struct ata_port *ap); + + void (*drain_fifo)(struct ata_queued_cmd *qc); #endif /* CONFIG_ATA_SFF */ ssize_t (*em_show)(struct ata_port *ap, char *buf); @@ -1587,6 +1589,7 @@ extern int ata_sff_softreset(struct ata_link *link, unsigned int *classes, extern int sata_sff_hardreset(struct ata_link *link, unsigned int *class, unsigned long deadline); extern void ata_sff_postreset(struct ata_link *link, unsigned int *classes); +extern void ata_sff_drain_fifo(struct ata_queued_cmd *qc); extern void ata_sff_error_handler(struct ata_port *ap); extern void ata_sff_post_internal_cmd(struct ata_queued_cmd *qc); extern int ata_sff_port_start(struct ata_port *ap); -- cgit v1.2.3-71-gd317 From c96f1732e25362d10ee7bcac1df8412a2e6b7d23 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 24 Mar 2009 10:23:46 +0000 Subject: [libata] Improve timeout handling On a timeout call a device specific handler early in the recovery so that we can complete and process successful commands which timed out due to IRQ loss or the like rather more elegantly. 
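A minimal sketch of the two sides of the new hook, taken from the pattern
used by the drivers touched in this patch (the example ops names are
illustrative): SFF-like controllers simply inherit the default poll, while
controllers whose interrupt or altstatus behaviour is non-standard stub the
hook out so the error handler never runs the lost-interrupt check for them.

	/* default: ata_sff_port_ops now provides .lost_interrupt */
	static struct ata_port_operations my_sff_like_ops = {
		.inherits	= &ata_sff_port_ops,
	};

	/* opt out: IRQ handling is not standard SFF, so never poll for it */
	static struct ata_port_operations my_custom_irq_ops = {
		.inherits	= &ata_sff_port_ops,
		.lost_interrupt	= ATA_OP_NULL,
	};
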
[Revised to exclude the timeout handling on a few devices that inherit from SFF but are not SFF enough to use the default timeout handler] Signed-off-by: Alan Cox Signed-off-by: Jeff Garzik --- drivers/ata/libata-eh.c | 19 +++++++++++++++++-- drivers/ata/libata-sff.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- drivers/ata/pata_isapnp.c | 12 ++++++++++-- drivers/ata/pdc_adma.c | 2 ++ drivers/ata/sata_mv.c | 2 ++ drivers/ata/sata_nv.c | 1 + drivers/ata/sata_promise.c | 2 ++ drivers/ata/sata_qstor.c | 1 + drivers/ata/sata_vsc.c | 3 +++ include/linux/libata.h | 2 ++ 10 files changed, 85 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index ea890911d4fa..01831312c360 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -547,7 +547,7 @@ void ata_scsi_error(struct Scsi_Host *host) /* For new EH, all qcs are finished in one of three ways - * normal completion, error completion, and SCSI timeout. - * Both cmpletions can race against SCSI timeout. When normal + * Both completions can race against SCSI timeout. When normal * completion wins, the qc never reaches EH. When error * completion wins, the qc has ATA_QCFLAG_FAILED set. * @@ -562,7 +562,19 @@ void ata_scsi_error(struct Scsi_Host *host) int nr_timedout = 0; spin_lock_irqsave(ap->lock, flags); - + + /* This must occur under the ap->lock as we don't want + a polled recovery to race the real interrupt handler + + The lost_interrupt handler checks for any completed but + non-notified command and completes much like an IRQ handler. + + We then fall into the error recovery code which will treat + this as if normal completion won the race */ + + if (ap->ops->lost_interrupt) + ap->ops->lost_interrupt(ap); + list_for_each_entry_safe(scmd, tmp, &host->eh_cmd_q, eh_entry) { struct ata_queued_cmd *qc; @@ -606,6 +618,9 @@ void ata_scsi_error(struct Scsi_Host *host) ap->eh_tries = ATA_EH_MAX_TRIES; } else spin_unlock_wait(ap->lock); + + /* If we timed raced normal completion and there is nothing to + recover nr_timedout == 0 why exactly are we doing error recovery ? */ repeat: /* invoke error handler */ diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 9a10cb055ac2..8332e97a9de3 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -65,6 +65,8 @@ const struct ata_port_operations ata_sff_port_ops = { .sff_irq_on = ata_sff_irq_on, .sff_irq_clear = ata_sff_irq_clear, + .lost_interrupt = ata_sff_lost_interrupt, + .port_start = ata_sff_port_start, }; EXPORT_SYMBOL_GPL(ata_sff_port_ops); @@ -1647,7 +1649,7 @@ EXPORT_SYMBOL_GPL(ata_sff_qc_fill_rtf); * RETURNS: * One if interrupt was handled, zero if not (shared irq). */ -inline unsigned int ata_sff_host_intr(struct ata_port *ap, +unsigned int ata_sff_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc) { struct ata_eh_info *ehi = &ap->link.eh_info; @@ -1775,6 +1777,48 @@ irqreturn_t ata_sff_interrupt(int irq, void *dev_instance) } EXPORT_SYMBOL_GPL(ata_sff_interrupt); +/** + * ata_sff_lost_interrupt - Check for an apparent lost interrupt + * @ap: port that appears to have timed out + * + * Called from the libata error handlers when the core code suspects + * an interrupt has been lost. If it has complete anything we can and + * then return. Interface must support altstatus for this faster + * recovery to occur. 
+ * + * Locking: + * Caller holds host lock + */ + +void ata_sff_lost_interrupt(struct ata_port *ap) +{ + u8 status; + struct ata_queued_cmd *qc; + + /* Only one outstanding command per SFF channel */ + qc = ata_qc_from_tag(ap, ap->link.active_tag); + /* Check we have a live one.. */ + if (qc == NULL || !(qc->flags & ATA_QCFLAG_ACTIVE)) + return; + /* We cannot lose an interrupt on a polled command */ + if (qc->tf.flags & ATA_TFLAG_POLLING) + return; + /* See if the controller thinks it is still busy - if so the command + isn't a lost IRQ but is still in progress */ + status = ata_sff_altstatus(ap); + if (status & ATA_BUSY) + return; + + /* There was a command running, we are no longer busy and we have + no interrupt. */ + ata_port_printk(ap, KERN_WARNING, "lost interrupt (Status 0x%x)\n", + status); + /* Run the host interrupt logic as if the interrupt had not been + lost */ + ata_sff_host_intr(ap, qc); +} +EXPORT_SYMBOL_GPL(ata_sff_lost_interrupt); + /** * ata_sff_freeze - Freeze SFF controller port * @ap: port to freeze diff --git a/drivers/ata/pata_isapnp.c b/drivers/ata/pata_isapnp.c index afa8f704271e..4bceb8803a10 100644 --- a/drivers/ata/pata_isapnp.c +++ b/drivers/ata/pata_isapnp.c @@ -17,7 +17,7 @@ #include #define DRV_NAME "pata_isapnp" -#define DRV_VERSION "0.2.2" +#define DRV_VERSION "0.2.5" static struct scsi_host_template isapnp_sht = { ATA_PIO_SHT(DRV_NAME), @@ -28,6 +28,13 @@ static struct ata_port_operations isapnp_port_ops = { .cable_detect = ata_cable_40wire, }; +static struct ata_port_operations isapnp_noalt_port_ops = { + .inherits = &ata_sff_port_ops, + .cable_detect = ata_cable_40wire, + /* No altstatus so we don't want to use the lost interrupt poll */ + .lost_interrupt = ATA_OP_NULL, +}; + /** * isapnp_init_one - attach an isapnp interface * @idev: PnP device @@ -65,7 +72,7 @@ static int isapnp_init_one(struct pnp_dev *idev, const struct pnp_device_id *dev ap = host->ports[0]; - ap->ops = &isapnp_port_ops; + ap->ops = &isapnp_noalt_port_ops; ap->pio_mask = ATA_PIO0; ap->flags |= ATA_FLAG_SLAVE_POSS; @@ -76,6 +83,7 @@ static int isapnp_init_one(struct pnp_dev *idev, const struct pnp_device_id *dev pnp_port_start(idev, 1), 1); ap->ioaddr.altstatus_addr = ctl_addr; ap->ioaddr.ctl_addr = ctl_addr; + ap->ops = &isapnp_port_ops; } ata_sff_std_ports(&ap->ioaddr); diff --git a/drivers/ata/pdc_adma.c b/drivers/ata/pdc_adma.c index c509c206a459..39588178d028 100644 --- a/drivers/ata/pdc_adma.c +++ b/drivers/ata/pdc_adma.c @@ -148,6 +148,8 @@ static struct scsi_host_template adma_ata_sht = { static struct ata_port_operations adma_ata_ops = { .inherits = &ata_sff_port_ops, + .lost_interrupt = ATA_OP_NULL, + .check_atapi_dma = adma_check_atapi_dma, .qc_prep = adma_qc_prep, .qc_issue = adma_qc_issue, diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 8a751054c8a1..a377226b81c8 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -646,6 +646,8 @@ static struct scsi_host_template mv6_sht = { static struct ata_port_operations mv5_ops = { .inherits = &ata_sff_port_ops, + .lost_interrupt = ATA_OP_NULL, + .qc_defer = mv_qc_defer, .qc_prep = mv_qc_prep, .qc_issue = mv_qc_issue, diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index 2f523f8c27f6..6cda12ba8122 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -408,6 +408,7 @@ static struct scsi_host_template nv_swncq_sht = { static struct ata_port_operations nv_common_ops = { .inherits = &ata_bmdma_port_ops, + .lost_interrupt = ATA_OP_NULL, .scr_read = nv_scr_read, .scr_write = 
nv_scr_write, }; diff --git a/drivers/ata/sata_promise.c b/drivers/ata/sata_promise.c index 3ad2b8863636..b1fd7d62071a 100644 --- a/drivers/ata/sata_promise.c +++ b/drivers/ata/sata_promise.c @@ -176,7 +176,9 @@ static const struct ata_port_operations pdc_common_ops = { .check_atapi_dma = pdc_check_atapi_dma, .qc_prep = pdc_qc_prep, .qc_issue = pdc_qc_issue, + .sff_irq_clear = pdc_irq_clear, + .lost_interrupt = ATA_OP_NULL, .post_internal_cmd = pdc_post_internal_cmd, .error_handler = pdc_error_handler, diff --git a/drivers/ata/sata_qstor.c b/drivers/ata/sata_qstor.c index 7112d89fd9ff..c3936d35cdac 100644 --- a/drivers/ata/sata_qstor.c +++ b/drivers/ata/sata_qstor.c @@ -147,6 +147,7 @@ static struct ata_port_operations qs_ata_ops = { .softreset = ATA_OP_NULL, .error_handler = qs_error_handler, .post_internal_cmd = ATA_OP_NULL, + .lost_interrupt = ATA_OP_NULL, .scr_read = qs_scr_read, .scr_write = qs_scr_write, diff --git a/drivers/ata/sata_vsc.c b/drivers/ata/sata_vsc.c index ef211f333d7b..ed70bd28fa2c 100644 --- a/drivers/ata/sata_vsc.c +++ b/drivers/ata/sata_vsc.c @@ -308,6 +308,9 @@ static struct scsi_host_template vsc_sata_sht = { static struct ata_port_operations vsc_sata_ops = { .inherits = &ata_bmdma_port_ops, + /* The IRQ handling is not quite standard SFF behaviour so we + cannot use the default lost interrupt handler */ + .lost_interrupt = ATA_OP_NULL, .sff_tf_load = vsc_sata_tf_load, .sff_tf_read = vsc_sata_tf_read, .freeze = vsc_freeze, diff --git a/include/linux/libata.h b/include/linux/libata.h index 3a07a32dfc2e..76262d83656b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -795,6 +795,7 @@ struct ata_port_operations { ata_reset_fn_t pmp_hardreset; ata_postreset_fn_t pmp_postreset; void (*error_handler)(struct ata_port *ap); + void (*lost_interrupt)(struct ata_port *ap); void (*post_internal_cmd)(struct ata_queued_cmd *qc); /* @@ -1577,6 +1578,7 @@ extern bool ata_sff_qc_fill_rtf(struct ata_queued_cmd *qc); extern unsigned int ata_sff_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc); extern irqreturn_t ata_sff_interrupt(int irq, void *dev_instance); +extern void ata_sff_lost_interrupt(struct ata_port *ap); extern void ata_sff_freeze(struct ata_port *ap); extern void ata_sff_thaw(struct ata_port *ap); extern int ata_sff_prereset(struct ata_link *link, unsigned long deadline); -- cgit v1.2.3-71-gd317 From 54aee6a5f560d0e1bf3f39987c6ebe06daeb0ce1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:24 -0700 Subject: dmaengine: kill some unused headers The dmaengine redux left some unneeded headers in include/linux/dmaengine.h, clean them up. Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 1956c8d46d32..96e676e5bf9b 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -23,9 +23,6 @@ #include #include -#include -#include -#include #include /** -- cgit v1.2.3-71-gd317 From 06164f3194e01ea4c76941ac60f541d656c8975f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:25 -0700 Subject: async_tx: provide __async_inline for HAS_DMA=n archs To allow an async_tx routine to be compiled away on HAS_DMA=n arch it needs to be declared __always_inline otherwise the compiler may emit code and cause a link error. 
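To make the mechanism concrete, here is a hedged, simplified illustration
(not the actual async_tx code; issue_dma_copy() is a made-up stand-in for a
symbol that only exists when DMA engine support is built): on HAS_DMA=n
architectures the helper is forced inline, the channel it is handed collapses
to a compile-time NULL, and the branch referencing the DMA-only symbol is
discarded, so no unresolved reference ever reaches the linker.

	#include <linux/async_tx.h>
	#include <linux/string.h>

	/* hypothetical, provided only when DMA engine support is compiled in */
	extern int issue_dma_copy(struct dma_chan *chan, void *dst,
				  void *src, size_t len);

	static __async_inline int do_offload(struct dma_chan *chan,
					     void *dst, void *src, size_t len)
	{
		if (chan)
			return issue_dma_copy(chan, dst, src, len);
		memcpy(dst, src, len);	/* synchronous fallback */
		return 0;
	}
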
Signed-off-by: Dan Williams --- crypto/async_tx/async_xor.c | 7 ++----- include/linux/async_tx.h | 9 +++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index 595b78672b36..95fe2c8d6c51 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -30,11 +30,8 @@ #include #include -/* do_async_xor - dma map the pages and perform the xor with an engine. - * This routine is marked __always_inline so it can be compiled away - * when CONFIG_DMA_ENGINE=n - */ -static __always_inline struct dma_async_tx_descriptor * +/* do_async_xor - dma map the pages and perform the xor with an engine */ +static __async_inline struct dma_async_tx_descriptor * do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags, diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 45f6297821bd..5fc2ef8d97fa 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -21,6 +21,15 @@ #include #include +/* on architectures without dma-mapping capabilities we need to ensure + * that the asynchronous path compiles away + */ +#ifdef CONFIG_HAS_DMA +#define __async_inline +#else +#define __async_inline __always_inline +#endif + /** * dma_chan_ref - object used to manage dma channels received from the * dmaengine core. -- cgit v1.2.3-71-gd317 From 729b5d1b8ec72c28e99840b3f300ba67726e3ab9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:25 -0700 Subject: dmaengine: allow dma support for async_tx to be toggled Provide a config option for blocking the allocation of dma channels to the async_tx api. Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 6 +++--- drivers/dma/Kconfig | 11 +++++++++++ include/linux/dmaengine.h | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index f21147f3626a..06eb6cc09fef 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -30,7 +30,7 @@ #ifdef CONFIG_DMA_ENGINE static int __init async_tx_init(void) { - dmaengine_get(); + async_dmaengine_get(); printk(KERN_INFO "async_tx: api initialized (async)\n"); @@ -39,7 +39,7 @@ static int __init async_tx_init(void) static void __exit async_tx_exit(void) { - dmaengine_put(); + async_dmaengine_put(); } /** @@ -56,7 +56,7 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, if (depend_tx && dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) return depend_tx->chan; - return dma_find_channel(tx_type); + return async_dma_find_channel(tx_type); } EXPORT_SYMBOL_GPL(__async_tx_find_channel); #else diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 48ea59e79672..3b3c01b6f1ee 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -98,6 +98,17 @@ config NET_DMA Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise say N. +config ASYNC_TX_DMA + bool "Async_tx: Offload support for the async_tx api" + depends on DMA_ENGINE + help + This allows the async_tx api to take advantage of offload engines for + memcpy, memset, xor, and raid6 p+q operations. If your platform has + a dma engine that can perform raid operations and you have enabled + MD_RAID456 say Y. + + If unsure, say N. 
+ config DMATEST tristate "DMA Test client" depends on DMA_ENGINE diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 96e676e5bf9b..2afc2c95e42d 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -288,6 +288,24 @@ static inline void net_dmaengine_put(void) } #endif +#ifdef CONFIG_ASYNC_TX_DMA +#define async_dmaengine_get() dmaengine_get() +#define async_dmaengine_put() dmaengine_put() +#define async_dma_find_channel(type) dma_find_channel(type) +#else +static inline void async_dmaengine_get(void) +{ +} +static inline void async_dmaengine_put(void) +{ +} +static inline struct dma_chan * +async_dma_find_channel(enum dma_transaction_type type) +{ + return NULL; +} +#endif + dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, -- cgit v1.2.3-71-gd317 From b8dfe498775de912116f275680ddb57c8799d9ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Mar 2009 17:31:52 +0100 Subject: netfilter: factorize ifname_compare() We use same not trivial helper function in four places. We can factorize it. Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 23 +++++++++++++++++++++++ net/ipv4/netfilter/arp_tables.c | 14 +------------- net/ipv4/netfilter/ip_tables.c | 23 ++--------------------- net/ipv6/netfilter/ip6_tables.c | 23 ++--------------------- net/netfilter/xt_physdev.c | 21 ++------------------- 5 files changed, 30 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index e8e08d036752..72918b7cbe85 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -435,6 +435,29 @@ extern void xt_free_table_info(struct xt_table_info *info); extern void xt_table_entry_swap_rcu(struct xt_table_info *old, struct xt_table_info *new); +/* + * This helper is performance critical and must be inlined + */ +static inline unsigned long ifname_compare_aligned(const char *_a, + const char *_b, + const char *_mask) +{ + const unsigned long *a = (const unsigned long *)_a; + const unsigned long *b = (const unsigned long *)_b; + const unsigned long *mask = (const unsigned long *)_mask; + unsigned long ret; + + ret = (a[0] ^ b[0]) & mask[0]; + if (IFNAMSIZ > sizeof(unsigned long)) + ret |= (a[1] ^ b[1]) & mask[1]; + if (IFNAMSIZ > 2 * sizeof(unsigned long)) + ret |= (a[2] ^ b[2]) & mask[2]; + if (IFNAMSIZ > 3 * sizeof(unsigned long)) + ret |= (a[3] ^ b[3]) & mask[3]; + BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); + return ret; +} + #ifdef CONFIG_COMPAT #include diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 64a7c6ce0b98..4b35dba7cf7d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -80,19 +80,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask) { #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - const unsigned long *a = (const unsigned long *)_a; - const unsigned long *b = (const unsigned long *)_b; - const unsigned long *mask = (const unsigned long *)_mask; - unsigned long ret; - - ret = (a[0] ^ b[0]) & mask[0]; - if (IFNAMSIZ > sizeof(unsigned long)) - ret |= (a[1] ^ b[1]) & mask[1]; - if (IFNAMSIZ > 2 * sizeof(unsigned long)) - ret |= (a[2] ^ b[2]) & mask[2]; - if (IFNAMSIZ > 3 * sizeof(unsigned long)) 
- ret |= (a[3] ^ b[3]) & mask[3]; - BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); + unsigned long ret = ifname_compare_aligned(_a, _b, _mask); #else unsigned long ret = 0; int i; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e5294aec967d..41c59e391a6a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -74,25 +74,6 @@ do { \ Hence the start of any table is given by get_table() below. */ -static unsigned long ifname_compare(const char *_a, const char *_b, - const unsigned char *_mask) -{ - const unsigned long *a = (const unsigned long *)_a; - const unsigned long *b = (const unsigned long *)_b; - const unsigned long *mask = (const unsigned long *)_mask; - unsigned long ret; - - ret = (a[0] ^ b[0]) & mask[0]; - if (IFNAMSIZ > sizeof(unsigned long)) - ret |= (a[1] ^ b[1]) & mask[1]; - if (IFNAMSIZ > 2 * sizeof(unsigned long)) - ret |= (a[2] ^ b[2]) & mask[2]; - if (IFNAMSIZ > 3 * sizeof(unsigned long)) - ret |= (a[3] ^ b[3]) & mask[3]; - BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); - return ret; -} - /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool @@ -121,7 +102,7 @@ ip_packet_match(const struct iphdr *ip, return false; } - ret = ifname_compare(indev, ipinfo->iniface, ipinfo->iniface_mask); + ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); if (FWINV(ret != 0, IPT_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", @@ -130,7 +111,7 @@ ip_packet_match(const struct iphdr *ip, return false; } - ret = ifname_compare(outdev, ipinfo->outiface, ipinfo->outiface_mask); + ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 34af7bb8df5f..e59662b3b5b9 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -89,25 +89,6 @@ ip6t_ext_hdr(u8 nexthdr) (nexthdr == IPPROTO_DSTOPTS) ); } -static unsigned long ifname_compare(const char *_a, const char *_b, - const unsigned char *_mask) -{ - const unsigned long *a = (const unsigned long *)_a; - const unsigned long *b = (const unsigned long *)_b; - const unsigned long *mask = (const unsigned long *)_mask; - unsigned long ret; - - ret = (a[0] ^ b[0]) & mask[0]; - if (IFNAMSIZ > sizeof(unsigned long)) - ret |= (a[1] ^ b[1]) & mask[1]; - if (IFNAMSIZ > 2 * sizeof(unsigned long)) - ret |= (a[2] ^ b[2]) & mask[2]; - if (IFNAMSIZ > 3 * sizeof(unsigned long)) - ret |= (a[3] ^ b[3]) & mask[3]; - BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); - return ret; -} - /* Returns whether matches rule or not. 
*/ /* Performance critical - called for every packet */ static inline bool @@ -138,7 +119,7 @@ ip6_packet_match(const struct sk_buff *skb, return false; } - ret = ifname_compare(indev, ip6info->iniface, ip6info->iniface_mask); + ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", @@ -147,7 +128,7 @@ ip6_packet_match(const struct sk_buff *skb, return false; } - ret = ifname_compare(outdev, ip6info->outiface, ip6info->outiface_mask); + ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 44a234ef4439..8d28ca5848bc 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -20,23 +20,6 @@ MODULE_DESCRIPTION("Xtables: Bridge physical device match"); MODULE_ALIAS("ipt_physdev"); MODULE_ALIAS("ip6t_physdev"); -static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask) -{ - const unsigned long *a = (const unsigned long *)_a; - const unsigned long *b = (const unsigned long *)_b; - const unsigned long *mask = (const unsigned long *)_mask; - unsigned long ret; - - ret = (a[0] ^ b[0]) & mask[0]; - if (IFNAMSIZ > sizeof(unsigned long)) - ret |= (a[1] ^ b[1]) & mask[1]; - if (IFNAMSIZ > 2 * sizeof(unsigned long)) - ret |= (a[2] ^ b[2]) & mask[2]; - if (IFNAMSIZ > 3 * sizeof(unsigned long)) - ret |= (a[3] ^ b[3]) & mask[3]; - BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); - return ret; -} static bool physdev_mt(const struct sk_buff *skb, const struct xt_match_param *par) @@ -85,7 +68,7 @@ physdev_mt(const struct sk_buff *skb, const struct xt_match_param *par) if (!(info->bitmask & XT_PHYSDEV_OP_IN)) goto match_outdev; indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; - ret = ifname_compare(indev, info->physindev, info->in_mask); + ret = ifname_compare_aligned(indev, info->physindev, info->in_mask); if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN)) return false; @@ -95,7 +78,7 @@ match_outdev: return true; outdev = nf_bridge->physoutdev ? nf_bridge->physoutdev->name : nulldevname; - ret = ifname_compare(outdev, info->physoutdev, info->out_mask); + ret = ifname_compare_aligned(outdev, info->physoutdev, info->out_mask); return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); } -- cgit v1.2.3-71-gd317 From 0361a28d3f9a4315a100c7b37ba0b55cfe15fe07 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 17 Dec 2008 15:38:03 +0100 Subject: HID: autosuspend support for USB HID This uses the USB busy mechanism for aggessive autosuspend of USB HID devices. It autosuspends all opened devices supporting remote wakeup after a timeout unless - output is being done to the device - a key is being held down (remote wakeup isn't triggered upon key release) - LED(s) are lit - hiddev is opened As in the current driver closed devices will be autosuspended even if they don't support remote wakeup. The patch is quite large because output to devices is done in hard interrupt context meaning a lot a queuing and locking had to be touched. The LED stuff has been solved by means of a simple counter. Additions to the generic HID code could be avoided. In addition it now covers hidraw. It contains an embryonic version of an API to let the generic HID code tell the lower levels which capabilities with respect to power management are needed. 
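A hedged, simplified sketch of the key-press rule described above (this is
not the patch's literal suspend handler; the function name and the -EBUSY
convention for refusing the request are assumptions for the example). The
interrupt-in path below records key state in usbhid->iofl via the new
hid_check_keys_pressed() helper, and an autosuspend attempt should be turned
down while that bit is still set, since the remote wakeup would only fire on
the key release:

	/* illustrative check an autosuspend path could make */
	static int my_hid_may_autosuspend(struct usbhid_device *usbhid)
	{
		if (test_bit(HID_KEYS_PRESSED, &usbhid->iofl))
			return -EBUSY;	/* key held down; stay resumed */
		return 0;		/* nothing pressed, let the USB core suspend */
	}
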
Signed-off-by: Oliver Neukum Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 16 ++ drivers/hid/hidraw.c | 17 +- drivers/hid/usbhid/hid-core.c | 435 +++++++++++++++++++++++++++++++++--------- drivers/hid/usbhid/hiddev.c | 17 +- drivers/hid/usbhid/usbhid.h | 14 +- include/linux/hid.h | 6 + 6 files changed, 402 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 1cc967448f4d..feaeb6167ea4 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1822,6 +1822,22 @@ static DECLARE_WORK(hid_compat_work, hid_compat_load); static struct workqueue_struct *hid_compat_wq; #endif +int hid_check_keys_pressed(struct hid_device *hid) +{ + struct hid_input *hidinput; + int i; + + list_for_each_entry(hidinput, &hid->inputs, list) { + for (i = 0; i < BITS_TO_LONGS(KEY_MAX); i++) + if (hidinput->input->key[i]) + return 1; + } + + return 0; +} + +EXPORT_SYMBOL_GPL(hid_check_keys_pressed); + static int __init hid_init(void) { int ret; diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 02b19db5442e..e263d4731179 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -181,9 +181,17 @@ static int hidraw_open(struct inode *inode, struct file *file) dev = hidraw_table[minor]; if (!dev->open++) { + if (dev->hid->ll_driver->power) { + err = dev->hid->ll_driver->power(dev->hid, PM_HINT_FULLON); + if (err < 0) + goto out_unlock; + } err = dev->hid->ll_driver->open(dev->hid); - if (err < 0) + if (err < 0) { + if (dev->hid->ll_driver->power) + dev->hid->ll_driver->power(dev->hid, PM_HINT_NORMAL); dev->open--; + } } out_unlock: @@ -209,10 +217,13 @@ static int hidraw_release(struct inode * inode, struct file * file) list_del(&list->node); dev = hidraw_table[minor]; if (!--dev->open) { - if (list->hidraw->exist) + if (list->hidraw->exist) { + if (dev->hid->ll_driver->power) + dev->hid->ll_driver->power(dev->hid, PM_HINT_NORMAL); dev->hid->ll_driver->close(dev->hid); - else + } else { kfree(list->hidraw); + } } kfree(list); diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index f0a0f72238ab..625e7e8eb373 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -5,6 +5,7 @@ * Copyright (c) 2000-2005 Vojtech Pavlik * Copyright (c) 2005 Michael Haboustak for Concept2, Inc * Copyright (c) 2006-2008 Jiri Kosina + * Copyright (c) 2007-2008 Oliver Neukum */ /* @@ -27,6 +28,7 @@ #include #include #include +#include #include @@ -53,6 +55,10 @@ static unsigned int hid_mousepoll_interval; module_param_named(mousepoll, hid_mousepoll_interval, uint, 0644); MODULE_PARM_DESC(mousepoll, "Polling interval of mice"); +static unsigned int ignoreled; +module_param_named(ignoreled, ignoreled, uint, 0644); +MODULE_PARM_DESC(ignoreled, "Autosuspend with active leds"); + /* Quirks specified at module load time */ static char *quirks_param[MAX_USBHID_BOOT_QUIRKS] = { [ 0 ... (MAX_USBHID_BOOT_QUIRKS - 1) ] = NULL }; module_param_array_named(quirks, quirks_param, charp, NULL, 0444); @@ -63,8 +69,13 @@ MODULE_PARM_DESC(quirks, "Add/modify USB HID quirks by specifying " /* * Input submission and I/O error handler. 
*/ +static DEFINE_MUTEX(hid_open_mut); +static struct workqueue_struct *resumption_waker; static void hid_io_error(struct hid_device *hid); +static int hid_submit_out(struct hid_device *hid); +static int hid_submit_ctrl(struct hid_device *hid); +static void hid_cancel_delayed_stuff(struct usbhid_device *usbhid); /* Start up the input URB */ static int hid_start_in(struct hid_device *hid) @@ -73,15 +84,16 @@ static int hid_start_in(struct hid_device *hid) int rc = 0; struct usbhid_device *usbhid = hid->driver_data; - spin_lock_irqsave(&usbhid->inlock, flags); - if (hid->open > 0 && !test_bit(HID_SUSPENDED, &usbhid->iofl) && + spin_lock_irqsave(&usbhid->lock, flags); + if (hid->open > 0 && !test_bit(HID_DISCONNECTED, &usbhid->iofl) && + !test_bit(HID_REPORTED_IDLE, &usbhid->iofl) && !test_and_set_bit(HID_IN_RUNNING, &usbhid->iofl)) { rc = usb_submit_urb(usbhid->urbin, GFP_ATOMIC); if (rc != 0) clear_bit(HID_IN_RUNNING, &usbhid->iofl); } - spin_unlock_irqrestore(&usbhid->inlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); return rc; } @@ -145,7 +157,7 @@ static void hid_io_error(struct hid_device *hid) unsigned long flags; struct usbhid_device *usbhid = hid->driver_data; - spin_lock_irqsave(&usbhid->inlock, flags); + spin_lock_irqsave(&usbhid->lock, flags); /* Stop when disconnected */ if (test_bit(HID_DISCONNECTED, &usbhid->iofl)) @@ -175,7 +187,51 @@ static void hid_io_error(struct hid_device *hid) mod_timer(&usbhid->io_retry, jiffies + msecs_to_jiffies(usbhid->retry_delay)); done: - spin_unlock_irqrestore(&usbhid->inlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); +} + +static void usbhid_mark_busy(struct usbhid_device *usbhid) +{ + struct usb_interface *intf = usbhid->intf; + + usb_mark_last_busy(interface_to_usbdev(intf)); +} + +static int usbhid_restart_out_queue(struct usbhid_device *usbhid) +{ + struct hid_device *hid = usb_get_intfdata(usbhid->intf); + int kicked; + + if (!hid) + return 0; + + if ((kicked = (usbhid->outhead != usbhid->outtail))) { + dbg("Kicking head %d tail %d", usbhid->outhead, usbhid->outtail); + if (hid_submit_out(hid)) { + clear_bit(HID_OUT_RUNNING, &usbhid->iofl); + wake_up(&usbhid->wait); + } + } + return kicked; +} + +static int usbhid_restart_ctrl_queue(struct usbhid_device *usbhid) +{ + struct hid_device *hid = usb_get_intfdata(usbhid->intf); + int kicked; + + WARN_ON(hid == NULL); + if (!hid) + return 0; + + if ((kicked = (usbhid->ctrlhead != usbhid->ctrltail))) { + dbg("Kicking head %d tail %d", usbhid->ctrlhead, usbhid->ctrltail); + if (hid_submit_ctrl(hid)) { + clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); + wake_up(&usbhid->wait); + } + } + return kicked; } /* @@ -190,12 +246,23 @@ static void hid_irq_in(struct urb *urb) switch (urb->status) { case 0: /* success */ + usbhid_mark_busy(usbhid); usbhid->retry_delay = 0; hid_input_report(urb->context, HID_INPUT_REPORT, urb->transfer_buffer, urb->actual_length, 1); + /* + * autosuspend refused while keys are pressed + * because most keyboards don't wake up when + * a key is released + */ + if (hid_check_keys_pressed(hid)) + set_bit(HID_KEYS_PRESSED, &usbhid->iofl); + else + clear_bit(HID_KEYS_PRESSED, &usbhid->iofl); break; case -EPIPE: /* stall */ + usbhid_mark_busy(usbhid); clear_bit(HID_IN_RUNNING, &usbhid->iofl); set_bit(HID_CLEAR_HALT, &usbhid->iofl); schedule_work(&usbhid->reset_work); @@ -209,6 +276,7 @@ static void hid_irq_in(struct urb *urb) case -EPROTO: /* protocol error or unplug */ case -ETIME: /* protocol error or unplug */ case -ETIMEDOUT: /* Should never happen, but... 
*/ + usbhid_mark_busy(usbhid); clear_bit(HID_IN_RUNNING, &usbhid->iofl); hid_io_error(hid); return; @@ -239,16 +307,25 @@ static int hid_submit_out(struct hid_device *hid) report = usbhid->out[usbhid->outtail].report; raw_report = usbhid->out[usbhid->outtail].raw_report; - usbhid->urbout->transfer_buffer_length = ((report->size - 1) >> 3) + 1 + (report->id > 0); - usbhid->urbout->dev = hid_to_usb_dev(hid); - memcpy(usbhid->outbuf, raw_report, usbhid->urbout->transfer_buffer_length); - kfree(raw_report); + if (!test_bit(HID_REPORTED_IDLE, &usbhid->iofl)) { + usbhid->urbout->transfer_buffer_length = ((report->size - 1) >> 3) + 1 + (report->id > 0); + usbhid->urbout->dev = hid_to_usb_dev(hid); + memcpy(usbhid->outbuf, raw_report, usbhid->urbout->transfer_buffer_length); + kfree(raw_report); - dbg_hid("submitting out urb\n"); + dbg_hid("submitting out urb\n"); - if (usb_submit_urb(usbhid->urbout, GFP_ATOMIC)) { - err_hid("usb_submit_urb(out) failed"); - return -1; + if (usb_submit_urb(usbhid->urbout, GFP_ATOMIC)) { + err_hid("usb_submit_urb(out) failed"); + return -1; + } + } else { + /* + * queue work to wake up the device. + * as the work queue is freezeable, this is safe + * with respect to STD and STR + */ + queue_work(resumption_waker, &usbhid->restart_work); } return 0; @@ -266,41 +343,50 @@ static int hid_submit_ctrl(struct hid_device *hid) raw_report = usbhid->ctrl[usbhid->ctrltail].raw_report; dir = usbhid->ctrl[usbhid->ctrltail].dir; - len = ((report->size - 1) >> 3) + 1 + (report->id > 0); - if (dir == USB_DIR_OUT) { - usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0); - usbhid->urbctrl->transfer_buffer_length = len; - memcpy(usbhid->ctrlbuf, raw_report, len); - kfree(raw_report); - } else { - int maxpacket, padlen; - - usbhid->urbctrl->pipe = usb_rcvctrlpipe(hid_to_usb_dev(hid), 0); - maxpacket = usb_maxpacket(hid_to_usb_dev(hid), usbhid->urbctrl->pipe, 0); - if (maxpacket > 0) { - padlen = DIV_ROUND_UP(len, maxpacket); - padlen *= maxpacket; - if (padlen > usbhid->bufsize) - padlen = usbhid->bufsize; - } else - padlen = 0; - usbhid->urbctrl->transfer_buffer_length = padlen; - } - usbhid->urbctrl->dev = hid_to_usb_dev(hid); + if (!test_bit(HID_REPORTED_IDLE, &usbhid->iofl)) { + len = ((report->size - 1) >> 3) + 1 + (report->id > 0); + if (dir == USB_DIR_OUT) { + usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0); + usbhid->urbctrl->transfer_buffer_length = len; + memcpy(usbhid->ctrlbuf, raw_report, len); + kfree(raw_report); + } else { + int maxpacket, padlen; + + usbhid->urbctrl->pipe = usb_rcvctrlpipe(hid_to_usb_dev(hid), 0); + maxpacket = usb_maxpacket(hid_to_usb_dev(hid), usbhid->urbctrl->pipe, 0); + if (maxpacket > 0) { + padlen = DIV_ROUND_UP(len, maxpacket); + padlen *= maxpacket; + if (padlen > usbhid->bufsize) + padlen = usbhid->bufsize; + } else + padlen = 0; + usbhid->urbctrl->transfer_buffer_length = padlen; + } + usbhid->urbctrl->dev = hid_to_usb_dev(hid); - usbhid->cr->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | dir; - usbhid->cr->bRequest = (dir == USB_DIR_OUT) ? HID_REQ_SET_REPORT : HID_REQ_GET_REPORT; - usbhid->cr->wValue = cpu_to_le16(((report->type + 1) << 8) | report->id); - usbhid->cr->wIndex = cpu_to_le16(usbhid->ifnum); - usbhid->cr->wLength = cpu_to_le16(len); + usbhid->cr->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | dir; + usbhid->cr->bRequest = (dir == USB_DIR_OUT) ? 
HID_REQ_SET_REPORT : HID_REQ_GET_REPORT; + usbhid->cr->wValue = cpu_to_le16(((report->type + 1) << 8) | report->id); + usbhid->cr->wIndex = cpu_to_le16(usbhid->ifnum); + usbhid->cr->wLength = cpu_to_le16(len); - dbg_hid("submitting ctrl urb: %s wValue=0x%04x wIndex=0x%04x wLength=%u\n", - usbhid->cr->bRequest == HID_REQ_SET_REPORT ? "Set_Report" : "Get_Report", - usbhid->cr->wValue, usbhid->cr->wIndex, usbhid->cr->wLength); + dbg_hid("submitting ctrl urb: %s wValue=0x%04x wIndex=0x%04x wLength=%u\n", + usbhid->cr->bRequest == HID_REQ_SET_REPORT ? "Set_Report" : "Get_Report", + usbhid->cr->wValue, usbhid->cr->wIndex, usbhid->cr->wLength); - if (usb_submit_urb(usbhid->urbctrl, GFP_ATOMIC)) { - err_hid("usb_submit_urb(ctrl) failed"); - return -1; + if (usb_submit_urb(usbhid->urbctrl, GFP_ATOMIC)) { + err_hid("usb_submit_urb(ctrl) failed"); + return -1; + } + } else { + /* + * queue work to wake up the device. + * as the work queue is freezeable, this is safe + * with respect to STD and STR + */ + queue_work(resumption_waker, &usbhid->restart_work); } return 0; @@ -332,7 +418,7 @@ static void hid_irq_out(struct urb *urb) "received\n", urb->status); } - spin_lock_irqsave(&usbhid->outlock, flags); + spin_lock_irqsave(&usbhid->lock, flags); if (unplug) usbhid->outtail = usbhid->outhead; @@ -344,12 +430,12 @@ static void hid_irq_out(struct urb *urb) clear_bit(HID_OUT_RUNNING, &usbhid->iofl); wake_up(&usbhid->wait); } - spin_unlock_irqrestore(&usbhid->outlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); return; } clear_bit(HID_OUT_RUNNING, &usbhid->iofl); - spin_unlock_irqrestore(&usbhid->outlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); wake_up(&usbhid->wait); } @@ -361,12 +447,11 @@ static void hid_ctrl(struct urb *urb) { struct hid_device *hid = urb->context; struct usbhid_device *usbhid = hid->driver_data; - unsigned long flags; - int unplug = 0; + int unplug = 0, status = urb->status; - spin_lock_irqsave(&usbhid->ctrllock, flags); + spin_lock(&usbhid->lock); - switch (urb->status) { + switch (status) { case 0: /* success */ if (usbhid->ctrl[usbhid->ctrltail].dir == USB_DIR_IN) hid_input_report(urb->context, @@ -383,7 +468,7 @@ static void hid_ctrl(struct urb *urb) break; default: /* error */ dev_warn(&urb->dev->dev, "ctrl urb status %d " - "received\n", urb->status); + "received\n", status); } if (unplug) @@ -396,19 +481,18 @@ static void hid_ctrl(struct urb *urb) clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); wake_up(&usbhid->wait); } - spin_unlock_irqrestore(&usbhid->ctrllock, flags); + spin_unlock(&usbhid->lock); return; } clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); - spin_unlock_irqrestore(&usbhid->ctrllock, flags); + spin_unlock(&usbhid->lock); wake_up(&usbhid->wait); } -void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir) +void __usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir) { int head; - unsigned long flags; struct usbhid_device *usbhid = hid->driver_data; int len = ((report->size - 1) >> 3) + 1 + (report->id > 0); @@ -416,18 +500,13 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns return; if (usbhid->urbout && dir == USB_DIR_OUT && report->type == HID_OUTPUT_REPORT) { - - spin_lock_irqsave(&usbhid->outlock, flags); - if ((head = (usbhid->outhead + 1) & (HID_OUTPUT_FIFO_SIZE - 1)) == usbhid->outtail) { - spin_unlock_irqrestore(&usbhid->outlock, flags); dev_warn(&hid->dev, "output queue full\n"); return; } 
usbhid->out[usbhid->outhead].raw_report = kmalloc(len, GFP_ATOMIC); if (!usbhid->out[usbhid->outhead].raw_report) { - spin_unlock_irqrestore(&usbhid->outlock, flags); dev_warn(&hid->dev, "output queueing failed\n"); return; } @@ -438,15 +517,10 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns if (!test_and_set_bit(HID_OUT_RUNNING, &usbhid->iofl)) if (hid_submit_out(hid)) clear_bit(HID_OUT_RUNNING, &usbhid->iofl); - - spin_unlock_irqrestore(&usbhid->outlock, flags); return; } - spin_lock_irqsave(&usbhid->ctrllock, flags); - if ((head = (usbhid->ctrlhead + 1) & (HID_CONTROL_FIFO_SIZE - 1)) == usbhid->ctrltail) { - spin_unlock_irqrestore(&usbhid->ctrllock, flags); dev_warn(&hid->dev, "control queue full\n"); return; } @@ -454,7 +528,6 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns if (dir == USB_DIR_OUT) { usbhid->ctrl[usbhid->ctrlhead].raw_report = kmalloc(len, GFP_ATOMIC); if (!usbhid->ctrl[usbhid->ctrlhead].raw_report) { - spin_unlock_irqrestore(&usbhid->ctrllock, flags); dev_warn(&hid->dev, "control queueing failed\n"); return; } @@ -467,15 +540,25 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns if (!test_and_set_bit(HID_CTRL_RUNNING, &usbhid->iofl)) if (hid_submit_ctrl(hid)) clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); +} - spin_unlock_irqrestore(&usbhid->ctrllock, flags); +void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir) +{ + struct usbhid_device *usbhid = hid->driver_data; + unsigned long flags; + + spin_lock_irqsave(&usbhid->lock, flags); + __usbhid_submit_report(hid, report, dir); + spin_unlock_irqrestore(&usbhid->lock, flags); } EXPORT_SYMBOL_GPL(usbhid_submit_report); static int usb_hidinput_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct hid_device *hid = input_get_drvdata(dev); + struct usbhid_device *usbhid = hid->driver_data; struct hid_field *field; + unsigned long flags; int offset; if (type == EV_FF) @@ -490,6 +573,15 @@ static int usb_hidinput_input_event(struct input_dev *dev, unsigned int type, un } hid_set_field(field, offset, value); + if (value) { + spin_lock_irqsave(&usbhid->lock, flags); + usbhid->ledcount++; + spin_unlock_irqrestore(&usbhid->lock, flags); + } else { + spin_lock_irqsave(&usbhid->lock, flags); + usbhid->ledcount--; + spin_unlock_irqrestore(&usbhid->lock, flags); + } usbhid_submit_report(hid, field->report, USB_DIR_OUT); return 0; @@ -538,15 +630,22 @@ int usbhid_open(struct hid_device *hid) struct usbhid_device *usbhid = hid->driver_data; int res; + mutex_lock(&hid_open_mut); if (!hid->open++) { res = usb_autopm_get_interface(usbhid->intf); + /* the device must be awake to reliable request remote wakeup */ if (res < 0) { hid->open--; + mutex_unlock(&hid_open_mut); return -EIO; } + usbhid->intf->needs_remote_wakeup = 1; + if (hid_start_in(hid)) + hid_io_error(hid); + + usb_autopm_put_interface(usbhid->intf); } - if (hid_start_in(hid)) - hid_io_error(hid); + mutex_unlock(&hid_open_mut); return 0; } @@ -554,10 +653,22 @@ void usbhid_close(struct hid_device *hid) { struct usbhid_device *usbhid = hid->driver_data; + mutex_lock(&hid_open_mut); + + /* protecting hid->open to make sure we don't restart + * data acquistion due to a resumption we no longer + * care about + */ + spin_lock_irq(&usbhid->lock); if (!--hid->open) { + spin_unlock_irq(&usbhid->lock); usb_kill_urb(usbhid->urbin); - usb_autopm_put_interface(usbhid->intf); + flush_scheduled_work(); + 
usbhid->intf->needs_remote_wakeup = 0; + } else { + spin_unlock_irq(&usbhid->lock); } + mutex_unlock(&hid_open_mut); } /* @@ -687,6 +798,25 @@ static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t co return ret; } +static void usbhid_restart_queues(struct usbhid_device *usbhid) +{ + if (usbhid->urbout) + usbhid_restart_out_queue(usbhid); + usbhid_restart_ctrl_queue(usbhid); +} + +static void __usbhid_restart_queues(struct work_struct *work) +{ + struct usbhid_device *usbhid = + container_of(work, struct usbhid_device, restart_work); + int r; + + r = usb_autopm_get_interface(usbhid->intf); + if (r < 0) + return; + usb_autopm_put_interface(usbhid->intf); +} + static void hid_free_buffers(struct usb_device *dev, struct hid_device *hid) { struct usbhid_device *usbhid = hid->driver_data; @@ -850,11 +980,11 @@ static int usbhid_start(struct hid_device *hid) init_waitqueue_head(&usbhid->wait); INIT_WORK(&usbhid->reset_work, hid_reset); + INIT_WORK(&usbhid->restart_work, __usbhid_restart_queues); setup_timer(&usbhid->io_retry, hid_retry_timeout, (unsigned long) hid); - spin_lock_init(&usbhid->inlock); - spin_lock_init(&usbhid->outlock); - spin_lock_init(&usbhid->ctrllock); + spin_lock_init(&usbhid->lock); + spin_lock_init(&usbhid->lock); usbhid->intf = intf; usbhid->ifnum = interface->desc.bInterfaceNumber; @@ -906,15 +1036,14 @@ static void usbhid_stop(struct hid_device *hid) return; clear_bit(HID_STARTED, &usbhid->iofl); - spin_lock_irq(&usbhid->inlock); /* Sync with error handler */ + spin_lock_irq(&usbhid->lock); /* Sync with error handler */ set_bit(HID_DISCONNECTED, &usbhid->iofl); - spin_unlock_irq(&usbhid->inlock); + spin_unlock_irq(&usbhid->lock); usb_kill_urb(usbhid->urbin); usb_kill_urb(usbhid->urbout); usb_kill_urb(usbhid->urbctrl); - del_timer_sync(&usbhid->io_retry); - cancel_work_sync(&usbhid->reset_work); + hid_cancel_delayed_stuff(usbhid); if (hid->claimed & HID_CLAIMED_INPUT) hidinput_disconnect(hid); @@ -935,12 +1064,28 @@ static void usbhid_stop(struct hid_device *hid) hid_free_buffers(hid_to_usb_dev(hid), hid); } +static int usbhid_power(struct hid_device *hid, int lvl) +{ + int r = 0; + + switch (lvl) { + case PM_HINT_FULLON: + r = usbhid_get_power(hid); + break; + case PM_HINT_NORMAL: + usbhid_put_power(hid); + break; + } + return r; +} + static struct hid_ll_driver usb_hid_driver = { .parse = usbhid_parse, .start = usbhid_start, .stop = usbhid_stop, .open = usbhid_open, .close = usbhid_close, + .power = usbhid_power, .hidinput_input_event = usb_hidinput_input_event, }; @@ -1049,19 +1194,75 @@ static void hid_disconnect(struct usb_interface *intf) kfree(usbhid); } +static void hid_cancel_delayed_stuff(struct usbhid_device *usbhid) +{ + del_timer_sync(&usbhid->io_retry); + cancel_work_sync(&usbhid->restart_work); + cancel_work_sync(&usbhid->reset_work); +} + +static void hid_cease_io(struct usbhid_device *usbhid) +{ + del_timer(&usbhid->io_retry); + usb_kill_urb(usbhid->urbin); + usb_kill_urb(usbhid->urbctrl); + usb_kill_urb(usbhid->urbout); + flush_scheduled_work(); +} + static int hid_suspend(struct usb_interface *intf, pm_message_t message) { - struct hid_device *hid = usb_get_intfdata (intf); + struct hid_device *hid = usb_get_intfdata(intf); struct usbhid_device *usbhid = hid->driver_data; + struct usb_device *udev = interface_to_usbdev(intf); + int status; - if (!test_bit(HID_STARTED, &usbhid->iofl)) - return 0; + if (udev->auto_pm) { + spin_lock_irq(&usbhid->lock); /* Sync with error handler */ + if (!test_bit(HID_RESET_PENDING, &usbhid->iofl) + 
&& !test_bit(HID_CLEAR_HALT, &usbhid->iofl) + && !test_bit(HID_OUT_RUNNING, &usbhid->iofl) + && !test_bit(HID_CTRL_RUNNING, &usbhid->iofl) + && !test_bit(HID_KEYS_PRESSED, &usbhid->iofl) + && (!usbhid->ledcount || ignoreled)) + { + set_bit(HID_REPORTED_IDLE, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + } else { + usbhid_mark_busy(usbhid); + spin_unlock_irq(&usbhid->lock); + return -EBUSY; + } - spin_lock_irq(&usbhid->inlock); /* Sync with error handler */ - set_bit(HID_SUSPENDED, &usbhid->iofl); - spin_unlock_irq(&usbhid->inlock); - del_timer_sync(&usbhid->io_retry); - usb_kill_urb(usbhid->urbin); + } else { + spin_lock_irq(&usbhid->lock); + set_bit(HID_REPORTED_IDLE, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + if (usbhid_wait_io(hid) < 0) + return -EIO; + } + + if (!ignoreled && udev->auto_pm) { + spin_lock_irq(&usbhid->lock); + if (test_bit(HID_LED_ON, &usbhid->iofl)) { + spin_unlock_irq(&usbhid->lock); + usbhid_mark_busy(usbhid); + return -EBUSY; + } + spin_unlock_irq(&usbhid->lock); + } + + hid_cancel_delayed_stuff(usbhid); + hid_cease_io(usbhid); + + if (udev->auto_pm && test_bit(HID_KEYS_PRESSED, &usbhid->iofl)) { + /* lost race against keypresses */ + status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_mark_busy(usbhid); + return -EBUSY; + } dev_dbg(&intf->dev, "suspend\n"); return 0; } @@ -1075,18 +1276,33 @@ static int hid_resume(struct usb_interface *intf) if (!test_bit(HID_STARTED, &usbhid->iofl)) return 0; - clear_bit(HID_SUSPENDED, &usbhid->iofl); + clear_bit(HID_REPORTED_IDLE, &usbhid->iofl); + usbhid_mark_busy(usbhid); + + if (test_bit(HID_CLEAR_HALT, &usbhid->iofl) || + test_bit(HID_RESET_PENDING, &usbhid->iofl)) + schedule_work(&usbhid->reset_work); usbhid->retry_delay = 0; status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_restart_queues(usbhid); + dev_dbg(&intf->dev, "resume status %d\n", status); - return status; + return 0; } /* Treat USB reset pretty much the same as suspend/resume */ static int hid_pre_reset(struct usb_interface *intf) { - /* FIXME: What if the interface is already suspended? */ - hid_suspend(intf, PMSG_ON); + struct hid_device *hid = usb_get_intfdata(intf); + struct usbhid_device *usbhid = hid->driver_data; + + spin_lock_irq(&usbhid->lock); + set_bit(HID_RESET_PENDING, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + hid_cease_io(usbhid); + return 0; } @@ -1094,11 +1310,35 @@ static int hid_pre_reset(struct usb_interface *intf) static int hid_post_reset(struct usb_interface *intf) { struct usb_device *dev = interface_to_usbdev (intf); - + struct hid_device *hid = usb_get_intfdata(intf); + struct usbhid_device *usbhid = hid->driver_data; + int status; + + spin_lock_irq(&usbhid->lock); + clear_bit(HID_RESET_PENDING, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); hid_set_idle(dev, intf->cur_altsetting->desc.bInterfaceNumber, 0, 0); /* FIXME: Any more reinitialization needed? 
*/ + status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_restart_queues(usbhid); - return hid_resume(intf); + return 0; +} + +int usbhid_get_power(struct hid_device *hid) +{ + struct usbhid_device *usbhid = hid->driver_data; + + return usb_autopm_get_interface(usbhid->intf); +} + +void usbhid_put_power(struct hid_device *hid) +{ + struct usbhid_device *usbhid = hid->driver_data; + + usb_autopm_put_interface(usbhid->intf); } static struct usb_device_id hid_usb_ids [] = { @@ -1134,7 +1374,11 @@ static struct hid_driver hid_usb_driver = { static int __init hid_init(void) { - int retval; + int retval = -ENOMEM; + + resumption_waker = create_freezeable_workqueue("usbhid_resumer"); + if (!resumption_waker) + goto no_queue; retval = hid_register_driver(&hid_usb_driver); if (retval) goto hid_register_fail; @@ -1158,6 +1402,8 @@ hiddev_init_fail: usbhid_quirks_init_fail: hid_unregister_driver(&hid_usb_driver); hid_register_fail: + destroy_workqueue(resumption_waker); +no_queue: return retval; } @@ -1167,6 +1413,7 @@ static void __exit hid_exit(void) hiddev_exit(); usbhid_quirks_exit(); hid_unregister_driver(&hid_usb_driver); + destroy_workqueue(resumption_waker); } module_init(hid_init); diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 1f5b5d4c3c34..fd7375627e5d 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -249,10 +249,12 @@ static int hiddev_release(struct inode * inode, struct file * file) spin_unlock_irqrestore(&list->hiddev->list_lock, flags); if (!--list->hiddev->open) { - if (list->hiddev->exist) + if (list->hiddev->exist) { usbhid_close(list->hiddev->hid); - else + usbhid_put_power(list->hiddev->hid); + } else { kfree(list->hiddev); + } } kfree(list); @@ -303,6 +305,17 @@ static int hiddev_open(struct inode *inode, struct file *file) list_add_tail(&list->node, &hiddev_table[i]->list); spin_unlock_irq(&list->hiddev->list_lock); + if (!list->hiddev->open++) + if (list->hiddev->exist) { + struct hid_device *hid = hiddev_table[i]->hid; + res = usbhid_get_power(hid); + if (res < 0) { + res = -EIO; + goto bail; + } + usbhid_open(hid); + } + return 0; bail: file->private_data = NULL; diff --git a/drivers/hid/usbhid/usbhid.h b/drivers/hid/usbhid/usbhid.h index 9eb30564be9c..08f505ca2e3d 100644 --- a/drivers/hid/usbhid/usbhid.h +++ b/drivers/hid/usbhid/usbhid.h @@ -38,7 +38,10 @@ int usbhid_wait_io(struct hid_device* hid); void usbhid_close(struct hid_device *hid); int usbhid_open(struct hid_device *hid); void usbhid_init_reports(struct hid_device *hid); -void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir); +void usbhid_submit_report +(struct hid_device *hid, struct hid_report *report, unsigned char dir); +int usbhid_get_power(struct hid_device *hid); +void usbhid_put_power(struct hid_device *hid); /* iofl flags */ #define HID_CTRL_RUNNING 1 @@ -49,6 +52,9 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns #define HID_CLEAR_HALT 6 #define HID_DISCONNECTED 7 #define HID_STARTED 8 +#define HID_REPORTED_IDLE 9 +#define HID_KEYS_PRESSED 10 +#define HID_LED_ON 11 /* * USB-specific HID struct, to be pointed to @@ -66,7 +72,6 @@ struct usbhid_device { struct urb *urbin; /* Input URB */ char *inbuf; /* Input buffer */ dma_addr_t inbuf_dma; /* Input buffer dma */ - spinlock_t inlock; /* Input fifo spinlock */ struct urb *urbctrl; /* Control URB */ struct usb_ctrlrequest *cr; /* Control request struct */ @@ -75,21 +80,22 @@ struct usbhid_device { 
unsigned char ctrlhead, ctrltail; /* Control fifo head & tail */ char *ctrlbuf; /* Control buffer */ dma_addr_t ctrlbuf_dma; /* Control buffer dma */ - spinlock_t ctrllock; /* Control fifo spinlock */ struct urb *urbout; /* Output URB */ struct hid_output_fifo out[HID_CONTROL_FIFO_SIZE]; /* Output pipe fifo */ unsigned char outhead, outtail; /* Output pipe fifo head & tail */ char *outbuf; /* Output buffer */ dma_addr_t outbuf_dma; /* Output buffer dma */ - spinlock_t outlock; /* Output fifo spinlock */ + spinlock_t lock; /* fifo spinlock */ unsigned long iofl; /* I/O flags (CTRL_RUNNING, OUT_RUNNING) */ struct timer_list io_retry; /* Retry timer */ unsigned long stop_retry; /* Time to give up, in jiffies */ unsigned int retry_delay; /* Delay length in ms */ struct work_struct reset_work; /* Task context for resets */ + struct work_struct restart_work; /* waking up for output to be done in a task */ wait_queue_head_t wait; /* For sleeping */ + int ledcount; /* counting the number of active leds */ }; #define hid_to_usb_dev(hid_dev) \ diff --git a/include/linux/hid.h b/include/linux/hid.h index fa8ee9cef7be..6ac7795a8acc 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -603,12 +603,17 @@ struct hid_ll_driver { int (*open)(struct hid_device *hdev); void (*close)(struct hid_device *hdev); + int (*power)(struct hid_device *hdev, int level); + int (*hidinput_input_event) (struct input_dev *idev, unsigned int type, unsigned int code, int value); int (*parse)(struct hid_device *hdev); }; +#define PM_HINT_FULLON 1<<5 +#define PM_HINT_NORMAL 1<<1 + /* Applications from HID Usage Tables 4/8/99 Version 1.1 */ /* We ignore a few input applications that are not widely used */ #define IS_INPUT_APPLICATION(a) (((a >= 0x00010000) && (a <= 0x00010008)) || (a == 0x00010080) || (a == 0x000c0001) || (a == 0x000d0002)) @@ -641,6 +646,7 @@ int hidinput_find_field(struct hid_device *hid, unsigned int type, unsigned int void hid_output_report(struct hid_report *report, __u8 *data); struct hid_device *hid_allocate_device(void); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); +int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); /**
-- cgit v1.2.3-71-gd317

From f18df228997fb716990590d248663981a15f17d4 Mon Sep 17 00:00:00 2001
From: Mingming Cao
Date: Tue, 13 Jan 2009 16:43:09 +0100
Subject: quota: Add quota reservation support

Delayed allocation defers block allocation until the dirty pages are
flushed out, and doing the quota charge/check only at that time is too
late. But we can't charge the quota blocks until the blocks are really
allocated, otherwise users could get overcharged after a reboot
following a system crash.

This patch adds quota reservation for delayed allocation. Quota blocks
are reserved in memory; neither the inode nor the quota gets dirtied
until the actual block allocation time.
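
A rough sketch of the intended calling pattern follows. The surrounding
function is hypothetical; it assumes only the reservation helpers that
this patch adds to include/linux/quotaops.h (the claim/release side
comes in a later patch):

/*
 * Hypothetical write path of a delayed-allocation filesystem: quota is
 * only reserved in memory here, nothing is charged or written to disk
 * until the blocks are actually allocated at writeback time.
 */
static int example_delalloc_write_begin(struct inode *inode, qsize_t nr_blocks)
{
	if (vfs_dq_reserve_block(inode, nr_blocks))
		return -EDQUOT;	/* over quota: fail the write up front */

	/* mark the pages dirty; real allocation happens at flush-out time */
	return 0;
}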
Signed-off-by: Mingming Cao Signed-off-by: Jan Kara --- fs/dquot.c | 117 ++++++++++++++++++++++++++++++++++------------- include/linux/quota.h | 3 ++ include/linux/quotaops.h | 21 +++++++++ 3 files changed, 110 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/fs/dquot.c b/fs/dquot.c index bca3cac4bee7..9b1c4d3c9d83 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -899,6 +899,11 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) dquot->dq_dqb.dqb_curspace += number; } +static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_rsvspace += number; +} + static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || @@ -1068,7 +1073,11 @@ err_out: kfree_skb(skb); } #endif - +/* + * Write warnings to the console and send warning messages over netlink. + * + * Note that this function can sleep. + */ static inline void flush_warnings(struct dquot * const *dquots, char *warntype) { int i; @@ -1129,13 +1138,18 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) /* needs dq_data_lock */ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) { + qsize_t tspace; + *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) return QUOTA_OK; + tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + + space; + if (dquot->dq_dqb.dqb_bhardlimit && - dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit && + tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BHARDWARN; @@ -1143,7 +1157,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war } if (dquot->dq_dqb.dqb_bsoftlimit && - dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && + tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (!prealloc) @@ -1152,7 +1166,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war } if (dquot->dq_dqb.dqb_bsoftlimit && - dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && + tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { if (!prealloc) { *warntype = QUOTA_NL_BSOFTWARN; @@ -1306,51 +1320,92 @@ void vfs_dq_drop(struct inode *inode) /* * This operation can block, but only after everything is updated */ -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) +int __dquot_alloc_space(struct inode *inode, qsize_t number, + int warn, int reserve) { - int cnt, ret = NO_QUOTA; + int cnt, ret = QUOTA_OK; char warntype[MAXQUOTAS]; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { -out_add: - inode_add_bytes(inode, number); - return QUOTA_OK; - } for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { /* Now we can do reliable test... 
*/ - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - goto out_add; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (inode->i_dquot[cnt] == NODQUOT) continue; - if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA) - goto warn_put_all; + if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) + == NO_QUOTA) { + ret = NO_QUOTA; + goto out_unlock; + } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (inode->i_dquot[cnt] == NODQUOT) continue; - dquot_incr_space(inode->i_dquot[cnt], number); + if (reserve) + dquot_resv_space(inode->i_dquot[cnt], number); + else + dquot_incr_space(inode->i_dquot[cnt], number); } - inode_add_bytes(inode, number); - ret = QUOTA_OK; -warn_put_all: + if (!reserve) + inode_add_bytes(inode, number); +out_unlock: spin_unlock(&dq_data_lock); - if (ret == QUOTA_OK) - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); flush_warnings(inode->i_dquot, warntype); + return ret; +} + +int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) +{ + int cnt, ret = QUOTA_OK; + + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex + */ + if (IS_NOQUOTA(inode)) { + inode_add_bytes(inode, number); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { + inode_add_bytes(inode, number); + goto out_unlock; + } + + ret = __dquot_alloc_space(inode, number, warn, 0); + if (ret == NO_QUOTA) + goto out_unlock; + + /* Dirtify all the dquots - this can block when journalling */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (inode->i_dquot[cnt]) + mark_dquot_dirty(inode->i_dquot[cnt]); +out_unlock: up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: + return ret; +} + +int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) +{ + int ret = QUOTA_OK; + + if (IS_NOQUOTA(inode)) + goto out; + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) + goto out_unlock; + + ret = __dquot_alloc_space(inode, number, warn, 1); +out_unlock: + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: return ret; } +EXPORT_SYMBOL(dquot_reserve_space); /* * This operation can block, but only after everything is updated @@ -2057,7 +2112,7 @@ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) spin_lock(&dq_data_lock); di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); - di->dqb_curspace = dm->dqb_curspace; + di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace; di->dqb_ihardlimit = dm->dqb_ihardlimit; di->dqb_isoftlimit = dm->dqb_isoftlimit; di->dqb_curinodes = dm->dqb_curinodes; @@ -2097,7 +2152,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) spin_lock(&dq_data_lock); if (di->dqb_valid & QIF_SPACE) { - dm->dqb_curspace = di->dqb_curspace; + dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; check_blim = 1; __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } diff --git a/include/linux/quota.h b/include/linux/quota.h index d72d5d84fde5..54b837fa64f2 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -198,6 +198,7 @@ struct mem_dqblk { qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */ qsize_t dqb_curspace; /* current used space */ + qsize_t dqb_rsvspace; /* current reserved space for delalloc*/ qsize_t dqb_ihardlimit; /* 
absolute limit on allocated inodes */ qsize_t dqb_isoftlimit; /* preferred inode limit */ qsize_t dqb_curinodes; /* current # allocated inodes */ @@ -308,6 +309,8 @@ struct dquot_operations { int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ + /* reserve quota for delayed block allocation */ + int (*reserve_space) (struct inode *, qsize_t, int); }; /* Operations handling requests from userspace */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 0b35b3a1be05..3e3a0d2874d9 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -183,6 +183,16 @@ static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) return ret; } +static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) +{ + if (sb_any_quota_active(inode->i_sb)) { + /* Used space is updated in alloc_space() */ + if (inode->i_sb->dq_op->reserve_space(inode, nr, 0) == NO_QUOTA) + return 1; + } + return 0; +} + static inline int vfs_dq_alloc_inode(struct inode *inode) { if (sb_any_quota_active(inode->i_sb)) { @@ -339,6 +349,11 @@ static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) return 0; } +static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) +{ + return 0; +} + static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) { inode_sub_bytes(inode, nr); @@ -376,6 +391,12 @@ static inline int vfs_dq_alloc_block(struct inode *inode, qsize_t nr) nr << inode->i_sb->s_blocksize_bits); } +static inline int vfs_dq_reserve_block(struct inode *inode, qsize_t nr) +{ + return vfs_dq_reserve_space(inode, + nr << inode->i_blkbits); +} + static inline void vfs_dq_free_block_nodirty(struct inode *inode, qsize_t nr) { vfs_dq_free_space_nodirty(inode, nr << inode->i_sb->s_blocksize_bits); -- cgit v1.2.3-71-gd317 From 740d9dcd949a986c88886a591054a0cdb89ef669 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Tue, 13 Jan 2009 16:43:14 +0100 Subject: quota: Add quota reservation claim and released operations Reserved quota will be claimed at the block allocation time. Over-booked quota could be returned back with the release callback function. 
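
As a rough illustration of the intended life cycle of a reservation, a
hypothetical writeback helper is sketched below; it assumes the vfs_dq_*
wrappers added by this patch on top of the reservation support above:

/*
 * Hypothetical writeback path: the quota reserved at write time is
 * either converted into a real charge for the blocks that were
 * allocated, or handed back if the filesystem over-reserved.
 */
static void example_delalloc_writeback(struct inode *inode,
					qsize_t reserved, qsize_t allocated)
{
	/* turn the reservation into used space for the allocated blocks */
	vfs_dq_claim_block(inode, allocated);

	/* return any unused part of the reservation to the quota */
	if (reserved > allocated)
		vfs_dq_release_reservation_block(inode, reserved - allocated);
}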
Signed-off-by: Mingming Cao Signed-off-by: Jan Kara --- fs/dquot.c | 110 +++++++++++++++++++++++++++++++++++++++++++++-- include/linux/quota.h | 6 +++ include/linux/quotaops.h | 53 +++++++++++++++++++++++ 3 files changed, 165 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/dquot.c b/fs/dquot.c index 9b1c4d3c9d83..2916f91ca40c 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -904,6 +904,23 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) dquot->dq_dqb.dqb_rsvspace += number; } +/* + * Claim reserved quota space + */ +static void dquot_claim_reserved_space(struct dquot *dquot, + qsize_t number) +{ + WARN_ON(dquot->dq_dqb.dqb_rsvspace < number); + dquot->dq_dqb.dqb_curspace += number; + dquot->dq_dqb.dqb_rsvspace -= number; +} + +static inline +void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_rsvspace -= number; +} + static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || @@ -1452,6 +1469,72 @@ warn_put_all: return ret; } +int dquot_claim_space(struct inode *inode, qsize_t number) +{ + int cnt; + int ret = QUOTA_OK; + + if (IS_NOQUOTA(inode)) { + inode_add_bytes(inode, number); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + inode_add_bytes(inode, number); + goto out; + } + + spin_lock(&dq_data_lock); + /* Claim reserved quotas to allocated quotas */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt] != NODQUOT) + dquot_claim_reserved_space(inode->i_dquot[cnt], + number); + } + /* Update inode bytes */ + inode_add_bytes(inode, number); + spin_unlock(&dq_data_lock); + /* Dirtify all the dquots - this can block when journalling */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (inode->i_dquot[cnt]) + mark_dquot_dirty(inode->i_dquot[cnt]); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: + return ret; +} +EXPORT_SYMBOL(dquot_claim_space); + +/* + * Release reserved quota space + */ +void dquot_release_reserved_space(struct inode *inode, qsize_t number) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + goto out; + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) + goto out_unlock; + + spin_lock(&dq_data_lock); + /* Release reserved dquots */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt] != NODQUOT) + dquot_free_reserved_space(inode->i_dquot[cnt], number); + } + spin_unlock(&dq_data_lock); + +out_unlock: + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: + return; +} +EXPORT_SYMBOL(dquot_release_reserved_space); + /* * This operation can block, but only after everything is updated */ @@ -1528,6 +1611,19 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) return QUOTA_OK; } +/* + * call back function, get reserved quota space from underlying fs + */ +qsize_t dquot_get_reserved_space(struct inode *inode) +{ + qsize_t reserved_space = 0; + + if (sb_any_quota_active(inode->i_sb) && + inode->i_sb->dq_op->get_reserved_space) + reserved_space = inode->i_sb->dq_op->get_reserved_space(inode); + return reserved_space; +} + /* * Transfer the number of inode and blocks from one diskquota to an other. 
* @@ -1536,7 +1632,8 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) */ int dquot_transfer(struct inode *inode, struct iattr *iattr) { - qsize_t space; + qsize_t space, cur_space; + qsize_t rsv_space = 0; struct dquot *transfer_from[MAXQUOTAS]; struct dquot *transfer_to[MAXQUOTAS]; int cnt, ret = QUOTA_OK; @@ -1575,7 +1672,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) goto put_all; } spin_lock(&dq_data_lock); - space = inode_get_bytes(inode); + cur_space = inode_get_bytes(inode); + rsv_space = dquot_get_reserved_space(inode); + space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (transfer_to[cnt] == NODQUOT) @@ -1604,11 +1703,14 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) warntype_from_space[cnt] = info_bdq_free(transfer_from[cnt], space); dquot_decr_inodes(transfer_from[cnt], 1); - dquot_decr_space(transfer_from[cnt], space); + dquot_decr_space(transfer_from[cnt], cur_space); + dquot_free_reserved_space(transfer_from[cnt], + rsv_space); } dquot_incr_inodes(transfer_to[cnt], 1); - dquot_incr_space(transfer_to[cnt], space); + dquot_incr_space(transfer_to[cnt], cur_space); + dquot_resv_space(transfer_to[cnt], rsv_space); inode->i_dquot[cnt] = transfer_to[cnt]; } diff --git a/include/linux/quota.h b/include/linux/quota.h index 54b837fa64f2..a510d91561f4 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -311,6 +311,12 @@ struct dquot_operations { int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ /* reserve quota for delayed block allocation */ int (*reserve_space) (struct inode *, qsize_t, int); + /* claim reserved quota for delayed alloc */ + int (*claim_space) (struct inode *, qsize_t); + /* release rsved quota for delayed alloc */ + void (*release_rsv) (struct inode *, qsize_t); + /* get reserved quota for delayed alloc */ + qsize_t (*get_reserved_space) (struct inode *); }; /* Operations handling requests from userspace */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 3e3a0d2874d9..7369d04e0a86 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -35,6 +35,11 @@ void dquot_destroy(struct dquot *dquot); int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); int dquot_alloc_inode(const struct inode *inode, qsize_t number); +int dquot_reserve_space(struct inode *inode, qsize_t number, int prealloc); +int dquot_claim_space(struct inode *inode, qsize_t number); +void dquot_release_reserved_space(struct inode *inode, qsize_t number); +qsize_t dquot_get_reserved_space(struct inode *inode); + int dquot_free_space(struct inode *inode, qsize_t number); int dquot_free_inode(const struct inode *inode, qsize_t number); @@ -203,6 +208,31 @@ static inline int vfs_dq_alloc_inode(struct inode *inode) return 0; } +/* + * Convert in-memory reserved quotas to real consumed quotas + */ +static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) +{ + if (sb_any_quota_active(inode->i_sb)) { + if (inode->i_sb->dq_op->claim_space(inode, nr) == NO_QUOTA) + return 1; + } else + inode_add_bytes(inode, nr); + + mark_inode_dirty(inode); + return 0; +} + +/* + * Release reserved (in-memory) quotas + */ +static inline +void vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) +{ + if (sb_any_quota_active(inode->i_sb)) + inode->i_sb->dq_op->release_rsv(inode, nr); +} + static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) { if 
(sb_any_quota_active(inode->i_sb)) @@ -354,6 +384,17 @@ static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) return 0; } +static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) +{ + return vfs_dq_alloc_space(inode, nr); +} + +static inline +int vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) +{ + return 0; +} + static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) { inode_sub_bytes(inode, nr); @@ -397,6 +438,18 @@ static inline int vfs_dq_reserve_block(struct inode *inode, qsize_t nr) nr << inode->i_blkbits); } +static inline int vfs_dq_claim_block(struct inode *inode, qsize_t nr) +{ + return vfs_dq_claim_space(inode, + nr << inode->i_blkbits); +} + +static inline +void vfs_dq_release_reservation_block(struct inode *inode, qsize_t nr) +{ + vfs_dq_release_reservation_space(inode, nr << inode->i_blkbits); +} + static inline void vfs_dq_free_block_nodirty(struct inode *inode, qsize_t nr) { vfs_dq_free_space_nodirty(inode, nr << inode->i_sb->s_blocksize_bits); -- cgit v1.2.3-71-gd317 From 9900ba3487f9ba392db30e12d210f768a90abb13 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Wed, 14 Jan 2009 16:18:57 +0100 Subject: quota: Use inode->i_blkbits to get block bits Andrew has suggested to use inode->i_blkbits to get the block bits info, rather than use super block's blockbits. That should be faster and emit less code. Signed-off-by: Mingming Cao Signed-off-by: Jan Kara --- include/linux/quotaops.h | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 7369d04e0a86..69b502e5eba0 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -410,38 +410,32 @@ static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_prealloc_space_nodirty(inode, - nr << inode->i_sb->s_blocksize_bits); + return vfs_dq_prealloc_space_nodirty(inode, nr << inode->i_blkbits); } static inline int vfs_dq_prealloc_block(struct inode *inode, qsize_t nr) { - return vfs_dq_prealloc_space(inode, - nr << inode->i_sb->s_blocksize_bits); + return vfs_dq_prealloc_space(inode, nr << inode->i_blkbits); } static inline int vfs_dq_alloc_block_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space_nodirty(inode, - nr << inode->i_sb->s_blocksize_bits); + return vfs_dq_alloc_space_nodirty(inode, nr << inode->i_blkbits); } static inline int vfs_dq_alloc_block(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space(inode, - nr << inode->i_sb->s_blocksize_bits); + return vfs_dq_alloc_space(inode, nr << inode->i_blkbits); } static inline int vfs_dq_reserve_block(struct inode *inode, qsize_t nr) { - return vfs_dq_reserve_space(inode, - nr << inode->i_blkbits); + return vfs_dq_reserve_space(inode, nr << inode->i_blkbits); } static inline int vfs_dq_claim_block(struct inode *inode, qsize_t nr) { - return vfs_dq_claim_space(inode, - nr << inode->i_blkbits); + return vfs_dq_claim_space(inode, nr << inode->i_blkbits); } static inline @@ -452,12 +446,12 @@ void vfs_dq_release_reservation_block(struct inode *inode, qsize_t nr) static inline void vfs_dq_free_block_nodirty(struct inode *inode, qsize_t nr) { - vfs_dq_free_space_nodirty(inode, nr << inode->i_sb->s_blocksize_bits); + vfs_dq_free_space_nodirty(inode, nr << inode->i_blkbits); } static inline void vfs_dq_free_block(struct inode *inode, qsize_t nr) { - 
vfs_dq_free_space(inode, nr << inode->i_sb->s_blocksize_bits); + vfs_dq_free_space(inode, nr << inode->i_blkbits); } /* -- cgit v1.2.3-71-gd317 From dd6f3c6d5a26a282521f15a183fdc2d6f35cfa0f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Jan 2009 16:01:43 +0100 Subject: quota: Remove NODQUOT macro Remove this macro which is just a definition of NULL. Fix a few coding style issues along the way. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 70 ++++++++++++++++++++++++++------------------------- include/linux/quota.h | 2 -- 2 files changed, 36 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index e840fa2b112e..4881db32e56d 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -253,7 +253,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block if (dquot->dq_sb == sb && dquot->dq_id == id && dquot->dq_type == type) return dquot; } - return NODQUOT; + return NULL; } /* Add a dquot to the tail of the free list */ @@ -696,7 +696,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) dquot = sb->dq_op->alloc_dquot(sb, type); if(!dquot) - return NODQUOT; + return NULL; mutex_init(&dquot->dq_lock); INIT_LIST_HEAD(&dquot->dq_free); @@ -722,10 +722,10 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) struct dquot *dqget(struct super_block *sb, unsigned int id, int type) { unsigned int hashent = hashfn(sb, id, type); - struct dquot *dquot = NODQUOT, *empty = NODQUOT; + struct dquot *dquot = NULL, *empty = NULL; if (!sb_has_quota_active(sb, type)) - return NODQUOT; + return NULL; we_slept: spin_lock(&dq_list_lock); spin_lock(&dq_state_lock); @@ -736,15 +736,17 @@ we_slept: } spin_unlock(&dq_state_lock); - if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) { - if (empty == NODQUOT) { + dquot = find_dquot(hashent, sb, id, type); + if (!dquot) { + if (!empty) { spin_unlock(&dq_list_lock); - if ((empty = get_empty_dquot(sb, type)) == NODQUOT) + empty = get_empty_dquot(sb, type); + if (!empty) schedule(); /* Try to wait for a moment... 
*/ goto we_slept; } dquot = empty; - empty = NODQUOT; + empty = NULL; dquot->dq_id = id; /* all dquots go on the inuse_list */ put_inuse(dquot); @@ -766,7 +768,7 @@ we_slept: /* Read the dquot and instantiate it (everything done only if needed) */ if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) { dqput(dquot); - dquot = NODQUOT; + dquot = NULL; goto out; } #ifdef __DQUOT_PARANOIA @@ -787,9 +789,9 @@ static int dqinit_needed(struct inode *inode, int type) if (IS_NOQUOTA(inode)) return 0; if (type != -1) - return inode->i_dquot[type] == NODQUOT; + return !inode->i_dquot[type]; for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt] == NODQUOT) + if (!inode->i_dquot[cnt]) return 1; return 0; } @@ -840,8 +842,8 @@ static int remove_inode_dquot_ref(struct inode *inode, int type, { struct dquot *dquot = inode->i_dquot[type]; - inode->i_dquot[type] = NODQUOT; - if (dquot != NODQUOT) { + inode->i_dquot[type] = NULL; + if (dquot) { if (dqput_blocks(dquot)) { #ifdef __DQUOT_PARANOIA if (atomic_read(&dquot->dq_count) != 1) @@ -1112,7 +1114,7 @@ static inline void flush_warnings(struct dquot * const *dquots, char *warntype) int i; for (i = 0; i < MAXQUOTAS; i++) - if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN && + if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN && !warning_issued(dquots[i], warntype[i])) { #ifdef CONFIG_PRINT_QUOTA_WARNING print_warning(dquots[i], warntype[i]); @@ -1249,7 +1251,7 @@ int dquot_initialize(struct inode *inode, int type) { unsigned int id = 0; int cnt, ret = 0; - struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT }; + struct dquot *got[MAXQUOTAS] = { NULL, NULL }; struct super_block *sb = inode->i_sb; /* First test before acquiring mutex - solves deadlocks when we @@ -1282,9 +1284,9 @@ int dquot_initialize(struct inode *inode, int type) /* Avoid races with quotaoff() */ if (!sb_has_quota_active(sb, cnt)) continue; - if (inode->i_dquot[cnt] == NODQUOT) { + if (!inode->i_dquot[cnt]) { inode->i_dquot[cnt] = got[cnt]; - got[cnt] = NODQUOT; + got[cnt] = NULL; } } out_err: @@ -1307,7 +1309,7 @@ int dquot_drop(struct inode *inode) down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { put[cnt] = inode->i_dquot[cnt]; - inode->i_dquot[cnt] = NODQUOT; + inode->i_dquot[cnt] = NULL; } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1332,7 +1334,7 @@ void vfs_dq_drop(struct inode *inode) * must assure that nobody can come after the DQUOT_DROP and * add quota pointers back anyway */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt] != NODQUOT) + if (inode->i_dquot[cnt]) break; if (cnt < MAXQUOTAS) inode->i_sb->dq_op->drop(inode); @@ -1363,7 +1365,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] == NODQUOT) + if (!inode->i_dquot[cnt]) continue; if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA) { @@ -1372,7 +1374,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] == NODQUOT) + if (!inode->i_dquot[cnt]) continue; if (reserve) dquot_resv_space(inode->i_dquot[cnt], number); @@ -1461,14 +1463,14 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] == NODQUOT) + if (!inode->i_dquot[cnt]) continue; if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) == NO_QUOTA) 
goto warn_put_all; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] == NODQUOT) + if (!inode->i_dquot[cnt]) continue; dquot_incr_inodes(inode->i_dquot[cnt], number); } @@ -1506,7 +1508,7 @@ int dquot_claim_space(struct inode *inode, qsize_t number) spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] != NODQUOT) + if (inode->i_dquot[cnt]) dquot_claim_reserved_space(inode->i_dquot[cnt], number); } @@ -1540,7 +1542,7 @@ void dquot_release_reserved_space(struct inode *inode, qsize_t number) spin_lock(&dq_data_lock); /* Release reserved dquots */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] != NODQUOT) + if (inode->i_dquot[cnt]) dquot_free_reserved_space(inode->i_dquot[cnt], number); } spin_unlock(&dq_data_lock); @@ -1576,7 +1578,7 @@ out_sub: } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] == NODQUOT) + if (!inode->i_dquot[cnt]) continue; warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); dquot_decr_space(inode->i_dquot[cnt], number); @@ -1614,7 +1616,7 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] == NODQUOT) + if (!inode->i_dquot[cnt]) continue; warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); dquot_decr_inodes(inode->i_dquot[cnt], number); @@ -1667,8 +1669,8 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) return QUOTA_OK; /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - transfer_from[cnt] = NODQUOT; - transfer_to[cnt] = NODQUOT; + transfer_from[cnt] = NULL; + transfer_to[cnt] = NULL; warntype_to[cnt] = QUOTA_NL_NOWARN; switch (cnt) { case USRQUOTA: @@ -1696,7 +1698,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (transfer_to[cnt] == NODQUOT) + if (!transfer_to[cnt]) continue; transfer_from[cnt] = inode->i_dquot[cnt]; if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == @@ -1712,7 +1714,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) /* * Skip changes for same uid or gid or for turned off quota-type. 
*/ - if (transfer_to[cnt] == NODQUOT) + if (!transfer_to[cnt]) continue; /* Due to IO error we might not have transfer_from[] structure */ @@ -1743,7 +1745,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (transfer_to[cnt]) { mark_dquot_dirty(transfer_to[cnt]); /* The reference we got is transferred to the inode */ - transfer_to[cnt] = NODQUOT; + transfer_to[cnt] = NULL; } } warn_put_all: @@ -1761,7 +1763,7 @@ over_quota: up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Clear dquot pointers we don't want to dqput() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) - transfer_from[cnt] = NODQUOT; + transfer_from[cnt] = NULL; ret = NO_QUOTA; goto warn_put_all; } @@ -2256,7 +2258,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d struct dquot *dquot; dquot = dqget(sb, id, type); - if (dquot == NODQUOT) + if (!dquot) return -ESRCH; do_get_dqblk(dquot, di); dqput(dquot); diff --git a/include/linux/quota.h b/include/linux/quota.h index a510d91561f4..78c48895b12a 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -277,8 +277,6 @@ struct dquot { struct mem_dqblk dq_dqb; /* Diskquota usage */ }; -#define NODQUOT (struct dquot *)NULL - #define QUOTA_OK 0 #define NO_QUOTA 1 -- cgit v1.2.3-71-gd317 From bf84c82d000b9820b01f516d13d328f354f8a8ee Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Jan 2009 17:37:01 +0100 Subject: quota: Remove uppercase aliases for quota functions. Since all users have been converted, remove uppercase names of quota functions. Signed-off-by: Jan Kara --- include/linux/quotaops.h | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 69b502e5eba0..36353d95c8db 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -454,35 +454,4 @@ static inline void vfs_dq_free_block(struct inode *inode, qsize_t nr) vfs_dq_free_space(inode, nr << inode->i_blkbits); } -/* - * Define uppercase equivalents for compatibility with old function names - * Can go away when we think all users have been converted (15/04/2008) - */ -#define DQUOT_INIT(inode) vfs_dq_init(inode) -#define DQUOT_DROP(inode) vfs_dq_drop(inode) -#define DQUOT_PREALLOC_SPACE_NODIRTY(inode, nr) \ - vfs_dq_prealloc_space_nodirty(inode, nr) -#define DQUOT_PREALLOC_SPACE(inode, nr) vfs_dq_prealloc_space(inode, nr) -#define DQUOT_ALLOC_SPACE_NODIRTY(inode, nr) \ - vfs_dq_alloc_space_nodirty(inode, nr) -#define DQUOT_ALLOC_SPACE(inode, nr) vfs_dq_alloc_space(inode, nr) -#define DQUOT_PREALLOC_BLOCK_NODIRTY(inode, nr) \ - vfs_dq_prealloc_block_nodirty(inode, nr) -#define DQUOT_PREALLOC_BLOCK(inode, nr) vfs_dq_prealloc_block(inode, nr) -#define DQUOT_ALLOC_BLOCK_NODIRTY(inode, nr) \ - vfs_dq_alloc_block_nodirty(inode, nr) -#define DQUOT_ALLOC_BLOCK(inode, nr) vfs_dq_alloc_block(inode, nr) -#define DQUOT_ALLOC_INODE(inode) vfs_dq_alloc_inode(inode) -#define DQUOT_FREE_SPACE_NODIRTY(inode, nr) \ - vfs_dq_free_space_nodirty(inode, nr) -#define DQUOT_FREE_SPACE(inode, nr) vfs_dq_free_space(inode, nr) -#define DQUOT_FREE_BLOCK_NODIRTY(inode, nr) \ - vfs_dq_free_block_nodirty(inode, nr) -#define DQUOT_FREE_BLOCK(inode, nr) vfs_dq_free_block(inode, nr) -#define DQUOT_FREE_INODE(inode) vfs_dq_free_inode(inode) -#define DQUOT_TRANSFER(inode, iattr) vfs_dq_transfer(inode, iattr) -#define DQUOT_SYNC(sb) vfs_dq_sync(sb) -#define DQUOT_OFF(sb, remount) vfs_dq_off(sb, remount) -#define DQUOT_ON_REMOUNT(sb) vfs_dq_quota_on_remount(sb) - #endif /* 
_LINUX_QUOTAOPS_ */ -- cgit v1.2.3-71-gd317 From 7058548cd50e5bda8db086bb2e5c1d82f746d047 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 25 Mar 2009 23:35:46 -0400 Subject: ext4: Use WRITE_SYNC for commits which are caused by fsync() If a commit is triggered by fsync(), set a flag indicating the journal blocks associated with the transaction should be flushed out using WRITE_SYNC. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 5 ++++- fs/jbd2/transaction.c | 2 ++ include/linux/jbd2.h | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 62804e57a44c..4ea72377c7a2 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + if (commit_transaction->t_synchronous_commit) + write_op = WRITE_SYNC; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, @@ -680,7 +683,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE, bh); + submit_bh(write_op, bh); } cond_resched(); stats.u.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 28ce21d8598e..996ffda06bf3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle) } } + if (handle->h_sync) + transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 4d248b3f1323..8815a3456b3b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -648,6 +648,12 @@ struct transaction_s */ int t_handle_count; + /* + * This transaction is being forced and some process is + * waiting for it to finish. + */ + int t_synchronous_commit:1; + /* * For use by the filesystem to store fs-specific data * structures associated with the transaction -- cgit v1.2.3-71-gd317 From 0f571515c332e00b3515dbe0859ceaa30ab66e00 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Fri, 6 Mar 2009 20:07:14 +0900 Subject: dmaengine: Add privatecnt to revert DMA_PRIVATE property Currently dma_request_channel() set DMA_PRIVATE capability but never clear it. So if a public channel was once grabbed by dma_request_channel(), the device stay PRIVATE forever. Add privatecnt member to dma_device to correctly revert it. 
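For illustration only (not part of the patch; the helper names below are hypothetical): a minimal sketch of how a client driver's request/release cycle interacts with the new privatecnt counter, assuming a DMA_MEMCPY-capable provider. It uses only the existing dmaengine entry points touched by this change, dma_request_channel() and dma_release_channel().

	/* Illustrative sketch, not part of the patch. */
	#include <linux/dmaengine.h>

	static struct dma_chan *example_grab_chan(void)
	{
		dma_cap_mask_t mask;

		dma_cap_zero(mask);
		dma_cap_set(DMA_MEMCPY, mask);

		/* Sets DMA_PRIVATE on the providing device and, with this
		 * patch, also increments privatecnt. May return NULL. */
		return dma_request_channel(mask, NULL, NULL);
	}

	static void example_drop_chan(struct dma_chan *chan)
	{
		/* Decrements privatecnt; when it reaches zero, DMA_PRIVATE is
		 * cleared and the device is returned to the general-purpose
		 * allocator instead of staying private forever. */
		dma_release_channel(chan);
	}
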
[lg@denx.de: fix bad usage of 'chan' in dma_async_device_register] Signed-off-by: Atsushi Nemoto Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 8 ++++++++ include/linux/dmaengine.h | 9 +++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index a41d1ea10fa3..92438e9dacc3 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -507,6 +507,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v * published in the general-purpose allocator */ dma_cap_set(DMA_PRIVATE, device->cap_mask); + device->privatecnt++; err = dma_chan_get(chan); if (err == -ENODEV) { @@ -518,6 +519,8 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v dma_chan_name(chan), err); else break; + if (--device->privatecnt == 0) + dma_cap_clear(DMA_PRIVATE, device->cap_mask); chan->private = NULL; chan = NULL; } @@ -537,6 +540,9 @@ void dma_release_channel(struct dma_chan *chan) WARN_ONCE(chan->client_count != 1, "chan reference count %d != 1\n", chan->client_count); dma_chan_put(chan); + /* drop PRIVATE cap enabled by __dma_request_channel() */ + if (--chan->device->privatecnt == 0) + dma_cap_clear(DMA_PRIVATE, chan->device->cap_mask); chan->private = NULL; mutex_unlock(&dma_list_mutex); } @@ -719,6 +725,8 @@ int dma_async_device_register(struct dma_device *device) } } list_add_tail_rcu(&device->global_node, &dma_device_list); + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + device->privatecnt++; /* Always private */ dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 2afc2c95e42d..2e2aa3df170c 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -202,6 +202,7 @@ struct dma_async_tx_descriptor { /** * struct dma_device - info on the entity supplying DMA services * @chancnt: how many DMA channels are supported + * @privatecnt: how many DMA channels are requested by dma_request_channel * @channels: the list of struct dma_chan * @global_node: list_head for global dma_device_list * @cap_mask: one or more dma_capability flags @@ -224,6 +225,7 @@ struct dma_async_tx_descriptor { struct dma_device { unsigned int chancnt; + unsigned int privatecnt; struct list_head channels; struct list_head global_node; dma_cap_mask_t cap_mask; @@ -352,6 +354,13 @@ __dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) set_bit(tx_type, dstp->bits); } +#define dma_cap_clear(tx, mask) __dma_cap_clear((tx), &(mask)) +static inline void +__dma_cap_clear(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) +{ + clear_bit(tx_type, dstp->bits); +} + #define dma_cap_zero(mask) __dma_cap_zero(&(mask)) static inline void __dma_cap_zero(dma_cap_mask_t *dstp) { -- cgit v1.2.3-71-gd317 From 9d50638bae05ab7f62d700c9e4a83a1845cf9ef4 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 8 Feb 2009 11:00:25 +0530 Subject: unconditionally include asm/types.h from linux/types.h Reported-by: Sam Ravnborg Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- include/linux/types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index c30973ace890..fca82ed55f49 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -1,6 +1,8 @@ #ifndef _LINUX_TYPES_H #define _LINUX_TYPES_H +#include + #ifndef __ASSEMBLY__ #ifdef __KERNEL__ @@ -10,7 
+12,6 @@ #endif #include -#include #ifndef __KERNEL_STRICT_NAMES -- cgit v1.2.3-71-gd317 From 85efde6f4e0de9577256c5f0030088d3fd4347c1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Feb 2009 00:51:39 +0100 Subject: make exported headers use strict posix types A number of standard posix types are used in exported headers, which is not allowed if __STRICT_KERNEL_NAMES is defined. In order to get rid of the non-__STRICT_KERNEL_NAMES part and to make sane headers the default, we have to change them all to safe types. There are also still some leftovers in reiserfs_fs.h, elfcore.h and coda.h, but these files have not compiled in user space for a long time. This leaves out the various integer types ({u_,u,}int{8,16,32,64}_t), which we take care of separately. Signed-off-by: Arnd Bergmann Acked-by: Mauro Carvalho Chehab Cc: David Airlie Cc: Arnaldo Carvalho de Melo Cc: YOSHIFUJI Hideaki Cc: netdev@vger.kernel.org Cc: linux-ppp@vger.kernel.org Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: David Woodhouse Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- include/asm-generic/fcntl.h | 12 ++++++------ include/asm-generic/siginfo.h | 14 +++++++------- include/linux/agpgart.h | 14 +++++++------- include/linux/cn_proc.h | 20 ++++++++++---------- include/linux/cyclades.h | 6 +++--- include/linux/dvb/video.h | 2 +- include/linux/if_pppol2tp.h | 2 +- include/linux/mroute6.h | 2 +- include/linux/netfilter_ipv4/ipt_owner.h | 8 ++++---- include/linux/netfilter_ipv6/ip6t_owner.h | 8 ++++---- include/linux/ppp_defs.h | 4 ++-- include/linux/suspend_ioctls.h | 11 ++++++----- include/linux/time.h | 8 ++++---- include/linux/times.h | 8 ++++---- include/linux/utime.h | 4 ++-- include/linux/xfrm.h | 2 +- include/mtd/mtd-abi.h | 4 ++-- include/sound/asound.h | 4 ++-- 18 files changed, 67 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h index b8477414c5c8..4d3e48373e74 100644 --- a/include/asm-generic/fcntl.h +++ b/include/asm-generic/fcntl.h @@ -117,9 +117,9 @@ struct flock { short l_type; short l_whence; - off_t l_start; - off_t l_len; - pid_t l_pid; + __kernel_off_t l_start; + __kernel_off_t l_len; + __kernel_pid_t l_pid; __ARCH_FLOCK_PAD }; #endif @@ -140,9 +140,9 @@ struct flock { struct flock64 { short l_type; short l_whence; - loff_t l_start; - loff_t l_len; - pid_t l_pid; + __kernel_loff_t l_start; + __kernel_loff_t l_len; + __kernel_pid_t l_pid; __ARCH_FLOCK64_PAD }; #endif diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h index 969570167e9e..35752dadd6df 100644 --- a/include/asm-generic/siginfo.h +++ b/include/asm-generic/siginfo.h @@ -23,7 +23,7 @@ typedef union sigval { #endif #ifndef __ARCH_SI_UID_T -#define __ARCH_SI_UID_T uid_t +#define __ARCH_SI_UID_T __kernel_uid32_t #endif /* @@ -47,13 +47,13 @@ typedef struct siginfo { /* kill() */ struct { - pid_t _pid; /* sender's pid */ + __kernel_pid_t _pid; /* sender's pid */ __ARCH_SI_UID_T _uid; /* sender's uid */ } _kill; /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + __kernel_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; sigval_t _sigval; /* same as below */ @@ -62,18 +62,18 @@ typedef struct siginfo { /* POSIX.1b signals */ struct { - pid_t _pid; /* sender's pid */ + __kernel_pid_t _pid; /* sender's pid */ __ARCH_SI_UID_T _uid; /* sender's uid */ sigval_t _sigval; } _rt; /* SIGCHLD */ struct { - pid_t _pid; /* which child */ + __kernel_pid_t _pid; /* 
which child */ __ARCH_SI_UID_T _uid; /* sender's uid */ int _status; /* exit code */ - clock_t _utime; - clock_t _stime; + __kernel_clock_t _utime; + __kernel_clock_t _stime; } _sigchld; /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ diff --git a/include/linux/agpgart.h b/include/linux/agpgart.h index 110c600c885f..f6778eceb8f4 100644 --- a/include/linux/agpgart.h +++ b/include/linux/agpgart.h @@ -77,20 +77,20 @@ typedef struct _agp_setup { * The "prot" down below needs still a "sleep" flag somehow ... */ typedef struct _agp_segment { - off_t pg_start; /* starting page to populate */ - size_t pg_count; /* number of pages */ - int prot; /* prot flags for mmap */ + __kernel_off_t pg_start; /* starting page to populate */ + __kernel_size_t pg_count; /* number of pages */ + int prot; /* prot flags for mmap */ } agp_segment; typedef struct _agp_region { - pid_t pid; /* pid of process */ - size_t seg_count; /* number of segments */ + __kernel_pid_t pid; /* pid of process */ + __kernel_size_t seg_count; /* number of segments */ struct _agp_segment *seg_list; } agp_region; typedef struct _agp_allocate { int key; /* tag of allocation */ - size_t pg_count; /* number of pages */ + __kernel_size_t pg_count;/* number of pages */ __u32 type; /* 0 == normal, other devspec */ __u32 physical; /* device specific (some devices * need a phys address of the @@ -100,7 +100,7 @@ typedef struct _agp_allocate { typedef struct _agp_bind { int key; /* tag of allocation */ - off_t pg_start; /* starting page to populate */ + __kernel_off_t pg_start;/* starting page to populate */ } agp_bind; typedef struct _agp_unbind { diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h index 1c86d65bc4b9..b8125b2eb665 100644 --- a/include/linux/cn_proc.h +++ b/include/linux/cn_proc.h @@ -65,20 +65,20 @@ struct proc_event { } ack; struct fork_proc_event { - pid_t parent_pid; - pid_t parent_tgid; - pid_t child_pid; - pid_t child_tgid; + __kernel_pid_t parent_pid; + __kernel_pid_t parent_tgid; + __kernel_pid_t child_pid; + __kernel_pid_t child_tgid; } fork; struct exec_proc_event { - pid_t process_pid; - pid_t process_tgid; + __kernel_pid_t process_pid; + __kernel_pid_t process_tgid; } exec; struct id_proc_event { - pid_t process_pid; - pid_t process_tgid; + __kernel_pid_t process_pid; + __kernel_pid_t process_tgid; union { __u32 ruid; /* task uid */ __u32 rgid; /* task gid */ @@ -90,8 +90,8 @@ struct proc_event { } id; struct exit_proc_event { - pid_t process_pid; - pid_t process_tgid; + __kernel_pid_t process_pid; + __kernel_pid_t process_tgid; __u32 exit_code, exit_signal; } exit; } event_data; diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h index d06fbf286346..788850ba4e75 100644 --- a/include/linux/cyclades.h +++ b/include/linux/cyclades.h @@ -82,9 +82,9 @@ struct cyclades_monitor { * open) */ struct cyclades_idle_stats { - time_t in_use; /* Time device has been in use (secs) */ - time_t recv_idle; /* Time since last char received (secs) */ - time_t xmit_idle; /* Time since last char transmitted (secs) */ + __kernel_time_t in_use; /* Time device has been in use (secs) */ + __kernel_time_t recv_idle; /* Time since last char received (secs) */ + __kernel_time_t xmit_idle; /* Time since last char transmitted (secs) */ unsigned long recv_bytes; /* Bytes received */ unsigned long xmit_bytes; /* Bytes transmitted */ unsigned long overruns; /* Input overruns */ diff --git a/include/linux/dvb/video.h b/include/linux/dvb/video.h index bd49c3ebf916..ee5d2df2d78d 100644 --- a/include/linux/dvb/video.h +++ 
b/include/linux/dvb/video.h @@ -137,7 +137,7 @@ struct video_event { #define VIDEO_EVENT_FRAME_RATE_CHANGED 2 #define VIDEO_EVENT_DECODER_STOPPED 3 #define VIDEO_EVENT_VSYNC 4 - time_t timestamp; + __kernel_time_t timestamp; union { video_size_t size; unsigned int frame_rate; /* in frames per 1000sec */ diff --git a/include/linux/if_pppol2tp.h b/include/linux/if_pppol2tp.h index c7a66882b6d0..3a14b088c8ec 100644 --- a/include/linux/if_pppol2tp.h +++ b/include/linux/if_pppol2tp.h @@ -26,7 +26,7 @@ */ struct pppol2tp_addr { - pid_t pid; /* pid that owns the fd. + __kernel_pid_t pid; /* pid that owns the fd. * 0 => current */ int fd; /* FD of UDP socket to use */ diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 5375faca1f72..43dc97e32183 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -65,7 +65,7 @@ struct mif6ctl { mifi_t mif6c_mifi; /* Index of MIF */ unsigned char mif6c_flags; /* MIFF_ flags */ unsigned char vifc_threshold; /* ttl limit */ - u_short mif6c_pifi; /* the index of the physical IF */ + __u16 mif6c_pifi; /* the index of the physical IF */ unsigned int vifc_rate_limit; /* Rate limiter values (NI) */ }; diff --git a/include/linux/netfilter_ipv4/ipt_owner.h b/include/linux/netfilter_ipv4/ipt_owner.h index 92f4bdac54ef..a78445be9992 100644 --- a/include/linux/netfilter_ipv4/ipt_owner.h +++ b/include/linux/netfilter_ipv4/ipt_owner.h @@ -9,10 +9,10 @@ #define IPT_OWNER_COMM 0x10 struct ipt_owner_info { - uid_t uid; - gid_t gid; - pid_t pid; - pid_t sid; + __kernel_uid32_t uid; + __kernel_gid32_t gid; + __kernel_pid_t pid; + __kernel_pid_t sid; char comm[16]; u_int8_t match, invert; /* flags */ }; diff --git a/include/linux/netfilter_ipv6/ip6t_owner.h b/include/linux/netfilter_ipv6/ip6t_owner.h index 19937da3d101..ec5cc7a38c42 100644 --- a/include/linux/netfilter_ipv6/ip6t_owner.h +++ b/include/linux/netfilter_ipv6/ip6t_owner.h @@ -8,10 +8,10 @@ #define IP6T_OWNER_SID 0x08 struct ip6t_owner_info { - uid_t uid; - gid_t gid; - pid_t pid; - pid_t sid; + __kernel_uid32_t uid; + __kernel_gid32_t gid; + __kernel_pid_t pid; + __kernel_pid_t sid; u_int8_t match, invert; /* flags */ }; diff --git a/include/linux/ppp_defs.h b/include/linux/ppp_defs.h index 1c866bda2018..0f93ed6b4a88 100644 --- a/include/linux/ppp_defs.h +++ b/include/linux/ppp_defs.h @@ -177,8 +177,8 @@ struct ppp_comp_stats { * the last NP packet was sent or received. 
*/ struct ppp_idle { - time_t xmit_idle; /* time since last NP packet sent */ - time_t recv_idle; /* time since last NP packet received */ + __kernel_time_t xmit_idle; /* time since last NP packet sent */ + __kernel_time_t recv_idle; /* time since last NP packet received */ }; #endif /* _PPP_DEFS_H_ */ diff --git a/include/linux/suspend_ioctls.h b/include/linux/suspend_ioctls.h index 2c6faec96bde..0b30382984fe 100644 --- a/include/linux/suspend_ioctls.h +++ b/include/linux/suspend_ioctls.h @@ -1,14 +1,15 @@ #ifndef _LINUX_SUSPEND_IOCTLS_H #define _LINUX_SUSPEND_IOCTLS_H +#include /* * This structure is used to pass the values needed for the identification * of the resume swap area from a user space to the kernel via the * SNAPSHOT_SET_SWAP_AREA ioctl */ struct resume_swap_area { - loff_t offset; - u_int32_t dev; + __kernel_loff_t offset; + __u32 dev; } __attribute__((packed)); #define SNAPSHOT_IOC_MAGIC '3' @@ -20,13 +21,13 @@ struct resume_swap_area { #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) #define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \ struct resume_swap_area) -#define SNAPSHOT_GET_IMAGE_SIZE _IOR(SNAPSHOT_IOC_MAGIC, 14, loff_t) +#define SNAPSHOT_GET_IMAGE_SIZE _IOR(SNAPSHOT_IOC_MAGIC, 14, __kernel_loff_t) #define SNAPSHOT_PLATFORM_SUPPORT _IO(SNAPSHOT_IOC_MAGIC, 15) #define SNAPSHOT_POWER_OFF _IO(SNAPSHOT_IOC_MAGIC, 16) #define SNAPSHOT_CREATE_IMAGE _IOW(SNAPSHOT_IOC_MAGIC, 17, int) #define SNAPSHOT_PREF_IMAGE_SIZE _IO(SNAPSHOT_IOC_MAGIC, 18) -#define SNAPSHOT_AVAIL_SWAP_SIZE _IOR(SNAPSHOT_IOC_MAGIC, 19, loff_t) -#define SNAPSHOT_ALLOC_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 20, loff_t) +#define SNAPSHOT_AVAIL_SWAP_SIZE _IOR(SNAPSHOT_IOC_MAGIC, 19, __kernel_loff_t) +#define SNAPSHOT_ALLOC_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 20, __kernel_loff_t) #define SNAPSHOT_IOC_MAXNR 20 #endif /* _LINUX_SUSPEND_IOCTLS_H */ diff --git a/include/linux/time.h b/include/linux/time.h index fbbd2a1c92ba..242f62499bb7 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -12,14 +12,14 @@ #ifndef _STRUCT_TIMESPEC #define _STRUCT_TIMESPEC struct timespec { - time_t tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ + __kernel_time_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ }; #endif struct timeval { - time_t tv_sec; /* seconds */ - suseconds_t tv_usec; /* microseconds */ + __kernel_time_t tv_sec; /* seconds */ + __kernel_suseconds_t tv_usec; /* microseconds */ }; struct timezone { diff --git a/include/linux/times.h b/include/linux/times.h index e2d3020742a6..87b62615cedd 100644 --- a/include/linux/times.h +++ b/include/linux/times.h @@ -4,10 +4,10 @@ #include struct tms { - clock_t tms_utime; - clock_t tms_stime; - clock_t tms_cutime; - clock_t tms_cstime; + __kernel_clock_t tms_utime; + __kernel_clock_t tms_stime; + __kernel_clock_t tms_cutime; + __kernel_clock_t tms_cstime; }; #endif diff --git a/include/linux/utime.h b/include/linux/utime.h index 640be6a1959e..5cdf673afbdb 100644 --- a/include/linux/utime.h +++ b/include/linux/utime.h @@ -4,8 +4,8 @@ #include struct utimbuf { - time_t actime; - time_t modtime; + __kernel_time_t actime; + __kernel_time_t modtime; }; #endif diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 52f3abd453a1..2d4ec15abaca 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -58,7 +58,7 @@ struct xfrm_selector __u8 prefixlen_s; __u8 proto; int ifindex; - uid_t user; + __kernel_uid32_t user; }; #define XFRM_INF (~(__u64)0) diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h index 
c6c61cd5a254..fb672013299c 100644 --- a/include/mtd/mtd-abi.h +++ b/include/mtd/mtd-abi.h @@ -84,8 +84,8 @@ struct otp_info { #define MEMGETREGIONINFO _IOWR('M', 8, struct region_info_user) #define MEMSETOOBSEL _IOW('M', 9, struct nand_oobinfo) #define MEMGETOOBSEL _IOR('M', 10, struct nand_oobinfo) -#define MEMGETBADBLOCK _IOW('M', 11, loff_t) -#define MEMSETBADBLOCK _IOW('M', 12, loff_t) +#define MEMGETBADBLOCK _IOW('M', 11, __kernel_loff_t) +#define MEMSETBADBLOCK _IOW('M', 12, __kernel_loff_t) #define OTPSELECT _IOR('M', 13, int) #define OTPGETREGIONCOUNT _IOW('M', 14, int) #define OTPGETREGIONINFO _IOW('M', 15, struct otp_info) diff --git a/include/sound/asound.h b/include/sound/asound.h index 1c02ed1d7c4a..16684c5a608c 100644 --- a/include/sound/asound.h +++ b/include/sound/asound.h @@ -385,7 +385,7 @@ struct snd_pcm_sw_params { struct snd_pcm_channel_info { unsigned int channel; - off_t offset; /* mmap offset */ + __kernel_off_t offset; /* mmap offset */ unsigned int first; /* offset to first sample in bits */ unsigned int step; /* samples distance in bits */ }; @@ -789,7 +789,7 @@ struct snd_ctl_elem_info { snd_ctl_elem_type_t type; /* R: value type - SNDRV_CTL_ELEM_TYPE_* */ unsigned int access; /* R: value access (bitmask) - SNDRV_CTL_ELEM_ACCESS_* */ unsigned int count; /* count of values */ - pid_t owner; /* owner's PID of this control */ + __kernel_pid_t owner; /* owner's PID of this control */ union { struct { long min; /* R: minimum value */ -- cgit v1.2.3-71-gd317 From 9adfbfb611307060db54691bc7e6d53fdc12312b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Feb 2009 00:51:40 +0100 Subject: make most exported headers use strict integer types This takes care of all files that have only a small number of non-strict integer type uses. Signed-off-by: Arnd Bergmann Cc: Mauro Carvalho Chehab Cc: David Airlie Cc: Arnaldo Carvalho de Melo Cc: YOSHIFUJI Hideaki Cc: netdev@vger.kernel.org Cc: linux-ppp@vger.kernel.org Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: David Woodhouse Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar --- include/linux/atmlec.h | 5 +- include/linux/atmmpc.h | 45 ++++---- include/linux/cm4000_cs.h | 10 +- include/linux/dlm_netlink.h | 18 +-- include/linux/dm-ioctl.h | 42 +++---- include/linux/dvb/audio.h | 2 +- include/linux/dvb/video.h | 22 ++-- include/linux/if_arcnet.h | 27 ++--- include/linux/ip_vs.h | 26 ++--- include/linux/ivtvfb.h | 2 +- include/linux/matroxfb.h | 2 +- include/linux/pfkeyv2.h | 242 ++++++++++++++++++++-------------------- include/linux/selinux_netlink.h | 6 +- include/sound/asound.h | 5 +- include/sound/emu10k1.h | 12 +- 15 files changed, 240 insertions(+), 226 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atmlec.h b/include/linux/atmlec.h index 6f5a1bab8f50..39c917fd1b96 100644 --- a/include/linux/atmlec.h +++ b/include/linux/atmlec.h @@ -11,6 +11,7 @@ #include #include #include +#include /* ATM lec daemon control socket */ #define ATMLEC_CTRL _IO('a', ATMIOC_LANE) @@ -78,8 +79,8 @@ struct atmlec_msg { } normal; struct atmlec_config_msg config; struct { - uint16_t lec_id; /* requestor lec_id */ - uint32_t tran_id; /* transaction id */ + __u16 lec_id; /* requestor lec_id */ + __u32 tran_id; /* transaction id */ unsigned char mac_addr[ETH_ALEN]; /* dst mac addr */ unsigned char atm_addr[ATM_ESA_LEN]; /* reqestor ATM addr */ } proxy; /* diff --git a/include/linux/atmmpc.h b/include/linux/atmmpc.h index ea1650425a12..2aba5787fa63 100644 --- a/include/linux/atmmpc.h +++ b/include/linux/atmmpc.h @@ -4,6 +4,7 @@ #include #include #include +#include #define ATMMPC_CTRL _IO('a', ATMIOC_MPOA) #define ATMMPC_DATA _IO('a', ATMIOC_MPOA+1) @@ -18,39 +19,39 @@ struct atmmpc_ioc { }; typedef struct in_ctrl_info { - uint8_t Last_NHRP_CIE_code; - uint8_t Last_Q2931_cause_value; - uint8_t eg_MPC_ATM_addr[ATM_ESA_LEN]; + __u8 Last_NHRP_CIE_code; + __u8 Last_Q2931_cause_value; + __u8 eg_MPC_ATM_addr[ATM_ESA_LEN]; __be32 tag; __be32 in_dst_ip; /* IP address this ingress MPC sends packets to */ - uint16_t holding_time; - uint32_t request_id; + __u16 holding_time; + __u32 request_id; } in_ctrl_info; typedef struct eg_ctrl_info { - uint8_t DLL_header[256]; - uint8_t DH_length; + __u8 DLL_header[256]; + __u8 DH_length; __be32 cache_id; __be32 tag; __be32 mps_ip; __be32 eg_dst_ip; /* IP address to which ingress MPC sends packets */ - uint8_t in_MPC_data_ATM_addr[ATM_ESA_LEN]; - uint16_t holding_time; + __u8 in_MPC_data_ATM_addr[ATM_ESA_LEN]; + __u16 holding_time; } eg_ctrl_info; struct mpc_parameters { - uint16_t mpc_p1; /* Shortcut-Setup Frame Count */ - uint16_t mpc_p2; /* Shortcut-Setup Frame Time */ - uint8_t mpc_p3[8]; /* Flow-detection Protocols */ - uint16_t mpc_p4; /* MPC Initial Retry Time */ - uint16_t mpc_p5; /* MPC Retry Time Maximum */ - uint16_t mpc_p6; /* Hold Down Time */ + __u16 mpc_p1; /* Shortcut-Setup Frame Count */ + __u16 mpc_p2; /* Shortcut-Setup Frame Time */ + __u8 mpc_p3[8]; /* Flow-detection Protocols */ + __u16 mpc_p4; /* MPC Initial Retry Time */ + __u16 mpc_p5; /* MPC Retry Time Maximum */ + __u16 mpc_p6; /* Hold Down Time */ } ; struct k_message { - uint16_t type; + __u16 type; __be32 ip_mask; - uint8_t MPS_ctrl[ATM_ESA_LEN]; + __u8 MPS_ctrl[ATM_ESA_LEN]; union { in_ctrl_info in_info; eg_ctrl_info eg_info; @@ -61,11 +62,11 @@ struct k_message { struct llc_snap_hdr { /* RFC 1483 LLC/SNAP encapsulation for routed IP PDUs */ - uint8_t dsap; /* Destination Service Access Point (0xAA) */ - uint8_t ssap; /* Source Service Access Point (0xAA) */ - uint8_t ui; /* Unnumbered Information (0x03) */ - uint8_t 
org[3]; /* Organizational identification (0x000000) */ - uint8_t type[2]; /* Ether type (for IP) (0x0800) */ + __u8 dsap; /* Destination Service Access Point (0xAA) */ + __u8 ssap; /* Source Service Access Point (0xAA) */ + __u8 ui; /* Unnumbered Information (0x03) */ + __u8 org[3]; /* Organizational identification (0x000000) */ + __u8 type[2]; /* Ether type (for IP) (0x0800) */ }; /* TLVs this MPC recognizes */ diff --git a/include/linux/cm4000_cs.h b/include/linux/cm4000_cs.h index 605ebe24bb2e..72bfefdbd767 100644 --- a/include/linux/cm4000_cs.h +++ b/include/linux/cm4000_cs.h @@ -1,6 +1,8 @@ #ifndef _CM4000_H_ #define _CM4000_H_ +#include + #define MAX_ATR 33 #define CM4000_MAX_DEV 4 @@ -10,9 +12,9 @@ * not to break compilation of userspace apps. -HW */ typedef struct atreq { - int32_t atr_len; + __s32 atr_len; unsigned char atr[64]; - int32_t power_act; + __s32 power_act; unsigned char bIFSD; unsigned char bIFSC; } atreq_t; @@ -22,13 +24,13 @@ typedef struct atreq { * member sizes. This leads to CONFIG_COMPAT breakage, since 32bit userspace * will lay out the structure members differently than the 64bit kernel. * - * I've changed "ptsreq.protocol" from "unsigned long" to "u_int32_t". + * I've changed "ptsreq.protocol" from "unsigned long" to "__u32". * On 32bit this will make no difference. With 64bit kernels, it will make * 32bit apps work, too. */ typedef struct ptsreq { - u_int32_t protocol; /*T=0: 2^0, T=1: 2^1*/ + __u32 protocol; /*T=0: 2^0, T=1: 2^1*/ unsigned char flags; unsigned char pts1; unsigned char pts2; diff --git a/include/linux/dlm_netlink.h b/include/linux/dlm_netlink.h index 19276332707a..647c8ef27227 100644 --- a/include/linux/dlm_netlink.h +++ b/include/linux/dlm_netlink.h @@ -9,6 +9,8 @@ #ifndef _DLM_NETLINK_H #define _DLM_NETLINK_H +#include + enum { DLM_STATUS_WAITING = 1, DLM_STATUS_GRANTED = 2, @@ -18,16 +20,16 @@ enum { #define DLM_LOCK_DATA_VERSION 1 struct dlm_lock_data { - uint16_t version; - uint32_t lockspace_id; + __u16 version; + __u32 lockspace_id; int nodeid; int ownpid; - uint32_t id; - uint32_t remid; - uint64_t xid; - int8_t status; - int8_t grmode; - int8_t rqmode; + __u32 id; + __u32 remid; + __u64 xid; + __s8 status; + __s8 grmode; + __s8 rqmode; unsigned long timestamp; int resource_namelen; char resource_name[DLM_RESNAME_MAXLEN]; diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 28c2940eb30d..48e44ee2b466 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -113,20 +113,20 @@ struct dm_ioctl { * return -ENOTTY) fill out this field, even if the * command failed. */ - uint32_t version[3]; /* in/out */ - uint32_t data_size; /* total size of data passed in + __u32 version[3]; /* in/out */ + __u32 data_size; /* total size of data passed in * including this struct */ - uint32_t data_start; /* offset to start of data + __u32 data_start; /* offset to start of data * relative to start of this struct */ - uint32_t target_count; /* in/out */ - int32_t open_count; /* out */ - uint32_t flags; /* in/out */ - uint32_t event_nr; /* in/out */ - uint32_t padding; + __u32 target_count; /* in/out */ + __s32 open_count; /* out */ + __u32 flags; /* in/out */ + __u32 event_nr; /* in/out */ + __u32 padding; - uint64_t dev; /* in/out */ + __u64 dev; /* in/out */ char name[DM_NAME_LEN]; /* device name */ char uuid[DM_UUID_LEN]; /* unique identifier for @@ -139,9 +139,9 @@ struct dm_ioctl { * dm_ioctl. 
*/ struct dm_target_spec { - uint64_t sector_start; - uint64_t length; - int32_t status; /* used when reading from kernel only */ + __u64 sector_start; + __u64 length; + __s32 status; /* used when reading from kernel only */ /* * Location of the next dm_target_spec. @@ -153,7 +153,7 @@ struct dm_target_spec { * (that follows the dm_ioctl struct) to the start of the "next" * dm_target_spec. */ - uint32_t next; + __u32 next; char target_type[DM_MAX_TYPE_NAME]; @@ -168,17 +168,17 @@ struct dm_target_spec { * Used to retrieve the target dependencies. */ struct dm_target_deps { - uint32_t count; /* Array size */ - uint32_t padding; /* unused */ - uint64_t dev[0]; /* out */ + __u32 count; /* Array size */ + __u32 padding; /* unused */ + __u64 dev[0]; /* out */ }; /* * Used to get a list of all dm devices. */ struct dm_name_list { - uint64_t dev; - uint32_t next; /* offset to the next record from + __u64 dev; + __u32 next; /* offset to the next record from the _start_ of this */ char name[0]; }; @@ -187,8 +187,8 @@ struct dm_name_list { * Used to retrieve the target versions */ struct dm_target_versions { - uint32_t next; - uint32_t version[3]; + __u32 next; + __u32 version[3]; char name[0]; }; @@ -197,7 +197,7 @@ struct dm_target_versions { * Used to pass message to a target */ struct dm_target_msg { - uint64_t sector; /* Device sector */ + __u64 sector; /* Device sector */ char message[0]; }; diff --git a/include/linux/dvb/audio.h b/include/linux/dvb/audio.h index bb0df2aaebfa..fec66bd24f22 100644 --- a/include/linux/dvb/audio.h +++ b/include/linux/dvb/audio.h @@ -76,7 +76,7 @@ struct audio_karaoke{ /* if Vocal1 or Vocal2 are non-zero, they get mixed */ } audio_karaoke_t; /* into left and right */ -typedef uint16_t audio_attributes_t; +typedef __u16 audio_attributes_t; /* bits: descr. 
*/ /* 15-13 audio coding mode (0=ac3, 2=mpeg1, 3=mpeg2ext, 4=LPCM, 6=DTS, */ /* 12 multichannel extension */ diff --git a/include/linux/dvb/video.h b/include/linux/dvb/video.h index ee5d2df2d78d..1d750c0fd86e 100644 --- a/include/linux/dvb/video.h +++ b/include/linux/dvb/video.h @@ -132,7 +132,7 @@ struct video_command { #define VIDEO_VSYNC_FIELD_PROGRESSIVE (3) struct video_event { - int32_t type; + __s32 type; #define VIDEO_EVENT_SIZE_CHANGED 1 #define VIDEO_EVENT_FRAME_RATE_CHANGED 2 #define VIDEO_EVENT_DECODER_STOPPED 3 @@ -157,25 +157,25 @@ struct video_status { struct video_still_picture { char __user *iFrame; /* pointer to a single iframe in memory */ - int32_t size; + __s32 size; }; typedef struct video_highlight { int active; /* 1=show highlight, 0=hide highlight */ - uint8_t contrast1; /* 7- 4 Pattern pixel contrast */ + __u8 contrast1; /* 7- 4 Pattern pixel contrast */ /* 3- 0 Background pixel contrast */ - uint8_t contrast2; /* 7- 4 Emphasis pixel-2 contrast */ + __u8 contrast2; /* 7- 4 Emphasis pixel-2 contrast */ /* 3- 0 Emphasis pixel-1 contrast */ - uint8_t color1; /* 7- 4 Pattern pixel color */ + __u8 color1; /* 7- 4 Pattern pixel color */ /* 3- 0 Background pixel color */ - uint8_t color2; /* 7- 4 Emphasis pixel-2 color */ + __u8 color2; /* 7- 4 Emphasis pixel-2 color */ /* 3- 0 Emphasis pixel-1 color */ - uint32_t ypos; /* 23-22 auto action mode */ + __u32 ypos; /* 23-22 auto action mode */ /* 21-12 start y */ /* 9- 0 end y */ - uint32_t xpos; /* 23-22 button color number */ + __u32 xpos; /* 23-22 button color number */ /* 21-12 start x */ /* 9- 0 end x */ } video_highlight_t; @@ -189,17 +189,17 @@ typedef struct video_spu { typedef struct video_spu_palette { /* SPU Palette information */ int length; - uint8_t __user *palette; + __u8 __user *palette; } video_spu_palette_t; typedef struct video_navi_pack { int length; /* 0 ... 1024 */ - uint8_t data[1024]; + __u8 data[1024]; } video_navi_pack_t; -typedef uint16_t video_attributes_t; +typedef __u16 video_attributes_t; /* bits: descr. */ /* 15-14 Video compression mode (0=MPEG-1, 1=MPEG-2) */ /* 13-12 TV system (0=525/60, 1=625/50) */ diff --git a/include/linux/if_arcnet.h b/include/linux/if_arcnet.h index 27ea2ac445ad..0835debab115 100644 --- a/include/linux/if_arcnet.h +++ b/include/linux/if_arcnet.h @@ -16,6 +16,7 @@ #ifndef _LINUX_IF_ARCNET_H #define _LINUX_IF_ARCNET_H +#include #include @@ -57,10 +58,10 @@ */ struct arc_rfc1201 { - uint8_t proto; /* protocol ID field - varies */ - uint8_t split_flag; /* for use with split packets */ + __u8 proto; /* protocol ID field - varies */ + __u8 split_flag; /* for use with split packets */ __be16 sequence; /* sequence number */ - uint8_t payload[0]; /* space remaining in packet (504 bytes)*/ + __u8 payload[0]; /* space remaining in packet (504 bytes)*/ }; #define RFC1201_HDR_SIZE 4 @@ -70,8 +71,8 @@ struct arc_rfc1201 */ struct arc_rfc1051 { - uint8_t proto; /* ARC_P_RFC1051_ARP/RFC1051_IP */ - uint8_t payload[0]; /* 507 bytes */ + __u8 proto; /* ARC_P_RFC1051_ARP/RFC1051_IP */ + __u8 payload[0]; /* 507 bytes */ }; #define RFC1051_HDR_SIZE 1 @@ -82,20 +83,20 @@ struct arc_rfc1051 */ struct arc_eth_encap { - uint8_t proto; /* Always ARC_P_ETHER */ + __u8 proto; /* Always ARC_P_ETHER */ struct ethhdr eth; /* standard ethernet header (yuck!) 
*/ - uint8_t payload[0]; /* 493 bytes */ + __u8 payload[0]; /* 493 bytes */ }; #define ETH_ENCAP_HDR_SIZE 14 struct arc_cap { - uint8_t proto; - uint8_t cookie[sizeof(int)]; /* Actually NOT sent over the network */ + __u8 proto; + __u8 cookie[sizeof(int)]; /* Actually NOT sent over the network */ union { - uint8_t ack; - uint8_t raw[0]; /* 507 bytes */ + __u8 ack; + __u8 raw[0]; /* 507 bytes */ } mes; }; @@ -109,7 +110,7 @@ struct arc_cap */ struct arc_hardware { - uint8_t source, /* source ARCnet - filled in automagically */ + __u8 source, /* source ARCnet - filled in automagically */ dest, /* destination ARCnet - 0 for broadcast */ offset[2]; /* offset bytes (some weird semantics) */ }; @@ -130,7 +131,7 @@ struct archdr struct arc_rfc1051 rfc1051; struct arc_eth_encap eth_encap; struct arc_cap cap; - uint8_t raw[0]; /* 508 bytes */ + __u8 raw[0]; /* 508 bytes */ } soft; }; diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h index 0f434a28fb58..148265e63e8d 100644 --- a/include/linux/ip_vs.h +++ b/include/linux/ip_vs.h @@ -96,10 +96,10 @@ */ struct ip_vs_service_user { /* virtual service addresses */ - u_int16_t protocol; + __u16 protocol; __be32 addr; /* virtual ip address */ __be16 port; - u_int32_t fwmark; /* firwall mark of service */ + __u32 fwmark; /* firwall mark of service */ /* virtual service options */ char sched_name[IP_VS_SCHEDNAME_MAXLEN]; @@ -119,8 +119,8 @@ struct ip_vs_dest_user { int weight; /* destination weight */ /* thresholds for active connections */ - u_int32_t u_threshold; /* upper threshold */ - u_int32_t l_threshold; /* lower threshold */ + __u32 u_threshold; /* upper threshold */ + __u32 l_threshold; /* lower threshold */ }; @@ -159,10 +159,10 @@ struct ip_vs_getinfo { /* The argument to IP_VS_SO_GET_SERVICE */ struct ip_vs_service_entry { /* which service: user fills in these */ - u_int16_t protocol; + __u16 protocol; __be32 addr; /* virtual address */ __be16 port; - u_int32_t fwmark; /* firwall mark of service */ + __u32 fwmark; /* firwall mark of service */ /* service options */ char sched_name[IP_VS_SCHEDNAME_MAXLEN]; @@ -184,12 +184,12 @@ struct ip_vs_dest_entry { unsigned conn_flags; /* connection flags */ int weight; /* destination weight */ - u_int32_t u_threshold; /* upper threshold */ - u_int32_t l_threshold; /* lower threshold */ + __u32 u_threshold; /* upper threshold */ + __u32 l_threshold; /* lower threshold */ - u_int32_t activeconns; /* active connections */ - u_int32_t inactconns; /* inactive connections */ - u_int32_t persistconns; /* persistent connections */ + __u32 activeconns; /* active connections */ + __u32 inactconns; /* inactive connections */ + __u32 persistconns; /* persistent connections */ /* statistics */ struct ip_vs_stats_user stats; @@ -199,10 +199,10 @@ struct ip_vs_dest_entry { /* The argument to IP_VS_SO_GET_DESTS */ struct ip_vs_get_dests { /* which service: user fills in these */ - u_int16_t protocol; + __u16 protocol; __be32 addr; /* virtual address */ __be16 port; - u_int32_t fwmark; /* firwall mark of service */ + __u32 fwmark; /* firwall mark of service */ /* number of real servers */ unsigned int num_dests; diff --git a/include/linux/ivtvfb.h b/include/linux/ivtvfb.h index e20af47b59ad..9d88b29ddf55 100644 --- a/include/linux/ivtvfb.h +++ b/include/linux/ivtvfb.h @@ -33,6 +33,6 @@ struct ivtvfb_dma_frame { }; #define IVTVFB_IOC_DMA_FRAME _IOW('V', BASE_VIDIOC_PRIVATE+0, struct ivtvfb_dma_frame) -#define FBIO_WAITFORVSYNC _IOW('F', 0x20, u_int32_t) +#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32) #endif diff 
--git a/include/linux/matroxfb.h b/include/linux/matroxfb.h index 404f678e734b..2203121a43e9 100644 --- a/include/linux/matroxfb.h +++ b/include/linux/matroxfb.h @@ -37,7 +37,7 @@ enum matroxfb_ctrl_id { MATROXFB_CID_LAST }; -#define FBIO_WAITFORVSYNC _IOW('F', 0x20, u_int32_t) +#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32) #endif diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h index 01b262959f2e..228b0b6306b0 100644 --- a/include/linux/pfkeyv2.h +++ b/include/linux/pfkeyv2.h @@ -12,187 +12,187 @@ #define PFKEYV2_REVISION 199806L struct sadb_msg { - uint8_t sadb_msg_version; - uint8_t sadb_msg_type; - uint8_t sadb_msg_errno; - uint8_t sadb_msg_satype; - uint16_t sadb_msg_len; - uint16_t sadb_msg_reserved; - uint32_t sadb_msg_seq; - uint32_t sadb_msg_pid; + __u8 sadb_msg_version; + __u8 sadb_msg_type; + __u8 sadb_msg_errno; + __u8 sadb_msg_satype; + __u16 sadb_msg_len; + __u16 sadb_msg_reserved; + __u32 sadb_msg_seq; + __u32 sadb_msg_pid; } __attribute__((packed)); /* sizeof(struct sadb_msg) == 16 */ struct sadb_ext { - uint16_t sadb_ext_len; - uint16_t sadb_ext_type; + __u16 sadb_ext_len; + __u16 sadb_ext_type; } __attribute__((packed)); /* sizeof(struct sadb_ext) == 4 */ struct sadb_sa { - uint16_t sadb_sa_len; - uint16_t sadb_sa_exttype; + __u16 sadb_sa_len; + __u16 sadb_sa_exttype; __be32 sadb_sa_spi; - uint8_t sadb_sa_replay; - uint8_t sadb_sa_state; - uint8_t sadb_sa_auth; - uint8_t sadb_sa_encrypt; - uint32_t sadb_sa_flags; + __u8 sadb_sa_replay; + __u8 sadb_sa_state; + __u8 sadb_sa_auth; + __u8 sadb_sa_encrypt; + __u32 sadb_sa_flags; } __attribute__((packed)); /* sizeof(struct sadb_sa) == 16 */ struct sadb_lifetime { - uint16_t sadb_lifetime_len; - uint16_t sadb_lifetime_exttype; - uint32_t sadb_lifetime_allocations; - uint64_t sadb_lifetime_bytes; - uint64_t sadb_lifetime_addtime; - uint64_t sadb_lifetime_usetime; + __u16 sadb_lifetime_len; + __u16 sadb_lifetime_exttype; + __u32 sadb_lifetime_allocations; + __u64 sadb_lifetime_bytes; + __u64 sadb_lifetime_addtime; + __u64 sadb_lifetime_usetime; } __attribute__((packed)); /* sizeof(struct sadb_lifetime) == 32 */ struct sadb_address { - uint16_t sadb_address_len; - uint16_t sadb_address_exttype; - uint8_t sadb_address_proto; - uint8_t sadb_address_prefixlen; - uint16_t sadb_address_reserved; + __u16 sadb_address_len; + __u16 sadb_address_exttype; + __u8 sadb_address_proto; + __u8 sadb_address_prefixlen; + __u16 sadb_address_reserved; } __attribute__((packed)); /* sizeof(struct sadb_address) == 8 */ struct sadb_key { - uint16_t sadb_key_len; - uint16_t sadb_key_exttype; - uint16_t sadb_key_bits; - uint16_t sadb_key_reserved; + __u16 sadb_key_len; + __u16 sadb_key_exttype; + __u16 sadb_key_bits; + __u16 sadb_key_reserved; } __attribute__((packed)); /* sizeof(struct sadb_key) == 8 */ struct sadb_ident { - uint16_t sadb_ident_len; - uint16_t sadb_ident_exttype; - uint16_t sadb_ident_type; - uint16_t sadb_ident_reserved; - uint64_t sadb_ident_id; + __u16 sadb_ident_len; + __u16 sadb_ident_exttype; + __u16 sadb_ident_type; + __u16 sadb_ident_reserved; + __u64 sadb_ident_id; } __attribute__((packed)); /* sizeof(struct sadb_ident) == 16 */ struct sadb_sens { - uint16_t sadb_sens_len; - uint16_t sadb_sens_exttype; - uint32_t sadb_sens_dpd; - uint8_t sadb_sens_sens_level; - uint8_t sadb_sens_sens_len; - uint8_t sadb_sens_integ_level; - uint8_t sadb_sens_integ_len; - uint32_t sadb_sens_reserved; + __u16 sadb_sens_len; + __u16 sadb_sens_exttype; + __u32 sadb_sens_dpd; + __u8 sadb_sens_sens_level; + __u8 sadb_sens_sens_len; + 
__u8 sadb_sens_integ_level; + __u8 sadb_sens_integ_len; + __u32 sadb_sens_reserved; } __attribute__((packed)); /* sizeof(struct sadb_sens) == 16 */ /* followed by: - uint64_t sadb_sens_bitmap[sens_len]; - uint64_t sadb_integ_bitmap[integ_len]; */ + __u64 sadb_sens_bitmap[sens_len]; + __u64 sadb_integ_bitmap[integ_len]; */ struct sadb_prop { - uint16_t sadb_prop_len; - uint16_t sadb_prop_exttype; - uint8_t sadb_prop_replay; - uint8_t sadb_prop_reserved[3]; + __u16 sadb_prop_len; + __u16 sadb_prop_exttype; + __u8 sadb_prop_replay; + __u8 sadb_prop_reserved[3]; } __attribute__((packed)); /* sizeof(struct sadb_prop) == 8 */ /* followed by: struct sadb_comb sadb_combs[(sadb_prop_len + - sizeof(uint64_t) - sizeof(struct sadb_prop)) / + sizeof(__u64) - sizeof(struct sadb_prop)) / sizeof(struct sadb_comb)]; */ struct sadb_comb { - uint8_t sadb_comb_auth; - uint8_t sadb_comb_encrypt; - uint16_t sadb_comb_flags; - uint16_t sadb_comb_auth_minbits; - uint16_t sadb_comb_auth_maxbits; - uint16_t sadb_comb_encrypt_minbits; - uint16_t sadb_comb_encrypt_maxbits; - uint32_t sadb_comb_reserved; - uint32_t sadb_comb_soft_allocations; - uint32_t sadb_comb_hard_allocations; - uint64_t sadb_comb_soft_bytes; - uint64_t sadb_comb_hard_bytes; - uint64_t sadb_comb_soft_addtime; - uint64_t sadb_comb_hard_addtime; - uint64_t sadb_comb_soft_usetime; - uint64_t sadb_comb_hard_usetime; + __u8 sadb_comb_auth; + __u8 sadb_comb_encrypt; + __u16 sadb_comb_flags; + __u16 sadb_comb_auth_minbits; + __u16 sadb_comb_auth_maxbits; + __u16 sadb_comb_encrypt_minbits; + __u16 sadb_comb_encrypt_maxbits; + __u32 sadb_comb_reserved; + __u32 sadb_comb_soft_allocations; + __u32 sadb_comb_hard_allocations; + __u64 sadb_comb_soft_bytes; + __u64 sadb_comb_hard_bytes; + __u64 sadb_comb_soft_addtime; + __u64 sadb_comb_hard_addtime; + __u64 sadb_comb_soft_usetime; + __u64 sadb_comb_hard_usetime; } __attribute__((packed)); /* sizeof(struct sadb_comb) == 72 */ struct sadb_supported { - uint16_t sadb_supported_len; - uint16_t sadb_supported_exttype; - uint32_t sadb_supported_reserved; + __u16 sadb_supported_len; + __u16 sadb_supported_exttype; + __u32 sadb_supported_reserved; } __attribute__((packed)); /* sizeof(struct sadb_supported) == 8 */ /* followed by: struct sadb_alg sadb_algs[(sadb_supported_len + - sizeof(uint64_t) - sizeof(struct sadb_supported)) / + sizeof(__u64) - sizeof(struct sadb_supported)) / sizeof(struct sadb_alg)]; */ struct sadb_alg { - uint8_t sadb_alg_id; - uint8_t sadb_alg_ivlen; - uint16_t sadb_alg_minbits; - uint16_t sadb_alg_maxbits; - uint16_t sadb_alg_reserved; + __u8 sadb_alg_id; + __u8 sadb_alg_ivlen; + __u16 sadb_alg_minbits; + __u16 sadb_alg_maxbits; + __u16 sadb_alg_reserved; } __attribute__((packed)); /* sizeof(struct sadb_alg) == 8 */ struct sadb_spirange { - uint16_t sadb_spirange_len; - uint16_t sadb_spirange_exttype; - uint32_t sadb_spirange_min; - uint32_t sadb_spirange_max; - uint32_t sadb_spirange_reserved; + __u16 sadb_spirange_len; + __u16 sadb_spirange_exttype; + __u32 sadb_spirange_min; + __u32 sadb_spirange_max; + __u32 sadb_spirange_reserved; } __attribute__((packed)); /* sizeof(struct sadb_spirange) == 16 */ struct sadb_x_kmprivate { - uint16_t sadb_x_kmprivate_len; - uint16_t sadb_x_kmprivate_exttype; - uint32_t sadb_x_kmprivate_reserved; + __u16 sadb_x_kmprivate_len; + __u16 sadb_x_kmprivate_exttype; + __u32 sadb_x_kmprivate_reserved; } __attribute__((packed)); /* sizeof(struct sadb_x_kmprivate) == 8 */ struct sadb_x_sa2 { - uint16_t sadb_x_sa2_len; - uint16_t sadb_x_sa2_exttype; - uint8_t 
sadb_x_sa2_mode; - uint8_t sadb_x_sa2_reserved1; - uint16_t sadb_x_sa2_reserved2; - uint32_t sadb_x_sa2_sequence; - uint32_t sadb_x_sa2_reqid; + __u16 sadb_x_sa2_len; + __u16 sadb_x_sa2_exttype; + __u8 sadb_x_sa2_mode; + __u8 sadb_x_sa2_reserved1; + __u16 sadb_x_sa2_reserved2; + __u32 sadb_x_sa2_sequence; + __u32 sadb_x_sa2_reqid; } __attribute__((packed)); /* sizeof(struct sadb_x_sa2) == 16 */ struct sadb_x_policy { - uint16_t sadb_x_policy_len; - uint16_t sadb_x_policy_exttype; - uint16_t sadb_x_policy_type; - uint8_t sadb_x_policy_dir; - uint8_t sadb_x_policy_reserved; - uint32_t sadb_x_policy_id; - uint32_t sadb_x_policy_priority; + __u16 sadb_x_policy_len; + __u16 sadb_x_policy_exttype; + __u16 sadb_x_policy_type; + __u8 sadb_x_policy_dir; + __u8 sadb_x_policy_reserved; + __u32 sadb_x_policy_id; + __u32 sadb_x_policy_priority; } __attribute__((packed)); /* sizeof(struct sadb_x_policy) == 16 */ struct sadb_x_ipsecrequest { - uint16_t sadb_x_ipsecrequest_len; - uint16_t sadb_x_ipsecrequest_proto; - uint8_t sadb_x_ipsecrequest_mode; - uint8_t sadb_x_ipsecrequest_level; - uint16_t sadb_x_ipsecrequest_reserved1; - uint32_t sadb_x_ipsecrequest_reqid; - uint32_t sadb_x_ipsecrequest_reserved2; + __u16 sadb_x_ipsecrequest_len; + __u16 sadb_x_ipsecrequest_proto; + __u8 sadb_x_ipsecrequest_mode; + __u8 sadb_x_ipsecrequest_level; + __u16 sadb_x_ipsecrequest_reserved1; + __u32 sadb_x_ipsecrequest_reqid; + __u32 sadb_x_ipsecrequest_reserved2; } __attribute__((packed)); /* sizeof(struct sadb_x_ipsecrequest) == 16 */ @@ -200,38 +200,38 @@ struct sadb_x_ipsecrequest { * type of NAT-T is supported, draft-ietf-ipsec-udp-encaps-06 */ struct sadb_x_nat_t_type { - uint16_t sadb_x_nat_t_type_len; - uint16_t sadb_x_nat_t_type_exttype; - uint8_t sadb_x_nat_t_type_type; - uint8_t sadb_x_nat_t_type_reserved[3]; + __u16 sadb_x_nat_t_type_len; + __u16 sadb_x_nat_t_type_exttype; + __u8 sadb_x_nat_t_type_type; + __u8 sadb_x_nat_t_type_reserved[3]; } __attribute__((packed)); /* sizeof(struct sadb_x_nat_t_type) == 8 */ /* Pass a NAT Traversal port (Source or Dest port) */ struct sadb_x_nat_t_port { - uint16_t sadb_x_nat_t_port_len; - uint16_t sadb_x_nat_t_port_exttype; + __u16 sadb_x_nat_t_port_len; + __u16 sadb_x_nat_t_port_exttype; __be16 sadb_x_nat_t_port_port; - uint16_t sadb_x_nat_t_port_reserved; + __u16 sadb_x_nat_t_port_reserved; } __attribute__((packed)); /* sizeof(struct sadb_x_nat_t_port) == 8 */ /* Generic LSM security context */ struct sadb_x_sec_ctx { - uint16_t sadb_x_sec_len; - uint16_t sadb_x_sec_exttype; - uint8_t sadb_x_ctx_alg; /* LSMs: e.g., selinux == 1 */ - uint8_t sadb_x_ctx_doi; - uint16_t sadb_x_ctx_len; + __u16 sadb_x_sec_len; + __u16 sadb_x_sec_exttype; + __u8 sadb_x_ctx_alg; /* LSMs: e.g., selinux == 1 */ + __u8 sadb_x_ctx_doi; + __u16 sadb_x_ctx_len; } __attribute__((packed)); /* sizeof(struct sadb_sec_ctx) = 8 */ /* Used by MIGRATE to pass addresses IKE will use to perform * negotiation with the peer */ struct sadb_x_kmaddress { - uint16_t sadb_x_kmaddress_len; - uint16_t sadb_x_kmaddress_exttype; - uint32_t sadb_x_kmaddress_reserved; + __u16 sadb_x_kmaddress_len; + __u16 sadb_x_kmaddress_exttype; + __u32 sadb_x_kmaddress_reserved; } __attribute__((packed)); /* sizeof(struct sadb_x_kmaddress) == 8 */ diff --git a/include/linux/selinux_netlink.h b/include/linux/selinux_netlink.h index bbf489decd84..d239797785cf 100644 --- a/include/linux/selinux_netlink.h +++ b/include/linux/selinux_netlink.h @@ -12,6 +12,8 @@ #ifndef _LINUX_SELINUX_NETLINK_H #define _LINUX_SELINUX_NETLINK_H +#include + 
/* Message types. */ #define SELNL_MSG_BASE 0x10 enum { @@ -38,11 +40,11 @@ enum selinux_nlgroups { /* Message structures */ struct selnl_msg_setenforce { - int32_t val; + __s32 val; }; struct selnl_msg_policyload { - u_int32_t seqno; + __u32 seqno; }; #endif /* _LINUX_SELINUX_NETLINK_H */ diff --git a/include/sound/asound.h b/include/sound/asound.h index 16684c5a608c..d9beda5f74a7 100644 --- a/include/sound/asound.h +++ b/include/sound/asound.h @@ -23,9 +23,10 @@ #ifndef __SOUND_ASOUND_H #define __SOUND_ASOUND_H +#include + #ifdef __KERNEL__ #include -#include #include #include @@ -342,7 +343,7 @@ struct snd_interval { #define SNDRV_MASK_MAX 256 struct snd_mask { - u_int32_t bits[(SNDRV_MASK_MAX+31)/32]; + __u32 bits[(SNDRV_MASK_MAX+31)/32]; }; struct snd_pcm_hw_params { diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h index 10ee28eac018..c380056ff26d 100644 --- a/include/sound/emu10k1.h +++ b/include/sound/emu10k1.h @@ -1,6 +1,8 @@ #ifndef __SOUND_EMU10K1_H #define __SOUND_EMU10K1_H +#include + /* * Copyright (c) by Jaroslav Kysela , * Creative Labs, Inc. @@ -34,6 +36,8 @@ #include #include #include +#include + #include /* ------------------- DEFINES -------------------- */ @@ -2171,7 +2175,7 @@ struct snd_emu10k1_fx8010_code { char name[128]; DECLARE_BITMAP(gpr_valid, 0x200); /* bitmask of valid initializers */ - u_int32_t __user *gpr_map; /* initializers */ + __u32 __user *gpr_map; /* initializers */ unsigned int gpr_add_control_count; /* count of GPR controls to add/replace */ struct snd_emu10k1_fx8010_control_gpr __user *gpr_add_controls; /* GPR controls to add/replace */ @@ -2184,11 +2188,11 @@ struct snd_emu10k1_fx8010_code { struct snd_emu10k1_fx8010_control_gpr __user *gpr_list_controls; /* listed GPR controls */ DECLARE_BITMAP(tram_valid, 0x100); /* bitmask of valid initializers */ - u_int32_t __user *tram_data_map; /* data initializers */ - u_int32_t __user *tram_addr_map; /* map initializers */ + __u32 __user *tram_data_map; /* data initializers */ + __u32 __user *tram_addr_map; /* map initializers */ DECLARE_BITMAP(code_valid, 1024); /* bitmask of valid instructions */ - u_int32_t __user *code; /* one instruction - 64 bits */ + __u32 __user *code; /* one instruction - 64 bits */ }; struct snd_emu10k1_fx8010_tram { -- cgit v1.2.3-71-gd317 From ccef7ab534347e2e1e1ef398d2ec987d37e519f3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Feb 2009 00:51:41 +0100 Subject: make MTD headers use strict integer types The MTD headers traditionally use stdint types rather than the kernel integer types. This converts them to do the same as all the others. Cc: David Woodhouse Signed-off-by: Arnd Bergmann Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- include/linux/jffs2.h | 27 +++++++++--------- include/mtd/inftl-user.h | 36 ++++++++++++------------ include/mtd/jffs2-user.h | 5 ++-- include/mtd/mtd-abi.h | 66 +++++++++++++++++++++++--------------------- include/mtd/nftl-user.h | 32 +++++++++++---------- include/mtd/ubi-user.h | 72 +++++++++++++++++++++++++----------------------- 6 files changed, 123 insertions(+), 115 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jffs2.h b/include/linux/jffs2.h index da720bc3eb15..2b32d638147d 100644 --- a/include/linux/jffs2.h +++ b/include/linux/jffs2.h @@ -12,6 +12,7 @@ #ifndef __LINUX_JFFS2_H__ #define __LINUX_JFFS2_H__ +#include #include /* You must include something which defines the C99 uintXX_t types. 
@@ -91,15 +92,15 @@ byteswapping */ typedef struct { - uint32_t v32; + __u32 v32; } __attribute__((packed)) jint32_t; typedef struct { - uint32_t m; + __u32 m; } __attribute__((packed)) jmode_t; typedef struct { - uint16_t v16; + __u16 v16; } __attribute__((packed)) jint16_t; struct jffs2_unknown_node @@ -121,12 +122,12 @@ struct jffs2_raw_dirent jint32_t version; jint32_t ino; /* == zero for unlink */ jint32_t mctime; - uint8_t nsize; - uint8_t type; - uint8_t unused[2]; + __u8 nsize; + __u8 type; + __u8 unused[2]; jint32_t node_crc; jint32_t name_crc; - uint8_t name[0]; + __u8 name[0]; }; /* The JFFS2 raw inode structure: Used for storage on physical media. */ @@ -153,12 +154,12 @@ struct jffs2_raw_inode jint32_t offset; /* Where to begin to write. */ jint32_t csize; /* (Compressed) data size */ jint32_t dsize; /* Size of the node's data. (after decompression) */ - uint8_t compr; /* Compression algorithm used */ - uint8_t usercompr; /* Compression algorithm requested by the user */ + __u8 compr; /* Compression algorithm used */ + __u8 usercompr; /* Compression algorithm requested by the user */ jint16_t flags; /* See JFFS2_INO_FLAG_* */ jint32_t data_crc; /* CRC for the (compressed) data. */ jint32_t node_crc; /* CRC for the raw inode (excluding data) */ - uint8_t data[0]; + __u8 data[0]; }; struct jffs2_raw_xattr { @@ -168,12 +169,12 @@ struct jffs2_raw_xattr { jint32_t hdr_crc; jint32_t xid; /* XATTR identifier number */ jint32_t version; - uint8_t xprefix; - uint8_t name_len; + __u8 xprefix; + __u8 name_len; jint16_t value_len; jint32_t data_crc; jint32_t node_crc; - uint8_t data[0]; + __u8 data[0]; } __attribute__((packed)); struct jffs2_raw_xref diff --git a/include/mtd/inftl-user.h b/include/mtd/inftl-user.h index d409d489d900..8376bd1a9e01 100644 --- a/include/mtd/inftl-user.h +++ b/include/mtd/inftl-user.h @@ -16,33 +16,33 @@ /* Block Control Information */ struct inftl_bci { - uint8_t ECCsig[6]; - uint8_t Status; - uint8_t Status1; + __u8 ECCsig[6]; + __u8 Status; + __u8 Status1; } __attribute__((packed)); struct inftl_unithead1 { - uint16_t virtualUnitNo; - uint16_t prevUnitNo; - uint8_t ANAC; - uint8_t NACs; - uint8_t parityPerField; - uint8_t discarded; + __u16 virtualUnitNo; + __u16 prevUnitNo; + __u8 ANAC; + __u8 NACs; + __u8 parityPerField; + __u8 discarded; } __attribute__((packed)); struct inftl_unithead2 { - uint8_t parityPerField; - uint8_t ANAC; - uint16_t prevUnitNo; - uint16_t virtualUnitNo; - uint8_t NACs; - uint8_t discarded; + __u8 parityPerField; + __u8 ANAC; + __u16 prevUnitNo; + __u16 virtualUnitNo; + __u8 NACs; + __u8 discarded; } __attribute__((packed)); struct inftl_unittail { - uint8_t Reserved[4]; - uint16_t EraseMark; - uint16_t EraseMark1; + __u8 Reserved[4]; + __u16 EraseMark; + __u16 EraseMark1; } __attribute__((packed)); union inftl_uci { diff --git a/include/mtd/jffs2-user.h b/include/mtd/jffs2-user.h index 001685d7fa88..fa94b0eb67c1 100644 --- a/include/mtd/jffs2-user.h +++ b/include/mtd/jffs2-user.h @@ -7,6 +7,7 @@ /* This file is blessed for inclusion by userspace */ #include +#include #include #include @@ -19,8 +20,8 @@ extern int target_endian; -#define t16(x) ({ uint16_t __b = (x); (target_endian==__BYTE_ORDER)?__b:bswap_16(__b); }) -#define t32(x) ({ uint32_t __b = (x); (target_endian==__BYTE_ORDER)?__b:bswap_32(__b); }) +#define t16(x) ({ __u16 __b = (x); (target_endian==__BYTE_ORDER)?__b:bswap_16(__b); }) +#define t32(x) ({ __u32 __b = (x); (target_endian==__BYTE_ORDER)?__b:bswap_32(__b); }) #define cpu_to_je16(x) ((jint16_t){t16(x)}) 
#define cpu_to_je32(x) ((jint32_t){t32(x)}) diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h index fb672013299c..b6595b3c68b6 100644 --- a/include/mtd/mtd-abi.h +++ b/include/mtd/mtd-abi.h @@ -5,14 +5,16 @@ #ifndef __MTD_ABI_H__ #define __MTD_ABI_H__ +#include + struct erase_info_user { - uint32_t start; - uint32_t length; + __u32 start; + __u32 length; }; struct mtd_oob_buf { - uint32_t start; - uint32_t length; + __u32 start; + __u32 length; unsigned char __user *ptr; }; @@ -48,30 +50,30 @@ struct mtd_oob_buf { #define MTD_OTP_USER 2 struct mtd_info_user { - uint8_t type; - uint32_t flags; - uint32_t size; // Total size of the MTD - uint32_t erasesize; - uint32_t writesize; - uint32_t oobsize; // Amount of OOB data per block (e.g. 16) + __u8 type; + __u32 flags; + __u32 size; // Total size of the MTD + __u32 erasesize; + __u32 writesize; + __u32 oobsize; // Amount of OOB data per block (e.g. 16) /* The below two fields are obsolete and broken, do not use them * (TODO: remove at some point) */ - uint32_t ecctype; - uint32_t eccsize; + __u32 ecctype; + __u32 eccsize; }; struct region_info_user { - uint32_t offset; /* At which this region starts, + __u32 offset; /* At which this region starts, * from the beginning of the MTD */ - uint32_t erasesize; /* For this region */ - uint32_t numblocks; /* Number of blocks in this region */ - uint32_t regionindex; + __u32 erasesize; /* For this region */ + __u32 numblocks; /* Number of blocks in this region */ + __u32 regionindex; }; struct otp_info { - uint32_t start; - uint32_t length; - uint32_t locked; + __u32 start; + __u32 length; + __u32 locked; }; #define MEMGETINFO _IOR('M', 1, struct mtd_info_user) @@ -99,15 +101,15 @@ struct otp_info { * interfaces */ struct nand_oobinfo { - uint32_t useecc; - uint32_t eccbytes; - uint32_t oobfree[8][2]; - uint32_t eccpos[32]; + __u32 useecc; + __u32 eccbytes; + __u32 oobfree[8][2]; + __u32 eccpos[32]; }; struct nand_oobfree { - uint32_t offset; - uint32_t length; + __u32 offset; + __u32 length; }; #define MTD_MAX_OOBFREE_ENTRIES 8 @@ -116,9 +118,9 @@ struct nand_oobfree { * diagnosis and to allow creation of raw images */ struct nand_ecclayout { - uint32_t eccbytes; - uint32_t eccpos[64]; - uint32_t oobavail; + __u32 eccbytes; + __u32 eccpos[64]; + __u32 oobavail; struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES]; }; @@ -131,10 +133,10 @@ struct nand_ecclayout { * @bbtblocks: number of blocks reserved for bad block tables */ struct mtd_ecc_stats { - uint32_t corrected; - uint32_t failed; - uint32_t badblocks; - uint32_t bbtblocks; + __u32 corrected; + __u32 failed; + __u32 badblocks; + __u32 bbtblocks; }; /* diff --git a/include/mtd/nftl-user.h b/include/mtd/nftl-user.h index 390d21c080aa..98e9e57f22de 100644 --- a/include/mtd/nftl-user.h +++ b/include/mtd/nftl-user.h @@ -6,33 +6,35 @@ #ifndef __MTD_NFTL_USER_H__ #define __MTD_NFTL_USER_H__ +#include + /* Block Control Information */ struct nftl_bci { unsigned char ECCSig[6]; - uint8_t Status; - uint8_t Status1; + __u8 Status; + __u8 Status1; }__attribute__((packed)); /* Unit Control Information */ struct nftl_uci0 { - uint16_t VirtUnitNum; - uint16_t ReplUnitNum; - uint16_t SpareVirtUnitNum; - uint16_t SpareReplUnitNum; + __u16 VirtUnitNum; + __u16 ReplUnitNum; + __u16 SpareVirtUnitNum; + __u16 SpareReplUnitNum; } __attribute__((packed)); struct nftl_uci1 { - uint32_t WearInfo; - uint16_t EraseMark; - uint16_t EraseMark1; + __u32 WearInfo; + __u16 EraseMark; + __u16 EraseMark1; } __attribute__((packed)); struct nftl_uci2 { - uint16_t 
FoldMark; - uint16_t FoldMark1; - uint32_t unused; + __u16 FoldMark; + __u16 FoldMark1; + __u32 unused; } __attribute__((packed)); union nftl_uci { @@ -50,9 +52,9 @@ struct nftl_oob { struct NFTLMediaHeader { char DataOrgID[6]; - uint16_t NumEraseUnits; - uint16_t FirstPhysicalEUN; - uint32_t FormattedSize; + __u16 NumEraseUnits; + __u16 FirstPhysicalEUN; + __u32 FormattedSize; unsigned char UnitSizeFactor; } __attribute__((packed)); diff --git a/include/mtd/ubi-user.h b/include/mtd/ubi-user.h index 296efae3525e..466a8320f1e6 100644 --- a/include/mtd/ubi-user.h +++ b/include/mtd/ubi-user.h @@ -21,6 +21,8 @@ #ifndef __UBI_USER_H__ #define __UBI_USER_H__ +#include + /* * UBI device creation (the same as MTD device attachment) * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -152,7 +154,7 @@ /* Create an UBI volume */ #define UBI_IOCMKVOL _IOW(UBI_IOC_MAGIC, 0, struct ubi_mkvol_req) /* Remove an UBI volume */ -#define UBI_IOCRMVOL _IOW(UBI_IOC_MAGIC, 1, int32_t) +#define UBI_IOCRMVOL _IOW(UBI_IOC_MAGIC, 1, __s32) /* Re-size an UBI volume */ #define UBI_IOCRSVOL _IOW(UBI_IOC_MAGIC, 2, struct ubi_rsvol_req) /* Re-name volumes */ @@ -165,24 +167,24 @@ /* Attach an MTD device */ #define UBI_IOCATT _IOW(UBI_CTRL_IOC_MAGIC, 64, struct ubi_attach_req) /* Detach an MTD device */ -#define UBI_IOCDET _IOW(UBI_CTRL_IOC_MAGIC, 65, int32_t) +#define UBI_IOCDET _IOW(UBI_CTRL_IOC_MAGIC, 65, __s32) /* ioctl commands of UBI volume character devices */ #define UBI_VOL_IOC_MAGIC 'O' /* Start UBI volume update */ -#define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, int64_t) +#define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, __s64) /* LEB erasure command, used for debugging, disabled by default */ -#define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, int32_t) +#define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, __s32) /* Atomic LEB change command */ -#define UBI_IOCEBCH _IOW(UBI_VOL_IOC_MAGIC, 2, int32_t) +#define UBI_IOCEBCH _IOW(UBI_VOL_IOC_MAGIC, 2, __s32) /* Map LEB command */ #define UBI_IOCEBMAP _IOW(UBI_VOL_IOC_MAGIC, 3, struct ubi_map_req) /* Unmap LEB command */ -#define UBI_IOCEBUNMAP _IOW(UBI_VOL_IOC_MAGIC, 4, int32_t) +#define UBI_IOCEBUNMAP _IOW(UBI_VOL_IOC_MAGIC, 4, __s32) /* Check if LEB is mapped command */ -#define UBI_IOCEBISMAP _IOR(UBI_VOL_IOC_MAGIC, 5, int32_t) +#define UBI_IOCEBISMAP _IOR(UBI_VOL_IOC_MAGIC, 5, __s32) /* Set an UBI volume property */ #define UBI_IOCSETPROP _IOW(UBI_VOL_IOC_MAGIC, 6, struct ubi_set_prop_req) @@ -260,10 +262,10 @@ enum { * sub-page of the first page and add needed padding. */ struct ubi_attach_req { - int32_t ubi_num; - int32_t mtd_num; - int32_t vid_hdr_offset; - int8_t padding[12]; + __s32 ubi_num; + __s32 mtd_num; + __s32 vid_hdr_offset; + __s8 padding[12]; }; /** @@ -298,13 +300,13 @@ struct ubi_attach_req { * BLOBs, without caring about how to properly align them. */ struct ubi_mkvol_req { - int32_t vol_id; - int32_t alignment; - int64_t bytes; - int8_t vol_type; - int8_t padding1; - int16_t name_len; - int8_t padding2[4]; + __s32 vol_id; + __s32 alignment; + __s64 bytes; + __s8 vol_type; + __s8 padding1; + __s16 name_len; + __s8 padding2[4]; char name[UBI_MAX_VOLUME_NAME + 1]; } __attribute__ ((packed)); @@ -320,8 +322,8 @@ struct ubi_mkvol_req { * zero number of bytes). */ struct ubi_rsvol_req { - int64_t bytes; - int32_t vol_id; + __s64 bytes; + __s32 vol_id; } __attribute__ ((packed)); /** @@ -356,12 +358,12 @@ struct ubi_rsvol_req { * re-name request. 
*/ struct ubi_rnvol_req { - int32_t count; - int8_t padding1[12]; + __s32 count; + __s8 padding1[12]; struct { - int32_t vol_id; - int16_t name_len; - int8_t padding2[2]; + __s32 vol_id; + __s16 name_len; + __s8 padding2[2]; char name[UBI_MAX_VOLUME_NAME + 1]; } ents[UBI_MAX_RNVOL]; } __attribute__ ((packed)); @@ -375,10 +377,10 @@ struct ubi_rnvol_req { * @padding: reserved for future, not used, has to be zeroed */ struct ubi_leb_change_req { - int32_t lnum; - int32_t bytes; - int8_t dtype; - int8_t padding[7]; + __s32 lnum; + __s32 bytes; + __s8 dtype; + __s8 padding[7]; } __attribute__ ((packed)); /** @@ -388,9 +390,9 @@ struct ubi_leb_change_req { * @padding: reserved for future, not used, has to be zeroed */ struct ubi_map_req { - int32_t lnum; - int8_t dtype; - int8_t padding[3]; + __s32 lnum; + __s8 dtype; + __s8 padding[3]; } __attribute__ ((packed)); @@ -402,9 +404,9 @@ struct ubi_map_req { * @value: value to set */ struct ubi_set_prop_req { - uint8_t property; - uint8_t padding[7]; - uint64_t value; + __u8 property; + __u8 padding[7]; + __u64 value; } __attribute__ ((packed)); #endif /* __UBI_USER_H__ */ -- cgit v1.2.3-71-gd317 From 60c195c729532815c5209c81442fa0eb26ace706 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Feb 2009 00:51:43 +0100 Subject: make netfilter use strict integer types Netfilter traditionally uses BSD integer types in its interface headers. This changes it to use the Linux strict integer types, like everyone else. Cc: netfilter-devel@vger.kernel.org Signed-off-by: Arnd Bergmann Acked-by: David S. Miller Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- include/linux/netfilter/nf_conntrack_tcp.h | 6 +++-- include/linux/netfilter/nfnetlink.h | 4 ++-- include/linux/netfilter/nfnetlink_compat.h | 7 ++++-- include/linux/netfilter/nfnetlink_log.h | 32 +++++++++++++------------- include/linux/netfilter/nfnetlink_queue.h | 24 ++++++++++---------- include/linux/netfilter/x_tables.h | 30 +++++++++++++------------ include/linux/netfilter/xt_CLASSIFY.h | 4 +++- include/linux/netfilter/xt_CONNMARK.h | 8 ++++--- include/linux/netfilter/xt_CONNSECMARK.h | 4 +++- include/linux/netfilter/xt_DSCP.h | 7 +++--- include/linux/netfilter/xt_MARK.h | 6 +++-- include/linux/netfilter/xt_NFLOG.h | 12 +++++----- include/linux/netfilter/xt_NFQUEUE.h | 4 +++- include/linux/netfilter/xt_RATEEST.h | 6 +++-- include/linux/netfilter/xt_SECMARK.h | 6 +++-- include/linux/netfilter/xt_TCPMSS.h | 4 +++- include/linux/netfilter/xt_connbytes.h | 6 +++-- include/linux/netfilter/xt_connmark.h | 8 ++++--- include/linux/netfilter/xt_conntrack.h | 12 +++++----- include/linux/netfilter/xt_dccp.h | 14 +++++++----- include/linux/netfilter/xt_dscp.h | 12 +++++----- include/linux/netfilter/xt_esp.h | 6 +++-- include/linux/netfilter/xt_hashlimit.h | 32 +++++++++++++------------- include/linux/netfilter/xt_iprange.h | 4 +++- include/linux/netfilter/xt_length.h | 6 +++-- include/linux/netfilter/xt_limit.h | 10 +++++---- include/linux/netfilter/xt_mark.h | 8 ++++--- include/linux/netfilter/xt_multiport.h | 18 ++++++++------- include/linux/netfilter/xt_owner.h | 8 ++++--- include/linux/netfilter/xt_physdev.h | 6 +++-- include/linux/netfilter/xt_policy.h | 14 +++++++----- include/linux/netfilter/xt_rateest.h | 14 +++++++----- include/linux/netfilter/xt_realm.h | 8 ++++--- include/linux/netfilter/xt_recent.h | 12 +++++----- include/linux/netfilter/xt_sctp.h | 36 ++++++++++++++++-------------- include/linux/netfilter/xt_statistic.h | 14 +++++++----- include/linux/netfilter/xt_string.h | 12 
+++++----- include/linux/netfilter/xt_tcpmss.h | 6 +++-- include/linux/netfilter/xt_tcpudp.h | 20 +++++++++-------- 39 files changed, 260 insertions(+), 190 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h index a049df4f2236..3066789b972a 100644 --- a/include/linux/netfilter/nf_conntrack_tcp.h +++ b/include/linux/netfilter/nf_conntrack_tcp.h @@ -2,6 +2,8 @@ #define _NF_CONNTRACK_TCP_H /* TCP tracking. */ +#include + /* This is exposed to userspace (ctnetlink) */ enum tcp_conntrack { TCP_CONNTRACK_NONE, @@ -34,8 +36,8 @@ enum tcp_conntrack { #define IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED 0x10 struct nf_ct_tcp_flags { - u_int8_t flags; - u_int8_t mask; + __u8 flags; + __u8 mask; }; #ifdef __KERNEL__ diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 7d8e0455ccac..e53546cfa353 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -25,8 +25,8 @@ enum nfnetlink_groups { /* General form of address family dependent message. */ struct nfgenmsg { - u_int8_t nfgen_family; /* AF_xxx */ - u_int8_t version; /* nfnetlink version */ + __u8 nfgen_family; /* AF_xxx */ + __u8 version; /* nfnetlink version */ __be16 res_id; /* resource id */ }; diff --git a/include/linux/netfilter/nfnetlink_compat.h b/include/linux/netfilter/nfnetlink_compat.h index e1451760c9cd..eda55cabceec 100644 --- a/include/linux/netfilter/nfnetlink_compat.h +++ b/include/linux/netfilter/nfnetlink_compat.h @@ -1,5 +1,8 @@ #ifndef _NFNETLINK_COMPAT_H #define _NFNETLINK_COMPAT_H + +#include + #ifndef __KERNEL__ /* Old nfnetlink macros for userspace */ @@ -20,8 +23,8 @@ struct nfattr { - u_int16_t nfa_len; - u_int16_t nfa_type; /* we use 15 bits for the type, and the highest + __u16 nfa_len; + __u16 nfa_type; /* we use 15 bits for the type, and the highest * bit to indicate whether the payload is nested */ }; diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h index f661731f3cb1..d3bab7a2c9b7 100644 --- a/include/linux/netfilter/nfnetlink_log.h +++ b/include/linux/netfilter/nfnetlink_log.h @@ -17,14 +17,14 @@ enum nfulnl_msg_types { struct nfulnl_msg_packet_hdr { __be16 hw_protocol; /* hw protocol (network order) */ - u_int8_t hook; /* netfilter hook */ - u_int8_t _pad; + __u8 hook; /* netfilter hook */ + __u8 _pad; }; struct nfulnl_msg_packet_hw { __be16 hw_addrlen; - u_int16_t _pad; - u_int8_t hw_addr[8]; + __u16 _pad; + __u8 hw_addr[8]; }; struct nfulnl_msg_packet_timestamp { @@ -35,12 +35,12 @@ struct nfulnl_msg_packet_timestamp { enum nfulnl_attr_type { NFULA_UNSPEC, NFULA_PACKET_HDR, - NFULA_MARK, /* u_int32_t nfmark */ + NFULA_MARK, /* __u32 nfmark */ NFULA_TIMESTAMP, /* nfulnl_msg_packet_timestamp */ - NFULA_IFINDEX_INDEV, /* u_int32_t ifindex */ - NFULA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ - NFULA_IFINDEX_PHYSINDEV, /* u_int32_t ifindex */ - NFULA_IFINDEX_PHYSOUTDEV, /* u_int32_t ifindex */ + NFULA_IFINDEX_INDEV, /* __u32 ifindex */ + NFULA_IFINDEX_OUTDEV, /* __u32 ifindex */ + NFULA_IFINDEX_PHYSINDEV, /* __u32 ifindex */ + NFULA_IFINDEX_PHYSOUTDEV, /* __u32 ifindex */ NFULA_HWADDR, /* nfulnl_msg_packet_hw */ NFULA_PAYLOAD, /* opaque data payload */ NFULA_PREFIX, /* string prefix */ @@ -65,23 +65,23 @@ enum nfulnl_msg_config_cmds { }; struct nfulnl_msg_config_cmd { - u_int8_t command; /* nfulnl_msg_config_cmds */ + __u8 command; /* nfulnl_msg_config_cmds */ } __attribute__ ((packed)); struct nfulnl_msg_config_mode { 
__be32 copy_range; - u_int8_t copy_mode; - u_int8_t _pad; + __u8 copy_mode; + __u8 _pad; } __attribute__ ((packed)); enum nfulnl_attr_config { NFULA_CFG_UNSPEC, NFULA_CFG_CMD, /* nfulnl_msg_config_cmd */ NFULA_CFG_MODE, /* nfulnl_msg_config_mode */ - NFULA_CFG_NLBUFSIZ, /* u_int32_t buffer size */ - NFULA_CFG_TIMEOUT, /* u_int32_t in 1/100 s */ - NFULA_CFG_QTHRESH, /* u_int32_t */ - NFULA_CFG_FLAGS, /* u_int16_t */ + NFULA_CFG_NLBUFSIZ, /* __u32 buffer size */ + NFULA_CFG_TIMEOUT, /* __u32 in 1/100 s */ + NFULA_CFG_QTHRESH, /* __u32 */ + NFULA_CFG_FLAGS, /* __u16 */ __NFULA_CFG_MAX }; #define NFULA_CFG_MAX (__NFULA_CFG_MAX -1) diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h index 83e789633e35..2455fe5f4e01 100644 --- a/include/linux/netfilter/nfnetlink_queue.h +++ b/include/linux/netfilter/nfnetlink_queue.h @@ -15,13 +15,13 @@ enum nfqnl_msg_types { struct nfqnl_msg_packet_hdr { __be32 packet_id; /* unique ID of packet in queue */ __be16 hw_protocol; /* hw protocol (network order) */ - u_int8_t hook; /* netfilter hook */ + __u8 hook; /* netfilter hook */ } __attribute__ ((packed)); struct nfqnl_msg_packet_hw { __be16 hw_addrlen; - u_int16_t _pad; - u_int8_t hw_addr[8]; + __u16 _pad; + __u8 hw_addr[8]; }; struct nfqnl_msg_packet_timestamp { @@ -33,12 +33,12 @@ enum nfqnl_attr_type { NFQA_UNSPEC, NFQA_PACKET_HDR, NFQA_VERDICT_HDR, /* nfqnl_msg_verdict_hrd */ - NFQA_MARK, /* u_int32_t nfmark */ + NFQA_MARK, /* __u32 nfmark */ NFQA_TIMESTAMP, /* nfqnl_msg_packet_timestamp */ - NFQA_IFINDEX_INDEV, /* u_int32_t ifindex */ - NFQA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ - NFQA_IFINDEX_PHYSINDEV, /* u_int32_t ifindex */ - NFQA_IFINDEX_PHYSOUTDEV, /* u_int32_t ifindex */ + NFQA_IFINDEX_INDEV, /* __u32 ifindex */ + NFQA_IFINDEX_OUTDEV, /* __u32 ifindex */ + NFQA_IFINDEX_PHYSINDEV, /* __u32 ifindex */ + NFQA_IFINDEX_PHYSOUTDEV, /* __u32 ifindex */ NFQA_HWADDR, /* nfqnl_msg_packet_hw */ NFQA_PAYLOAD, /* opaque data payload */ @@ -61,8 +61,8 @@ enum nfqnl_msg_config_cmds { }; struct nfqnl_msg_config_cmd { - u_int8_t command; /* nfqnl_msg_config_cmds */ - u_int8_t _pad; + __u8 command; /* nfqnl_msg_config_cmds */ + __u8 _pad; __be16 pf; /* AF_xxx for PF_[UN]BIND */ }; @@ -74,7 +74,7 @@ enum nfqnl_config_mode { struct nfqnl_msg_config_params { __be32 copy_range; - u_int8_t copy_mode; /* enum nfqnl_config_mode */ + __u8 copy_mode; /* enum nfqnl_config_mode */ } __attribute__ ((packed)); @@ -82,7 +82,7 @@ enum nfqnl_attr_config { NFQA_CFG_UNSPEC, NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */ NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */ - NFQA_CFG_QUEUE_MAXLEN, /* u_int32_t */ + NFQA_CFG_QUEUE_MAXLEN, /* __u32 */ __NFQA_CFG_MAX }; #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index c7ee8744d26b..33fd9c949d80 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -1,6 +1,8 @@ #ifndef _X_TABLES_H #define _X_TABLES_H +#include + #define XT_FUNCTION_MAXNAMELEN 30 #define XT_TABLE_MAXNAMELEN 32 @@ -8,22 +10,22 @@ struct xt_entry_match { union { struct { - u_int16_t match_size; + __u16 match_size; /* Used by userspace */ char name[XT_FUNCTION_MAXNAMELEN-1]; - u_int8_t revision; + __u8 revision; } user; struct { - u_int16_t match_size; + __u16 match_size; /* Used inside the kernel */ struct xt_match *match; } kernel; /* Total length */ - u_int16_t match_size; + __u16 match_size; } u; unsigned char data[0]; @@ -33,22 +35,22 @@ struct xt_entry_target { 
union { struct { - u_int16_t target_size; + __u16 target_size; /* Used by userspace */ char name[XT_FUNCTION_MAXNAMELEN-1]; - u_int8_t revision; + __u8 revision; } user; struct { - u_int16_t target_size; + __u16 target_size; /* Used inside the kernel */ struct xt_target *target; } kernel; /* Total length */ - u_int16_t target_size; + __u16 target_size; } u; unsigned char data[0]; @@ -74,7 +76,7 @@ struct xt_get_revision { char name[XT_FUNCTION_MAXNAMELEN-1]; - u_int8_t revision; + __u8 revision; }; /* CONTINUE verdict for targets */ @@ -90,10 +92,10 @@ struct xt_get_revision */ struct _xt_align { - u_int8_t u8; - u_int16_t u16; - u_int32_t u32; - u_int64_t u64; + __u8 u8; + __u16 u16; + __u32 u32; + __u64 u64; }; #define XT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) \ @@ -109,7 +111,7 @@ struct _xt_align struct xt_counters { - u_int64_t pcnt, bcnt; /* Packet and byte counters */ + __u64 pcnt, bcnt; /* Packet and byte counters */ }; /* The argument to IPT_SO_ADD_COUNTERS. */ diff --git a/include/linux/netfilter/xt_CLASSIFY.h b/include/linux/netfilter/xt_CLASSIFY.h index 58111355255d..a813bf14dd63 100644 --- a/include/linux/netfilter/xt_CLASSIFY.h +++ b/include/linux/netfilter/xt_CLASSIFY.h @@ -1,8 +1,10 @@ #ifndef _XT_CLASSIFY_H #define _XT_CLASSIFY_H +#include + struct xt_classify_target_info { - u_int32_t priority; + __u32 priority; }; #endif /*_XT_CLASSIFY_H */ diff --git a/include/linux/netfilter/xt_CONNMARK.h b/include/linux/netfilter/xt_CONNMARK.h index 4e58ba43c289..7635c8ffdadb 100644 --- a/include/linux/netfilter/xt_CONNMARK.h +++ b/include/linux/netfilter/xt_CONNMARK.h @@ -1,6 +1,8 @@ #ifndef _XT_CONNMARK_H_target #define _XT_CONNMARK_H_target +#include + /* Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * @@ -19,12 +21,12 @@ enum { struct xt_connmark_target_info { unsigned long mark; unsigned long mask; - u_int8_t mode; + __u8 mode; }; struct xt_connmark_tginfo1 { - u_int32_t ctmark, ctmask, nfmask; - u_int8_t mode; + __u32 ctmark, ctmask, nfmask; + __u8 mode; }; #endif /*_XT_CONNMARK_H_target*/ diff --git a/include/linux/netfilter/xt_CONNSECMARK.h b/include/linux/netfilter/xt_CONNSECMARK.h index c6bd75469ba2..b973ff80fa1e 100644 --- a/include/linux/netfilter/xt_CONNSECMARK.h +++ b/include/linux/netfilter/xt_CONNSECMARK.h @@ -1,13 +1,15 @@ #ifndef _XT_CONNSECMARK_H_target #define _XT_CONNSECMARK_H_target +#include + enum { CONNSECMARK_SAVE = 1, CONNSECMARK_RESTORE, }; struct xt_connsecmark_target_info { - u_int8_t mode; + __u8 mode; }; #endif /*_XT_CONNSECMARK_H_target */ diff --git a/include/linux/netfilter/xt_DSCP.h b/include/linux/netfilter/xt_DSCP.h index 14da1968e2c6..648e0b3bed29 100644 --- a/include/linux/netfilter/xt_DSCP.h +++ b/include/linux/netfilter/xt_DSCP.h @@ -11,15 +11,16 @@ #ifndef _XT_DSCP_TARGET_H #define _XT_DSCP_TARGET_H #include +#include /* target info */ struct xt_DSCP_info { - u_int8_t dscp; + __u8 dscp; }; struct xt_tos_target_info { - u_int8_t tos_value; - u_int8_t tos_mask; + __u8 tos_value; + __u8 tos_mask; }; #endif /* _XT_DSCP_TARGET_H */ diff --git a/include/linux/netfilter/xt_MARK.h b/include/linux/netfilter/xt_MARK.h index 778b278fd9f2..028304bcc0b1 100644 --- a/include/linux/netfilter/xt_MARK.h +++ b/include/linux/netfilter/xt_MARK.h @@ -1,6 +1,8 @@ #ifndef _XT_MARK_H_target #define _XT_MARK_H_target +#include + /* Version 0 */ struct xt_mark_target_info { unsigned long mark; @@ -15,11 +17,11 @@ enum { struct xt_mark_target_info_v1 { unsigned long mark; - u_int8_t mode; + __u8 mode; }; struct xt_mark_tginfo2 { - u_int32_t 
mark, mask; + __u32 mark, mask; }; #endif /*_XT_MARK_H_target */ diff --git a/include/linux/netfilter/xt_NFLOG.h b/include/linux/netfilter/xt_NFLOG.h index cdcd0ed58f7a..eaac7b5226e9 100644 --- a/include/linux/netfilter/xt_NFLOG.h +++ b/include/linux/netfilter/xt_NFLOG.h @@ -1,17 +1,19 @@ #ifndef _XT_NFLOG_TARGET #define _XT_NFLOG_TARGET +#include + #define XT_NFLOG_DEFAULT_GROUP 0x1 #define XT_NFLOG_DEFAULT_THRESHOLD 1 #define XT_NFLOG_MASK 0x0 struct xt_nflog_info { - u_int32_t len; - u_int16_t group; - u_int16_t threshold; - u_int16_t flags; - u_int16_t pad; + __u32 len; + __u16 group; + __u16 threshold; + __u16 flags; + __u16 pad; char prefix[64]; }; diff --git a/include/linux/netfilter/xt_NFQUEUE.h b/include/linux/netfilter/xt_NFQUEUE.h index 9a9af79f74d2..982a89f78272 100644 --- a/include/linux/netfilter/xt_NFQUEUE.h +++ b/include/linux/netfilter/xt_NFQUEUE.h @@ -8,9 +8,11 @@ #ifndef _XT_NFQ_TARGET_H #define _XT_NFQ_TARGET_H +#include + /* target info */ struct xt_NFQ_info { - u_int16_t queuenum; + __u16 queuenum; }; #endif /* _XT_NFQ_TARGET_H */ diff --git a/include/linux/netfilter/xt_RATEEST.h b/include/linux/netfilter/xt_RATEEST.h index f79e3133cbea..6605e20ad8cf 100644 --- a/include/linux/netfilter/xt_RATEEST.h +++ b/include/linux/netfilter/xt_RATEEST.h @@ -1,10 +1,12 @@ #ifndef _XT_RATEEST_TARGET_H #define _XT_RATEEST_TARGET_H +#include + struct xt_rateest_target_info { char name[IFNAMSIZ]; - int8_t interval; - u_int8_t ewma_log; + __s8 interval; + __u8 ewma_log; /* Used internally by the kernel */ struct xt_rateest *est __attribute__((aligned(8))); diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h index c53fbffa997d..6fcd3448b186 100644 --- a/include/linux/netfilter/xt_SECMARK.h +++ b/include/linux/netfilter/xt_SECMARK.h @@ -1,6 +1,8 @@ #ifndef _XT_SECMARK_H_target #define _XT_SECMARK_H_target +#include + /* * This is intended for use by various security subsystems (but not * at the same time). 
@@ -12,12 +14,12 @@ #define SECMARK_SELCTX_MAX 256 struct xt_secmark_target_selinux_info { - u_int32_t selsid; + __u32 selsid; char selctx[SECMARK_SELCTX_MAX]; }; struct xt_secmark_target_info { - u_int8_t mode; + __u8 mode; union { struct xt_secmark_target_selinux_info sel; } u; diff --git a/include/linux/netfilter/xt_TCPMSS.h b/include/linux/netfilter/xt_TCPMSS.h index 53a292cd47f3..9a6960afc134 100644 --- a/include/linux/netfilter/xt_TCPMSS.h +++ b/include/linux/netfilter/xt_TCPMSS.h @@ -1,8 +1,10 @@ #ifndef _XT_TCPMSS_H #define _XT_TCPMSS_H +#include + struct xt_tcpmss_info { - u_int16_t mss; + __u16 mss; }; #define XT_TCPMSS_CLAMP_PMTU 0xffff diff --git a/include/linux/netfilter/xt_connbytes.h b/include/linux/netfilter/xt_connbytes.h index c022c989754d..52bd6153b996 100644 --- a/include/linux/netfilter/xt_connbytes.h +++ b/include/linux/netfilter/xt_connbytes.h @@ -1,6 +1,8 @@ #ifndef _XT_CONNBYTES_H #define _XT_CONNBYTES_H +#include + enum xt_connbytes_what { XT_CONNBYTES_PKTS, XT_CONNBYTES_BYTES, @@ -19,7 +21,7 @@ struct xt_connbytes_info aligned_u64 from; /* count to be matched */ aligned_u64 to; /* count to be matched */ } count; - u_int8_t what; /* ipt_connbytes_what */ - u_int8_t direction; /* ipt_connbytes_direction */ + __u8 what; /* ipt_connbytes_what */ + __u8 direction; /* ipt_connbytes_direction */ }; #endif diff --git a/include/linux/netfilter/xt_connmark.h b/include/linux/netfilter/xt_connmark.h index 359ef86918dc..571e266d004c 100644 --- a/include/linux/netfilter/xt_connmark.h +++ b/include/linux/netfilter/xt_connmark.h @@ -1,6 +1,8 @@ #ifndef _XT_CONNMARK_H #define _XT_CONNMARK_H +#include + /* Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * @@ -12,12 +14,12 @@ struct xt_connmark_info { unsigned long mark, mask; - u_int8_t invert; + __u8 invert; }; struct xt_connmark_mtinfo1 { - u_int32_t mark, mask; - u_int8_t invert; + __u32 mark, mask; + __u8 invert; }; #endif /*_XT_CONNMARK_H*/ diff --git a/include/linux/netfilter/xt_conntrack.h b/include/linux/netfilter/xt_conntrack.h index 8f5345275393..3430c7751948 100644 --- a/include/linux/netfilter/xt_conntrack.h +++ b/include/linux/netfilter/xt_conntrack.h @@ -63,9 +63,9 @@ struct xt_conntrack_info unsigned long expires_min, expires_max; /* Flags word */ - u_int8_t flags; + __u8 flags; /* Inverse flags */ - u_int8_t invflags; + __u8 invflags; }; struct xt_conntrack_mtinfo1 { @@ -73,12 +73,12 @@ struct xt_conntrack_mtinfo1 { union nf_inet_addr origdst_addr, origdst_mask; union nf_inet_addr replsrc_addr, replsrc_mask; union nf_inet_addr repldst_addr, repldst_mask; - u_int32_t expires_min, expires_max; - u_int16_t l4proto; + __u32 expires_min, expires_max; + __u16 l4proto; __be16 origsrc_port, origdst_port; __be16 replsrc_port, repldst_port; - u_int16_t match_flags, invert_flags; - u_int8_t state_mask, status_mask; + __u16 match_flags, invert_flags; + __u8 state_mask, status_mask; }; #endif /*_XT_CONNTRACK_H*/ diff --git a/include/linux/netfilter/xt_dccp.h b/include/linux/netfilter/xt_dccp.h index e0221b9d32cb..a579e1b6f040 100644 --- a/include/linux/netfilter/xt_dccp.h +++ b/include/linux/netfilter/xt_dccp.h @@ -1,6 +1,8 @@ #ifndef _XT_DCCP_H_ #define _XT_DCCP_H_ +#include + #define XT_DCCP_SRC_PORTS 0x01 #define XT_DCCP_DEST_PORTS 0x02 #define XT_DCCP_TYPE 0x04 @@ -9,14 +11,14 @@ #define XT_DCCP_VALID_FLAGS 0x0f struct xt_dccp_info { - u_int16_t dpts[2]; /* Min, Max */ - u_int16_t spts[2]; /* Min, Max */ + __u16 dpts[2]; /* Min, Max */ + __u16 spts[2]; /* Min, Max */ - u_int16_t flags; - u_int16_t invflags; + 
__u16 flags; + __u16 invflags; - u_int16_t typemask; - u_int8_t option; + __u16 typemask; + __u8 option; }; #endif /* _XT_DCCP_H_ */ diff --git a/include/linux/netfilter/xt_dscp.h b/include/linux/netfilter/xt_dscp.h index f49bc1a648dc..15f8932ad5ce 100644 --- a/include/linux/netfilter/xt_dscp.h +++ b/include/linux/netfilter/xt_dscp.h @@ -10,20 +10,22 @@ #ifndef _XT_DSCP_H #define _XT_DSCP_H +#include + #define XT_DSCP_MASK 0xfc /* 11111100 */ #define XT_DSCP_SHIFT 2 #define XT_DSCP_MAX 0x3f /* 00111111 */ /* match info */ struct xt_dscp_info { - u_int8_t dscp; - u_int8_t invert; + __u8 dscp; + __u8 invert; }; struct xt_tos_match_info { - u_int8_t tos_mask; - u_int8_t tos_value; - u_int8_t invert; + __u8 tos_mask; + __u8 tos_value; + __u8 invert; }; #endif /* _XT_DSCP_H */ diff --git a/include/linux/netfilter/xt_esp.h b/include/linux/netfilter/xt_esp.h index 9380fb1c27da..ef6fa4747d0a 100644 --- a/include/linux/netfilter/xt_esp.h +++ b/include/linux/netfilter/xt_esp.h @@ -1,10 +1,12 @@ #ifndef _XT_ESP_H #define _XT_ESP_H +#include + struct xt_esp { - u_int32_t spis[2]; /* Security Parameter Index */ - u_int8_t invflags; /* Inverse flags */ + __u32 spis[2]; /* Security Parameter Index */ + __u8 invflags; /* Inverse flags */ }; /* Values for "invflags" field in struct xt_esp. */ diff --git a/include/linux/netfilter/xt_hashlimit.h b/include/linux/netfilter/xt_hashlimit.h index 51b18d83b477..b1925b5925e9 100644 --- a/include/linux/netfilter/xt_hashlimit.h +++ b/include/linux/netfilter/xt_hashlimit.h @@ -1,6 +1,8 @@ #ifndef _XT_HASHLIMIT_H #define _XT_HASHLIMIT_H +#include + /* timings are in milliseconds. */ #define XT_HASHLIMIT_SCALE 10000 /* 1/10,000 sec period => max of 10,000/sec. Min rate is then 429490 @@ -18,15 +20,15 @@ enum { }; struct hashlimit_cfg { - u_int32_t mode; /* bitmask of XT_HASHLIMIT_HASH_* */ - u_int32_t avg; /* Average secs between packets * scale */ - u_int32_t burst; /* Period multiplier for upper limit. */ + __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */ + __u32 avg; /* Average secs between packets * scale */ + __u32 burst; /* Period multiplier for upper limit. */ /* user specified */ - u_int32_t size; /* how many buckets */ - u_int32_t max; /* max number of entries */ - u_int32_t gc_interval; /* gc interval */ - u_int32_t expire; /* when do entries expire? */ + __u32 size; /* how many buckets */ + __u32 max; /* max number of entries */ + __u32 gc_interval; /* gc interval */ + __u32 expire; /* when do entries expire? */ }; struct xt_hashlimit_info { @@ -42,17 +44,17 @@ struct xt_hashlimit_info { }; struct hashlimit_cfg1 { - u_int32_t mode; /* bitmask of XT_HASHLIMIT_HASH_* */ - u_int32_t avg; /* Average secs between packets * scale */ - u_int32_t burst; /* Period multiplier for upper limit. */ + __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */ + __u32 avg; /* Average secs between packets * scale */ + __u32 burst; /* Period multiplier for upper limit. */ /* user specified */ - u_int32_t size; /* how many buckets */ - u_int32_t max; /* max number of entries */ - u_int32_t gc_interval; /* gc interval */ - u_int32_t expire; /* when do entries expire? */ + __u32 size; /* how many buckets */ + __u32 max; /* max number of entries */ + __u32 gc_interval; /* gc interval */ + __u32 expire; /* when do entries expire? 
*/ - u_int8_t srcmask, dstmask; + __u8 srcmask, dstmask; }; struct xt_hashlimit_mtinfo1 { diff --git a/include/linux/netfilter/xt_iprange.h b/include/linux/netfilter/xt_iprange.h index a4299c7d3680..c1f21a779a45 100644 --- a/include/linux/netfilter/xt_iprange.h +++ b/include/linux/netfilter/xt_iprange.h @@ -1,6 +1,8 @@ #ifndef _LINUX_NETFILTER_XT_IPRANGE_H #define _LINUX_NETFILTER_XT_IPRANGE_H 1 +#include + enum { IPRANGE_SRC = 1 << 0, /* match source IP address */ IPRANGE_DST = 1 << 1, /* match destination IP address */ @@ -11,7 +13,7 @@ enum { struct xt_iprange_mtinfo { union nf_inet_addr src_min, src_max; union nf_inet_addr dst_min, dst_max; - u_int8_t flags; + __u8 flags; }; #endif /* _LINUX_NETFILTER_XT_IPRANGE_H */ diff --git a/include/linux/netfilter/xt_length.h b/include/linux/netfilter/xt_length.h index 7c2b439f73fe..b82ed7c4b1e0 100644 --- a/include/linux/netfilter/xt_length.h +++ b/include/linux/netfilter/xt_length.h @@ -1,9 +1,11 @@ #ifndef _XT_LENGTH_H #define _XT_LENGTH_H +#include + struct xt_length_info { - u_int16_t min, max; - u_int8_t invert; + __u16 min, max; + __u8 invert; }; #endif /*_XT_LENGTH_H*/ diff --git a/include/linux/netfilter/xt_limit.h b/include/linux/netfilter/xt_limit.h index b3ce65375ecb..190e98b1f7c9 100644 --- a/include/linux/netfilter/xt_limit.h +++ b/include/linux/netfilter/xt_limit.h @@ -1,19 +1,21 @@ #ifndef _XT_RATE_H #define _XT_RATE_H +#include + /* timings are in milliseconds. */ #define XT_LIMIT_SCALE 10000 /* 1/10,000 sec period => max of 10,000/sec. Min rate is then 429490 seconds, or one every 59 hours. */ struct xt_rateinfo { - u_int32_t avg; /* Average secs between packets * scale */ - u_int32_t burst; /* Period multiplier for upper limit. */ + __u32 avg; /* Average secs between packets * scale */ + __u32 burst; /* Period multiplier for upper limit. */ /* Used internally by the kernel */ unsigned long prev; - u_int32_t credit; - u_int32_t credit_cap, cost; + __u32 credit; + __u32 credit_cap, cost; /* Ugly, ugly fucker. 
*/ struct xt_rateinfo *master; diff --git a/include/linux/netfilter/xt_mark.h b/include/linux/netfilter/xt_mark.h index fae74bc3f34e..6fa460a3cc29 100644 --- a/include/linux/netfilter/xt_mark.h +++ b/include/linux/netfilter/xt_mark.h @@ -1,14 +1,16 @@ #ifndef _XT_MARK_H #define _XT_MARK_H +#include + struct xt_mark_info { unsigned long mark, mask; - u_int8_t invert; + __u8 invert; }; struct xt_mark_mtinfo1 { - u_int32_t mark, mask; - u_int8_t invert; + __u32 mark, mask; + __u8 invert; }; #endif /*_XT_MARK_H*/ diff --git a/include/linux/netfilter/xt_multiport.h b/include/linux/netfilter/xt_multiport.h index d49ee4183710..185db499fcbc 100644 --- a/include/linux/netfilter/xt_multiport.h +++ b/include/linux/netfilter/xt_multiport.h @@ -1,6 +1,8 @@ #ifndef _XT_MULTIPORT_H #define _XT_MULTIPORT_H +#include + enum xt_multiport_flags { XT_MULTIPORT_SOURCE, @@ -13,18 +15,18 @@ enum xt_multiport_flags /* Must fit inside union xt_matchinfo: 16 bytes */ struct xt_multiport { - u_int8_t flags; /* Type of comparison */ - u_int8_t count; /* Number of ports */ - u_int16_t ports[XT_MULTI_PORTS]; /* Ports */ + __u8 flags; /* Type of comparison */ + __u8 count; /* Number of ports */ + __u16 ports[XT_MULTI_PORTS]; /* Ports */ }; struct xt_multiport_v1 { - u_int8_t flags; /* Type of comparison */ - u_int8_t count; /* Number of ports */ - u_int16_t ports[XT_MULTI_PORTS]; /* Ports */ - u_int8_t pflags[XT_MULTI_PORTS]; /* Port flags */ - u_int8_t invert; /* Invert flag */ + __u8 flags; /* Type of comparison */ + __u8 count; /* Number of ports */ + __u16 ports[XT_MULTI_PORTS]; /* Ports */ + __u8 pflags[XT_MULTI_PORTS]; /* Port flags */ + __u8 invert; /* Invert flag */ }; #endif /*_XT_MULTIPORT_H*/ diff --git a/include/linux/netfilter/xt_owner.h b/include/linux/netfilter/xt_owner.h index c84e52cfe415..2081761714b5 100644 --- a/include/linux/netfilter/xt_owner.h +++ b/include/linux/netfilter/xt_owner.h @@ -1,6 +1,8 @@ #ifndef _XT_OWNER_MATCH_H #define _XT_OWNER_MATCH_H +#include + enum { XT_OWNER_UID = 1 << 0, XT_OWNER_GID = 1 << 1, @@ -8,9 +10,9 @@ enum { }; struct xt_owner_match_info { - u_int32_t uid_min, uid_max; - u_int32_t gid_min, gid_max; - u_int8_t match, invert; + __u32 uid_min, uid_max; + __u32 gid_min, gid_max; + __u8 match, invert; }; #endif /* _XT_OWNER_MATCH_H */ diff --git a/include/linux/netfilter/xt_physdev.h b/include/linux/netfilter/xt_physdev.h index 25a7a1815b5b..8555e399886d 100644 --- a/include/linux/netfilter/xt_physdev.h +++ b/include/linux/netfilter/xt_physdev.h @@ -1,6 +1,8 @@ #ifndef _XT_PHYSDEV_H #define _XT_PHYSDEV_H +#include + #ifdef __KERNEL__ #include #endif @@ -17,8 +19,8 @@ struct xt_physdev_info { char in_mask[IFNAMSIZ]; char physoutdev[IFNAMSIZ]; char out_mask[IFNAMSIZ]; - u_int8_t invert; - u_int8_t bitmask; + __u8 invert; + __u8 bitmask; }; #endif /*_XT_PHYSDEV_H*/ diff --git a/include/linux/netfilter/xt_policy.h b/include/linux/netfilter/xt_policy.h index 053d8cc65464..7bb64e7c853d 100644 --- a/include/linux/netfilter/xt_policy.h +++ b/include/linux/netfilter/xt_policy.h @@ -1,6 +1,8 @@ #ifndef _XT_POLICY_H #define _XT_POLICY_H +#include + #define XT_POLICY_MAX_ELEM 4 enum xt_policy_flags @@ -19,7 +21,7 @@ enum xt_policy_modes struct xt_policy_spec { - u_int8_t saddr:1, + __u8 saddr:1, daddr:1, proto:1, mode:1, @@ -55,9 +57,9 @@ struct xt_policy_elem #endif }; __be32 spi; - u_int32_t reqid; - u_int8_t proto; - u_int8_t mode; + __u32 reqid; + __u8 proto; + __u8 mode; struct xt_policy_spec match; struct xt_policy_spec invert; @@ -66,8 +68,8 @@ struct xt_policy_elem struct 
xt_policy_info { struct xt_policy_elem pol[XT_POLICY_MAX_ELEM]; - u_int16_t flags; - u_int16_t len; + __u16 flags; + __u16 len; }; #endif /* _XT_POLICY_H */ diff --git a/include/linux/netfilter/xt_rateest.h b/include/linux/netfilter/xt_rateest.h index 2010cb74250f..d40a6196842a 100644 --- a/include/linux/netfilter/xt_rateest.h +++ b/include/linux/netfilter/xt_rateest.h @@ -1,6 +1,8 @@ #ifndef _XT_RATEEST_MATCH_H #define _XT_RATEEST_MATCH_H +#include + enum xt_rateest_match_flags { XT_RATEEST_MATCH_INVERT = 1<<0, XT_RATEEST_MATCH_ABS = 1<<1, @@ -20,12 +22,12 @@ enum xt_rateest_match_mode { struct xt_rateest_match_info { char name1[IFNAMSIZ]; char name2[IFNAMSIZ]; - u_int16_t flags; - u_int16_t mode; - u_int32_t bps1; - u_int32_t pps1; - u_int32_t bps2; - u_int32_t pps2; + __u16 flags; + __u16 mode; + __u32 bps1; + __u32 pps1; + __u32 bps2; + __u32 pps2; /* Used internally by the kernel */ struct xt_rateest *est1 __attribute__((aligned(8))); diff --git a/include/linux/netfilter/xt_realm.h b/include/linux/netfilter/xt_realm.h index 220e87245716..d4a82ee56a02 100644 --- a/include/linux/netfilter/xt_realm.h +++ b/include/linux/netfilter/xt_realm.h @@ -1,10 +1,12 @@ #ifndef _XT_REALM_H #define _XT_REALM_H +#include + struct xt_realm_info { - u_int32_t id; - u_int32_t mask; - u_int8_t invert; + __u32 id; + __u32 mask; + __u8 invert; }; #endif /* _XT_REALM_H */ diff --git a/include/linux/netfilter/xt_recent.h b/include/linux/netfilter/xt_recent.h index 5cfeb81c6794..d2c276609925 100644 --- a/include/linux/netfilter/xt_recent.h +++ b/include/linux/netfilter/xt_recent.h @@ -1,6 +1,8 @@ #ifndef _LINUX_NETFILTER_XT_RECENT_H #define _LINUX_NETFILTER_XT_RECENT_H 1 +#include + enum { XT_RECENT_CHECK = 1 << 0, XT_RECENT_SET = 1 << 1, @@ -15,12 +17,12 @@ enum { }; struct xt_recent_mtinfo { - u_int32_t seconds; - u_int32_t hit_count; - u_int8_t check_set; - u_int8_t invert; + __u32 seconds; + __u32 hit_count; + __u8 check_set; + __u8 invert; char name[XT_RECENT_NAME_LEN]; - u_int8_t side; + __u8 side; }; #endif /* _LINUX_NETFILTER_XT_RECENT_H */ diff --git a/include/linux/netfilter/xt_sctp.h b/include/linux/netfilter/xt_sctp.h index 32000ba6ecef..29287be696a2 100644 --- a/include/linux/netfilter/xt_sctp.h +++ b/include/linux/netfilter/xt_sctp.h @@ -1,6 +1,8 @@ #ifndef _XT_SCTP_H_ #define _XT_SCTP_H_ +#include + #define XT_SCTP_SRC_PORTS 0x01 #define XT_SCTP_DEST_PORTS 0x02 #define XT_SCTP_CHUNK_TYPES 0x04 @@ -8,49 +10,49 @@ #define XT_SCTP_VALID_FLAGS 0x07 struct xt_sctp_flag_info { - u_int8_t chunktype; - u_int8_t flag; - u_int8_t flag_mask; + __u8 chunktype; + __u8 flag; + __u8 flag_mask; }; #define XT_NUM_SCTP_FLAGS 4 struct xt_sctp_info { - u_int16_t dpts[2]; /* Min, Max */ - u_int16_t spts[2]; /* Min, Max */ + __u16 dpts[2]; /* Min, Max */ + __u16 spts[2]; /* Min, Max */ - u_int32_t chunkmap[256 / sizeof (u_int32_t)]; /* Bit mask of chunks to be matched according to RFC 2960 */ + __u32 chunkmap[256 / sizeof (__u32)]; /* Bit mask of chunks to be matched according to RFC 2960 */ #define SCTP_CHUNK_MATCH_ANY 0x01 /* Match if any of the chunk types are present */ #define SCTP_CHUNK_MATCH_ALL 0x02 /* Match if all of the chunk types are present */ #define SCTP_CHUNK_MATCH_ONLY 0x04 /* Match if these are the only chunk types present */ - u_int32_t chunk_match_type; + __u32 chunk_match_type; struct xt_sctp_flag_info flag_info[XT_NUM_SCTP_FLAGS]; int flag_count; - u_int32_t flags; - u_int32_t invflags; + __u32 flags; + __u32 invflags; }; #define bytes(type) (sizeof(type) * 8) #define SCTP_CHUNKMAP_SET(chunkmap, 
type) \ do { \ - (chunkmap)[type / bytes(u_int32_t)] |= \ - 1 << (type % bytes(u_int32_t)); \ + (chunkmap)[type / bytes(__u32)] |= \ + 1 << (type % bytes(__u32)); \ } while (0) #define SCTP_CHUNKMAP_CLEAR(chunkmap, type) \ do { \ - (chunkmap)[type / bytes(u_int32_t)] &= \ - ~(1 << (type % bytes(u_int32_t))); \ + (chunkmap)[type / bytes(__u32)] &= \ + ~(1 << (type % bytes(__u32))); \ } while (0) #define SCTP_CHUNKMAP_IS_SET(chunkmap, type) \ ({ \ - ((chunkmap)[type / bytes (u_int32_t)] & \ - (1 << (type % bytes (u_int32_t)))) ? 1: 0; \ + ((chunkmap)[type / bytes (__u32)] & \ + (1 << (type % bytes (__u32)))) ? 1: 0; \ }) #define SCTP_CHUNKMAP_RESET(chunkmap) \ @@ -65,7 +67,7 @@ struct xt_sctp_info { #define SCTP_CHUNKMAP_IS_CLEAR(chunkmap) \ __sctp_chunkmap_is_clear((chunkmap), ARRAY_SIZE(chunkmap)) static inline bool -__sctp_chunkmap_is_clear(const u_int32_t *chunkmap, unsigned int n) +__sctp_chunkmap_is_clear(const __u32 *chunkmap, unsigned int n) { unsigned int i; for (i = 0; i < n; ++i) @@ -77,7 +79,7 @@ __sctp_chunkmap_is_clear(const u_int32_t *chunkmap, unsigned int n) #define SCTP_CHUNKMAP_IS_ALL_SET(chunkmap) \ __sctp_chunkmap_is_all_set((chunkmap), ARRAY_SIZE(chunkmap)) static inline bool -__sctp_chunkmap_is_all_set(const u_int32_t *chunkmap, unsigned int n) +__sctp_chunkmap_is_all_set(const __u32 *chunkmap, unsigned int n) { unsigned int i; for (i = 0; i < n; ++i) diff --git a/include/linux/netfilter/xt_statistic.h b/include/linux/netfilter/xt_statistic.h index 3d38bc975048..095f3c66f456 100644 --- a/include/linux/netfilter/xt_statistic.h +++ b/include/linux/netfilter/xt_statistic.h @@ -1,6 +1,8 @@ #ifndef _XT_STATISTIC_H #define _XT_STATISTIC_H +#include + enum xt_statistic_mode { XT_STATISTIC_MODE_RANDOM, XT_STATISTIC_MODE_NTH, @@ -14,17 +16,17 @@ enum xt_statistic_flags { #define XT_STATISTIC_MASK 0x1 struct xt_statistic_info { - u_int16_t mode; - u_int16_t flags; + __u16 mode; + __u16 flags; union { struct { - u_int32_t probability; + __u32 probability; } random; struct { - u_int32_t every; - u_int32_t packet; + __u32 every; + __u32 packet; /* Used internally by the kernel */ - u_int32_t count; + __u32 count; } nth; } u; struct xt_statistic_info *master __attribute__((aligned(8))); diff --git a/include/linux/netfilter/xt_string.h b/include/linux/netfilter/xt_string.h index 8a6ba7bbef9f..ecbb95fc89ed 100644 --- a/include/linux/netfilter/xt_string.h +++ b/include/linux/netfilter/xt_string.h @@ -1,6 +1,8 @@ #ifndef _XT_STRING_H #define _XT_STRING_H +#include + #define XT_STRING_MAX_PATTERN_SIZE 128 #define XT_STRING_MAX_ALGO_NAME_SIZE 16 @@ -11,18 +13,18 @@ enum { struct xt_string_info { - u_int16_t from_offset; - u_int16_t to_offset; + __u16 from_offset; + __u16 to_offset; char algo[XT_STRING_MAX_ALGO_NAME_SIZE]; char pattern[XT_STRING_MAX_PATTERN_SIZE]; - u_int8_t patlen; + __u8 patlen; union { struct { - u_int8_t invert; + __u8 invert; } v0; struct { - u_int8_t flags; + __u8 flags; } v1; } u; diff --git a/include/linux/netfilter/xt_tcpmss.h b/include/linux/netfilter/xt_tcpmss.h index e03274c4c790..fbac56b9e667 100644 --- a/include/linux/netfilter/xt_tcpmss.h +++ b/include/linux/netfilter/xt_tcpmss.h @@ -1,9 +1,11 @@ #ifndef _XT_TCPMSS_MATCH_H #define _XT_TCPMSS_MATCH_H +#include + struct xt_tcpmss_match_info { - u_int16_t mss_min, mss_max; - u_int8_t invert; + __u16 mss_min, mss_max; + __u8 invert; }; #endif /*_XT_TCPMSS_MATCH_H*/ diff --git a/include/linux/netfilter/xt_tcpudp.h b/include/linux/netfilter/xt_tcpudp.h index 78bc65f11adf..a490a0bc1d29 100644 --- 
a/include/linux/netfilter/xt_tcpudp.h +++ b/include/linux/netfilter/xt_tcpudp.h @@ -1,15 +1,17 @@ #ifndef _XT_TCPUDP_H #define _XT_TCPUDP_H +#include + /* TCP matching stuff */ struct xt_tcp { - u_int16_t spts[2]; /* Source port range. */ - u_int16_t dpts[2]; /* Destination port range. */ - u_int8_t option; /* TCP Option iff non-zero*/ - u_int8_t flg_mask; /* TCP flags mask byte */ - u_int8_t flg_cmp; /* TCP flags compare byte */ - u_int8_t invflags; /* Inverse flags */ + __u16 spts[2]; /* Source port range. */ + __u16 dpts[2]; /* Destination port range. */ + __u8 option; /* TCP Option iff non-zero*/ + __u8 flg_mask; /* TCP flags mask byte */ + __u8 flg_cmp; /* TCP flags compare byte */ + __u8 invflags; /* Inverse flags */ }; /* Values for "inv" field in struct ipt_tcp. */ @@ -22,9 +24,9 @@ struct xt_tcp /* UDP matching stuff */ struct xt_udp { - u_int16_t spts[2]; /* Source port range. */ - u_int16_t dpts[2]; /* Destination port range. */ - u_int8_t invflags; /* Inverse flags */ + __u16 spts[2]; /* Source port range. */ + __u16 dpts[2]; /* Destination port range. */ + __u8 invflags; /* Inverse flags */ }; /* Values for "invflags" field in struct ipt_udp. */ -- cgit v1.2.3-71-gd317 From 3a471cbc081b6bf2b58a48db13d734ecd3b0d437 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Feb 2009 00:51:45 +0100 Subject: remove __KERNEL_STRICT_NAMES With the last used of non-strict names gone from the exported header files, we can remove the old libc5 compatibility cruft from our headers and only export strict types. Signed-off-by: Arnd Bergmann Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- include/asm-generic/statfs.h | 5 +++-- include/linux/types.h | 13 ++----------- 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/statfs.h b/include/asm-generic/statfs.h index 6129d6802149..3b4fb3e52f0d 100644 --- a/include/asm-generic/statfs.h +++ b/include/asm-generic/statfs.h @@ -1,8 +1,9 @@ #ifndef _GENERIC_STATFS_H #define _GENERIC_STATFS_H -#ifndef __KERNEL_STRICT_NAMES -# include +#include + +#ifdef __KERNEL__ typedef __kernel_fsid_t fsid_t; #endif diff --git a/include/linux/types.h b/include/linux/types.h index fca82ed55f49..5abe354020f9 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -13,7 +13,7 @@ #include -#ifndef __KERNEL_STRICT_NAMES +#ifdef __KERNEL__ typedef __u32 __kernel_dev_t; @@ -31,7 +31,6 @@ typedef __kernel_timer_t timer_t; typedef __kernel_clockid_t clockid_t; typedef __kernel_mqd_t mqd_t; -#ifdef __KERNEL__ typedef _Bool bool; typedef __kernel_uid32_t uid_t; @@ -47,14 +46,6 @@ typedef __kernel_old_uid_t old_uid_t; typedef __kernel_old_gid_t old_gid_t; #endif /* CONFIG_UID16 */ -/* libc5 includes this file to define uid_t, thus uid_t can never change - * when it is included by non-kernel code - */ -#else -typedef __kernel_uid_t uid_t; -typedef __kernel_gid_t gid_t; -#endif /* __KERNEL__ */ - #if defined(__GNUC__) typedef __kernel_loff_t loff_t; #endif @@ -156,7 +147,7 @@ typedef unsigned long blkcnt_t; #define pgoff_t unsigned long #endif -#endif /* __KERNEL_STRICT_NAMES */ +#endif /* __KERNEL__ */ /* * Below are truly Linux-specific types that should never collide with -- cgit v1.2.3-71-gd317 From 8cd2c29dd5f04d91dac6ea7f8b9df4ff1b4380ee Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 25 Feb 2009 15:22:19 -0800 Subject: compiler-gcc4: conditionalize #error on __KERNEL__ Impact: Fix for exported headers We only want to error out on specific gcc versions if we are actually building the kernel, so conditionalize the #if...#error on __KERNEL__. Based on a patchset by Arnd Bergmann . Cc: Arnd Bergmann Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc4.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index 09992718f9e8..450fa597c94d 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -3,8 +3,10 @@ #endif /* GCC 4.1.[01] miscompiles __weak */ -#if __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ <= 1 -# error Your version of gcc miscompiles the __weak directive +#ifdef __KERNEL__ +# if __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ <= 1 +# error Your version of gcc miscompiles the __weak directive +# endif #endif #define __used __attribute__((__used__)) -- cgit v1.2.3-71-gd317 From d0adde574b8487ef30f69e2d08bba769e4be513f Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Mar 2009 17:49:56 +0000 Subject: Add a strictatime mount option Add support for explicitly requesting full atime updates. This makes it possible for kernels to default to relatime but still allow userspace to override it. Signed-off-by: Matthew Garrett Signed-off-by: Linus Torvalds --- fs/namespace.c | 6 +++++- include/linux/fs.h | 1 + include/linux/mount.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 06f8e63f6cb1..d0659ec291c9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -780,6 +780,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, + { MNT_STRICTATIME, ",strictatime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; @@ -1932,11 +1933,14 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NODIRATIME; if (flags & MS_RELATIME) mnt_flags |= MNT_RELATIME; + if (flags & MS_STRICTATIME) + mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); /* ... and get the mountpoint */ retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); diff --git a/include/linux/fs.h b/include/linux/fs.h index 92734c0012e6..5bc81c4a98c1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -141,6 +141,7 @@ struct inodes_stat_t { #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ +#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) diff --git a/include/linux/mount.h b/include/linux/mount.h index cab2a85e2ee8..51f55f903aff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -27,6 +27,7 @@ struct mnt_namespace; #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_READONLY 0x40 /* does the user want this to be r/o? 
*/ +#define MNT_STRICTATIME 0x80 #define MNT_SHRINKABLE 0x100 #define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ -- cgit v1.2.3-71-gd317 From 898585172fa729513d8636257b44bd1cfd279096 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 16 Feb 2009 02:55:47 +0800 Subject: PCI: save and restore PCIe 2.0 registers PCIe 2.0 defines several new registers (Device Control 2, Link Control 2, and Slot Control 2). Save and retore them in pci_save_pcie_state() and pci_restore_pcie_state(). Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 11 ++++++++++- include/linux/pci_regs.h | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 676bbcbc272b..59569b8cf1d5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -647,6 +647,8 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) EXPORT_SYMBOL(pci_choose_state); +#define PCI_EXP_SAVE_REGS 7 + static int pci_save_pcie_state(struct pci_dev *dev) { int pos, i = 0; @@ -668,6 +670,9 @@ static int pci_save_pcie_state(struct pci_dev *dev) pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &cap[i++]); pci_read_config_word(dev, pos + PCI_EXP_SLTCTL, &cap[i++]); pci_read_config_word(dev, pos + PCI_EXP_RTCTL, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_DEVCTL2, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_LNKCTL2, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_SLTCTL2, &cap[i++]); return 0; } @@ -688,6 +693,9 @@ static void pci_restore_pcie_state(struct pci_dev *dev) pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, cap[i++]); pci_write_config_word(dev, pos + PCI_EXP_SLTCTL, cap[i++]); pci_write_config_word(dev, pos + PCI_EXP_RTCTL, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_DEVCTL2, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_LNKCTL2, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_SLTCTL2, cap[i++]); } @@ -1372,7 +1380,8 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) { int error; - error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, 4 * sizeof(u16)); + error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, + PCI_EXP_SAVE_REGS * sizeof(u16)); if (error) dev_err(&dev->dev, "unable to preallocate PCI Express save buffer\n"); diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index d4e663877f45..e4d08c1b2e0b 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -488,6 +488,8 @@ #define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ #define PCI_EXP_DEVCTL2_ARI 0x20 /* Alternative Routing-ID */ +#define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ +#define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */ /* Extended Capabilities (PCI-X 2.0 and Express) */ #define PCI_EXT_CAP_ID(header) (header & 0x0000ffff) -- cgit v1.2.3-71-gd317 From 3341323bb4c198f704cffbfdda37bcec1226ef7d Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Fri, 27 Mar 2009 12:59:54 +0800 Subject: hwrng: timeriomem - Use phys address rather than virt There is no ioremap'ing or anything in timeriomem-rng.c as I foolishly used already remapped virtual addresses instead of passing the physical address to be polled. This patch fixes this flaw and lets developers do the Right Thing(tm). 
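A minimal board-file sketch of the new contract (the address, names and .id value below are illustrative assumptions, not taken from this patch): the platform device now advertises the register's physical address as an IORESOURCE_MEM resource, and the driver itself does the request_mem_region()/ioremap() work at probe time.

	/* illustrative only: a made-up board registering the RNG */
	static struct resource toy_rng_resource = {
		.start	= 0x10000000,		/* physical address of the entropy register */
		.end	= 0x10000000 + 4 - 1,	/* one 32-bit register */
		.flags	= IORESOURCE_MEM,
	};

	static struct timeriomem_rng_data toy_rng_data = {
		.period	= 1000,			/* microseconds between reads */
	};

	static struct platform_device toy_rng_device = {
		.name			= "timeriomem_rng",	/* assumed driver name */
		.id			= -1,
		.resource		= &toy_rng_resource,
		.num_resources		= 1,
		.dev.platform_data	= &toy_rng_data,
	};

With this scheme the .address member of struct timeriomem_rng_data is filled in by the driver with the ioremap()ed cookie, so board code never has to hand out __iomem pointers.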
Signed-off-by: Alexander Clouter Signed-off-by: Herbert Xu --- drivers/char/hw_random/timeriomem-rng.c | 39 +++++++++++++++++++++++++++++---- include/linux/timeriomem-rng.h | 2 +- 2 files changed, 36 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index 10ad41be5897..dcd352ad0e7f 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -90,10 +90,30 @@ static struct hwrng timeriomem_rng_ops = { static int __init timeriomem_rng_probe(struct platform_device *pdev) { + struct resource *res, *mem; int ret; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + if (!res) + return -ENOENT; + + mem = request_mem_region(res->start, res->end - res->start + 1, + pdev->name); + if (mem == NULL) + return -EBUSY; + + dev_set_drvdata(&pdev->dev, mem); + timeriomem_rng_data = pdev->dev.platform_data; + timeriomem_rng_data->address = ioremap(res->start, + res->end - res->start + 1); + if (!timeriomem_rng_data->address) { + ret = -ENOMEM; + goto err_ioremap; + } + if (timeriomem_rng_data->period != 0 && usecs_to_jiffies(timeriomem_rng_data->period) > 0) { timeriomem_rng_timer.expires = jiffies; @@ -104,23 +124,34 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev) timeriomem_rng_data->present = 1; ret = hwrng_register(&timeriomem_rng_ops); - if (ret) { - dev_err(&pdev->dev, "problem registering\n"); - return ret; - } + if (ret) + goto err_register; dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n", timeriomem_rng_data->address, timeriomem_rng_data->period); return 0; + +err_register: + dev_err(&pdev->dev, "problem registering\n"); + iounmap(timeriomem_rng_data->address); +err_ioremap: + release_resource(mem); + + return ret; } static int __devexit timeriomem_rng_remove(struct platform_device *pdev) { + struct resource *mem = dev_get_drvdata(&pdev->dev); + del_timer_sync(&timeriomem_rng_timer); hwrng_unregister(&timeriomem_rng_ops); + iounmap(timeriomem_rng_data->address); + release_resource(mem); + return 0; } diff --git a/include/linux/timeriomem-rng.h b/include/linux/timeriomem-rng.h index dd253177f65f..3e08a1c86830 100644 --- a/include/linux/timeriomem-rng.h +++ b/include/linux/timeriomem-rng.h @@ -14,7 +14,7 @@ struct timeriomem_rng_data { struct completion completion; unsigned int present:1; - u32 __iomem *address; + void __iomem *address; /* measures in usecs */ unsigned int period; -- cgit v1.2.3-71-gd317 From ac99533fb716171db12798039671f19631cf3586 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 26 Mar 2009 15:11:25 +0000 Subject: wan: convert sdla driver to net_device_ops Also use internal net_device_stats Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- drivers/net/wan/sdla.c | 36 +++++++++++++++--------------------- include/linux/if_frad.h | 1 - 2 files changed, 15 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index 6a07ba9371db..1d637f407a0c 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -714,19 +714,19 @@ static int sdla_transmit(struct sk_buff *skb, struct net_device *dev) switch (ret) { case SDLA_RET_OK: - flp->stats.tx_packets++; + dev->stats.tx_packets++; ret = DLCI_RET_OK; break; case SDLA_RET_CIR_OVERFLOW: case SDLA_RET_BUF_OVERSIZE: case SDLA_RET_NO_BUFS: - flp->stats.tx_dropped++; + dev->stats.tx_dropped++; ret = DLCI_RET_DROP; break; default: - flp->stats.tx_errors++; + dev->stats.tx_errors++; ret = DLCI_RET_ERR; break; } @@ -807,7 +807,7 @@ static void sdla_receive(struct net_device *dev) if (i == CONFIG_DLCI_MAX) { printk(KERN_NOTICE "%s: Received packet from invalid DLCI %i, ignoring.", dev->name, dlci); - flp->stats.rx_errors++; + dev->stats.rx_errors++; success = 0; } } @@ -819,7 +819,7 @@ static void sdla_receive(struct net_device *dev) if (skb == NULL) { printk(KERN_NOTICE "%s: Memory squeeze, dropping packet.\n", dev->name); - flp->stats.rx_dropped++; + dev->stats.rx_dropped++; success = 0; } else @@ -859,7 +859,7 @@ static void sdla_receive(struct net_device *dev) if (success) { - flp->stats.rx_packets++; + dev->stats.rx_packets++; dlp = netdev_priv(master); (*dlp->receive)(skb, master); } @@ -1590,13 +1590,14 @@ fail: return err; } -static struct net_device_stats *sdla_stats(struct net_device *dev) -{ - struct frad_local *flp; - flp = netdev_priv(dev); - - return(&flp->stats); -} +static const struct net_device_ops sdla_netdev_ops = { + .ndo_open = sdla_open, + .ndo_stop = sdla_close, + .ndo_do_ioctl = sdla_ioctl, + .ndo_set_config = sdla_set_config, + .ndo_start_xmit = sdla_transmit, + .ndo_change_mtu = sdla_change_mtu, +}; static void setup_sdla(struct net_device *dev) { @@ -1604,20 +1605,13 @@ static void setup_sdla(struct net_device *dev) netdev_boot_setup_check(dev); + dev->netdev_ops = &sdla_netdev_ops; dev->flags = 0; dev->type = 0xFFFF; dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = SDLA_MAX_MTU; - dev->open = sdla_open; - dev->stop = sdla_close; - dev->do_ioctl = sdla_ioctl; - dev->set_config = sdla_set_config; - dev->get_stats = sdla_stats; - dev->hard_start_xmit = sdla_transmit; - dev->change_mtu = sdla_change_mtu; - flp->activate = sdla_activate; flp->deactivate = sdla_deactivate; flp->assoc = sdla_assoc; diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h index 60e16a551dd6..673f2209453d 100644 --- a/include/linux/if_frad.h +++ b/include/linux/if_frad.h @@ -153,7 +153,6 @@ struct frhdr struct dlci_local { - struct net_device_stats stats; struct net_device *master; struct net_device *slave; struct dlci_conf config; -- cgit v1.2.3-71-gd317 From 088b1b88609ce89b6ab19d114cdbec94a44aa22c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 2 Jan 2009 13:34:47 +0100 Subject: ide: improve debugging scheme and more specifically, push __func__ into debug macro thus making ide_debug_log() calls shorter and more readable. 
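Roughly, the idea is for the macro itself to prepend the function name, so callers no longer pass "Call %s" and __func__ by hand. A minimal sketch of such a macro (not the exact include/linux/ide.h definition; the debug-mask test, the "ide:" prefix and the reliance on a drive variable in the calling scope are assumptions made for illustration):

	#define ide_debug_log(lvl, fmt, args...)				\
	do {									\
		if (unlikely(drive->debug_mask & lvl))				\
			printk(KERN_INFO "ide: %s: " fmt "\n",			\
			       __func__, ## args);				\
	} while (0)

With that in place a call site shrinks from

	ide_debug_log(IDE_DBG_SENSE, "Call %s, sense_key: 0x%x\n", __func__,
		      sense->sense_key);

to

	ide_debug_log(IDE_DBG_SENSE, "sense_key: 0x%x", sense->sense_key);

as the ide-cd.c hunks below show.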
Signed-off-by: Borislav Petkov --- drivers/ide/ide-cd.c | 124 ++++++++++++++++++++++------------------------- drivers/ide/ide-cd.h | 2 +- drivers/ide/ide-floppy.c | 35 ++++++------- drivers/ide/ide-gd.c | 4 +- drivers/ide/ide-gd.h | 2 +- include/linux/ide.h | 9 ++-- 6 files changed, 83 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 2177cd11664c..d163e6571e09 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -100,8 +100,7 @@ static int cdrom_log_sense(ide_drive_t *drive, struct request *rq, { int log = 0; - ide_debug_log(IDE_DBG_SENSE, "Call %s, sense_key: 0x%x\n", __func__, - sense->sense_key); + ide_debug_log(IDE_DBG_SENSE, "sense_key: 0x%x", sense->sense_key); if (!sense || !rq || (rq->cmd_flags & REQ_QUIET)) return 0; @@ -151,13 +150,12 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive, unsigned long bio_sectors; struct cdrom_info *info = drive->driver_data; - ide_debug_log(IDE_DBG_SENSE, "Call %s, error_code: 0x%x, " - "sense_key: 0x%x\n", __func__, sense->error_code, - sense->sense_key); + ide_debug_log(IDE_DBG_SENSE, "error_code: 0x%x, sense_key: 0x%x", + sense->error_code, sense->sense_key); if (failed_command) - ide_debug_log(IDE_DBG_SENSE, "%s: failed cmd: 0x%x\n", - __func__, failed_command->cmd[0]); + ide_debug_log(IDE_DBG_SENSE, "failed cmd: 0x%x", + failed_command->cmd[0]); if (!cdrom_log_sense(drive, failed_command, sense)) return; @@ -217,7 +215,7 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, struct cdrom_info *info = drive->driver_data; struct request *rq = &info->request_sense_request; - ide_debug_log(IDE_DBG_SENSE, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_SENSE, "enter"); if (sense == NULL) sense = &info->sense_data; @@ -239,8 +237,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, rq->buffer = (void *) failed_command; if (failed_command) - ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x\n", - failed_command->cmd[0]); + ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x", + failed_command->cmd[0]); drive->hwif->rq = NULL; @@ -252,9 +250,8 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate) struct request *rq = drive->hwif->rq; int nsectors = rq->hard_cur_sectors; - ide_debug_log(IDE_DBG_FUNC, "Call %s, cmd: 0x%x, uptodate: 0x%x, " - "nsectors: %d\n", __func__, rq->cmd[0], uptodate, - nsectors); + ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, uptodate: 0x%x, nsectors: %d", + rq->cmd[0], uptodate, nsectors); if (blk_sense_request(rq) && uptodate) { /* @@ -295,8 +292,8 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate) if (!nsectors) nsectors = 1; - ide_debug_log(IDE_DBG_FUNC, "Exit %s, uptodate: 0x%x, nsectors: %d\n", - __func__, uptodate, nsectors); + ide_debug_log(IDE_DBG_FUNC, "uptodate: 0x%x, nsectors: %d", + uptodate, nsectors); ide_end_request(drive, uptodate, nsectors); } @@ -338,9 +335,10 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret) return 1; } - ide_debug_log(IDE_DBG_RQ, "%s: stat: 0x%x, good_stat: 0x%x, " - "rq->cmd[0]: 0x%x, rq->cmd_type: 0x%x, err: 0x%x\n", - __func__, stat, good_stat, rq->cmd[0], rq->cmd_type, err); + ide_debug_log(IDE_DBG_RQ, "stat: 0x%x, good_stat: 0x%x, cmd[0]: 0x%x, " + "rq->cmd_type: 0x%x, err: 0x%x", + stat, good_stat, rq->cmd[0], rq->cmd_type, + err); if (blk_sense_request(rq)) { /* @@ -530,8 +528,7 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq, { ide_hwif_t *hwif = drive->hwif; - 
ide_debug_log(IDE_DBG_FUNC, "Call %s, ireason: 0x%x, rw: 0x%x\n", - __func__, ireason, rw); + ide_debug_log(IDE_DBG_FUNC, "ireason: 0x%x, rw: 0x%x", ireason, rw); /* * ireason == 0: the drive wants to receive data from us @@ -572,7 +569,7 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq, */ static int ide_cd_check_transfer_size(ide_drive_t *drive, int len) { - ide_debug_log(IDE_DBG_FUNC, "Call %s, len: %d\n", __func__, len); + ide_debug_log(IDE_DBG_FUNC, "len: %d", len); if ((len % SECTOR_SIZE) == 0) return 0; @@ -594,8 +591,7 @@ static int ide_cd_check_transfer_size(ide_drive_t *drive, int len) static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, struct request *rq) { - ide_debug_log(IDE_DBG_RQ, "Call %s: rq->cmd_flags: 0x%x\n", __func__, - rq->cmd_flags); + ide_debug_log(IDE_DBG_RQ, "rq->cmd_flags: 0x%x", rq->cmd_flags); if (rq_data_dir(rq) == READ) { unsigned short sectors_per_frame = @@ -639,7 +635,7 @@ static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, static void ide_cd_restore_request(ide_drive_t *drive, struct request *rq) { - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); if (rq->buffer != bio_data(rq->bio)) { sector_t n = @@ -658,8 +654,7 @@ static void ide_cd_restore_request(ide_drive_t *drive, struct request *rq) static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq) { - ide_debug_log(IDE_DBG_FUNC, "Call %s, rq->cmd[0]: 0x%x\n", - __func__, rq->cmd[0]); + ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]); /* * Some of the trailing request sense fields are optional, @@ -686,9 +681,9 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, if (!sense) sense = &local_sense; - ide_debug_log(IDE_DBG_PC, "Call %s, cmd[0]: 0x%x, write: 0x%x, " - "timeout: %d, cmd_flags: 0x%x\n", __func__, cmd[0], write, - timeout, cmd_flags); + ide_debug_log(IDE_DBG_PC, "cmd[0]: 0x%x, write: 0x%x, timeout: %d, " + "cmd_flags: 0x%x", + cmd[0], write, timeout, cmd_flags); /* start of retry loop */ do { @@ -772,8 +767,8 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) u16 len; u8 ireason; - ide_debug_log(IDE_DBG_PC, "Call %s, rq->cmd[0]: 0x%x, write: 0x%x\n", - __func__, rq->cmd[0], write); + ide_debug_log(IDE_DBG_PC, "cmd[0]: 0x%x, write: 0x%x", + rq->cmd[0], write); /* check for errors */ dma = drive->dma; @@ -810,8 +805,8 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (thislen > len) thislen = len; - ide_debug_log(IDE_DBG_PC, "%s: DRQ: stat: 0x%x, thislen: %d\n", - __func__, stat, thislen); + ide_debug_log(IDE_DBG_PC, "DRQ: stat: 0x%x, thislen: %d", + stat, thislen); /* If DRQ is clear, the command has completed. 
*/ if ((stat & ATA_DRQ) == 0) { @@ -876,8 +871,9 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) xferfunc = hwif->tp_ops->input_data; } - ide_debug_log(IDE_DBG_PC, "%s: data transfer, rq->cmd_type: 0x%x, " - "ireason: 0x%x\n", __func__, rq->cmd_type, ireason); + ide_debug_log(IDE_DBG_PC, "data transfer, rq->cmd_type: 0x%x, " + "ireason: 0x%x", + rq->cmd_type, ireason); /* transfer data */ while (thislen > 0) { @@ -988,9 +984,9 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) unsigned short sectors_per_frame = queue_hardsect_size(drive->queue) >> SECTOR_BITS; - ide_debug_log(IDE_DBG_RQ, "Call %s, rq->cmd[0]: 0x%x, write: 0x%x, " - "secs_per_frame: %u\n", - __func__, rq->cmd[0], write, sectors_per_frame); + ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, write: 0x%x, " + "secs_per_frame: %u", + rq->cmd[0], write, sectors_per_frame); if (write) { /* disk has become write protected */ @@ -1026,9 +1022,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) { - ide_debug_log(IDE_DBG_PC, "Call %s, rq->cmd[0]: 0x%x, " - "rq->cmd_type: 0x%x\n", __func__, rq->cmd[0], - rq->cmd_type); + ide_debug_log(IDE_DBG_PC, "rq->cmd[0]: 0x%x, rq->cmd_type: 0x%x", + rq->cmd[0], rq->cmd_type); if (blk_pc_request(rq)) rq->cmd_flags |= REQ_QUIET; @@ -1067,10 +1062,11 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, sector_t block) { - ide_debug_log(IDE_DBG_RQ, "Call %s, rq->cmd[0]: 0x%x, " - "rq->cmd_type: 0x%x, block: %llu\n", - __func__, rq->cmd[0], rq->cmd_type, - (unsigned long long)block); + ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, block: %llu", + rq->cmd[0], (unsigned long long)block); + + if (drive->debug_mask & IDE_DBG_RQ) + blk_dump_rq_flags(rq, "ide_cd_do_request"); if (blk_fs_request(rq)) { if (cdrom_start_rw(drive, rq) == ide_stopped) @@ -1119,7 +1115,7 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense) struct cdrom_device_info *cdi = &info->devinfo; unsigned char cmd[BLK_MAX_CDB]; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); memset(cmd, 0, BLK_MAX_CDB); cmd[0] = GPCMD_TEST_UNIT_READY; @@ -1147,7 +1143,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, unsigned len = sizeof(capbuf); u32 blocklen; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); memset(cmd, 0, BLK_MAX_CDB); cmd[0] = GPCMD_READ_CDVD_CAPACITY; @@ -1179,8 +1175,8 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, *capacity = 1 + be32_to_cpu(capbuf.lba); *sectors_per_frame = blocklen >> SECTOR_BITS; - ide_debug_log(IDE_DBG_PROBE, "%s: cap: %lu, sectors_per_frame: %lu\n", - __func__, *capacity, *sectors_per_frame); + ide_debug_log(IDE_DBG_PROBE, "cap: %lu, sectors_per_frame: %lu", + *capacity, *sectors_per_frame); return 0; } @@ -1191,7 +1187,7 @@ static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag, { unsigned char cmd[BLK_MAX_CDB]; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); memset(cmd, 0, BLK_MAX_CDB); @@ -1221,7 +1217,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) long last_written; unsigned long sectors_per_frame = SECTORS_PER_FRAME; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); 
if (toc == NULL) { /* try to allocate space */ @@ -1383,7 +1379,7 @@ int ide_cdrom_get_capabilities(ide_drive_t *drive, u8 *buf) struct packet_command cgc; int stat, attempts = 3, size = ATAPI_CAPABILITIES_PAGE_SIZE; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); if ((drive->atapi_flags & IDE_AFLAG_FULL_CAPS_PAGE) == 0) size -= ATAPI_CAPABILITIES_PAGE_PAD_SIZE; @@ -1403,7 +1399,7 @@ void ide_cdrom_update_speed(ide_drive_t *drive, u8 *buf) struct cdrom_info *cd = drive->driver_data; u16 curspeed, maxspeed; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); if (drive->atapi_flags & IDE_AFLAG_LE_SPEED_FIELDS) { curspeed = le16_to_cpup((__le16 *)&buf[8 + 14]); @@ -1413,8 +1409,8 @@ void ide_cdrom_update_speed(ide_drive_t *drive, u8 *buf) maxspeed = be16_to_cpup((__be16 *)&buf[8 + 8]); } - ide_debug_log(IDE_DBG_PROBE, "%s: curspeed: %u, maxspeed: %u\n", - __func__, curspeed, maxspeed); + ide_debug_log(IDE_DBG_PROBE, "curspeed: %u, maxspeed: %u", + curspeed, maxspeed); cd->current_speed = (curspeed + (176/2)) / 176; cd->max_speed = (maxspeed + (176/2)) / 176; @@ -1448,7 +1444,7 @@ static int ide_cdrom_register(ide_drive_t *drive, int nslots) struct cdrom_info *info = drive->driver_data; struct cdrom_device_info *devinfo = &info->devinfo; - ide_debug_log(IDE_DBG_PROBE, "Call %s, nslots: %d\n", __func__, nslots); + ide_debug_log(IDE_DBG_PROBE, "nslots: %d", nslots); devinfo->ops = &ide_cdrom_dops; devinfo->speed = info->current_speed; @@ -1471,9 +1467,8 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive) mechtype_t mechtype; int nslots = 1; - ide_debug_log(IDE_DBG_PROBE, "Call %s, drive->media: 0x%x, " - "drive->atapi_flags: 0x%lx\n", __func__, drive->media, - drive->atapi_flags); + ide_debug_log(IDE_DBG_PROBE, "media: 0x%x, atapi_flags: 0x%lx", + drive->media, drive->atapi_flags); cdi->mask = (CDC_CD_R | CDC_CD_RW | CDC_DVD | CDC_DVD_R | CDC_DVD_RAM | CDC_SELECT_DISC | CDC_PLAY_AUDIO | @@ -1754,7 +1749,7 @@ static int ide_cdrom_setup(ide_drive_t *drive) char *fw_rev = (char *)&id[ATA_ID_FW_REV]; int nslots; - ide_debug_log(IDE_DBG_PROBE, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_PROBE, "enter"); blk_queue_prep_rq(drive->queue, ide_cdrom_prep_fn); blk_queue_dma_alignment(drive->queue, 31); @@ -1797,7 +1792,7 @@ static void ide_cd_remove(ide_drive_t *drive) { struct cdrom_info *info = drive->driver_data; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); ide_proc_unregister_driver(drive, info->driver); device_del(&info->dev); @@ -1815,7 +1810,7 @@ static void ide_cd_release(struct device *dev) ide_drive_t *drive = info->drive; struct gendisk *g = info->disk; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); kfree(info->toc); if (devinfo->handle == drive) @@ -1974,9 +1969,8 @@ static int ide_cd_probe(ide_drive_t *drive) struct gendisk *g; struct request_sense sense; - ide_debug_log(IDE_DBG_PROBE, "Call %s, drive->driver_req: %s, " - "drive->media: 0x%x\n", __func__, drive->driver_req, - drive->media); + ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x", + drive->driver_req, drive->media); if (!strstr("ide-cdrom", drive->driver_req)) goto failed; diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h index c878bfcf1116..3b77dd735088 100644 --- a/drivers/ide/ide-cd.h +++ b/drivers/ide/ide-cd.h @@ -11,7 +11,7 @@ #define IDECD_DEBUG_LOG 0 #if IDECD_DEBUG_LOG -#define ide_debug_log(lvl, fmt, args...) 
__ide_debug_log(lvl, fmt, args) +#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, ## args) #else #define ide_debug_log(lvl, fmt, args...) do {} while (0) #endif diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 317ec62c33d4..d1a79e8e0d69 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -74,7 +74,7 @@ static int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs) struct request *rq = drive->hwif->rq; int error; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); switch (uptodate) { case 0: @@ -121,7 +121,7 @@ static void ide_floppy_callback(ide_drive_t *drive, int dsc) struct ide_atapi_pc *pc = drive->pc; int uptodate = pc->error ? 0 : 1; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); if (floppy->failed_pc == pc) floppy->failed_pc = NULL; @@ -140,11 +140,11 @@ static void ide_floppy_callback(ide_drive_t *drive, int dsc) (u16)get_unaligned((u16 *)&buf[16]) : 0x10000; if (floppy->failed_pc) - ide_debug_log(IDE_DBG_PC, "pc = %x, ", + ide_debug_log(IDE_DBG_PC, "pc = %x", floppy->failed_pc->c[0]); ide_debug_log(IDE_DBG_SENSE, "sense key = %x, asc = %x," - "ascq = %x\n", floppy->sense_key, + "ascq = %x", floppy->sense_key, floppy->asc, floppy->ascq); } else printk(KERN_ERR PFX "Error in REQUEST SENSE itself - " @@ -193,7 +193,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, return ide_stopped; } - ide_debug_log(IDE_DBG_FUNC, "%s: Retry #%d\n", __func__, pc->retries); + ide_debug_log(IDE_DBG_FUNC, "retry #%d", pc->retries); pc->retries++; @@ -242,8 +242,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive, int blocks = rq->nr_sectors / floppy->bs_factor; int cmd = rq_data_dir(rq); - ide_debug_log(IDE_DBG_FUNC, "%s: block: %d, blocks: %d\n", __func__, - block, blocks); + ide_debug_log(IDE_DBG_FUNC, "block: %d, blocks: %d", block, blocks); ide_init_pc(pc); pc->c[0] = cmd == READ ? GPCMD_READ_10 : GPCMD_WRITE_10; @@ -287,15 +286,10 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, ide_hwif_t *hwif = drive->hwif; struct ide_atapi_pc *pc; - ide_debug_log(IDE_DBG_FUNC, "%s: dev: %s, cmd: 0x%x, cmd_type: %x, " - "errors: %d\n", - __func__, rq->rq_disk ? rq->rq_disk->disk_name : "?", - rq->cmd[0], rq->cmd_type, rq->errors); - - ide_debug_log(IDE_DBG_FUNC, "%s: sector: %ld, nr_sectors: %ld, " - "current_nr_sectors: %d\n", - __func__, (long)rq->sector, rq->nr_sectors, - rq->current_nr_sectors); + if (drive->debug_mask & IDE_DBG_RQ) + blk_dump_rq_flags(rq, (rq->rq_disk + ? rq->rq_disk->disk_name + : "dev?")); if (rq->errors >= ERROR_MAX) { if (floppy->failed_pc) @@ -438,8 +432,9 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) length = be16_to_cpup((__be16 *)&pc.buf[desc_start + 6]); ide_debug_log(IDE_DBG_PROBE, "Descriptor %d: %dkB, %d blocks, " - "%d sector size\n", - i, blocks * length / 1024, blocks, length); + "%d sector size", + i, blocks * length / 1024, + blocks, length); if (i) continue; @@ -495,8 +490,8 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) "in drive\n", drive->name); break; } - ide_debug_log(IDE_DBG_PROBE, "Descriptor 0 Code: %d\n", - pc.buf[desc_start + 4] & 0x03); + ide_debug_log(IDE_DBG_PROBE, "Descriptor 0 Code: %d", + pc.buf[desc_start + 4] & 0x03); } /* Clik! 
disk does not support get_flexible_disk_page */ diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index 047109419902..c51a35093ae2 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -182,7 +182,7 @@ static int ide_gd_open(struct block_device *bdev, fmode_t mode) drive = idkp->drive; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); idkp->openers++; @@ -232,7 +232,7 @@ static int ide_gd_release(struct gendisk *disk, fmode_t mode) struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); ide_drive_t *drive = idkp->drive; - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); if (idkp->openers == 1) drive->disk_ops->flush(drive); diff --git a/drivers/ide/ide-gd.h b/drivers/ide/ide-gd.h index b604bdd318a1..70b43765327d 100644 --- a/drivers/ide/ide-gd.h +++ b/drivers/ide/ide-gd.h @@ -8,7 +8,7 @@ #define IDE_GD_DEBUG_LOG 0 #if IDE_GD_DEBUG_LOG -#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, args) +#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, ## args) #else #define ide_debug_log(lvl, fmt, args...) do {} while (0) #endif diff --git a/include/linux/ide.h b/include/linux/ide.h index 854eba8b2ba3..9a386501b9c1 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1045,10 +1045,11 @@ enum { }; /* DRV_NAME has to be defined in the driver before using the macro below */ -#define __ide_debug_log(lvl, fmt, args...) \ -{ \ - if (unlikely(drive->debug_mask & lvl)) \ - printk(KERN_INFO DRV_NAME ": " fmt, ## args); \ +#define __ide_debug_log(lvl, fmt, args...) \ +{ \ + if (unlikely(drive->debug_mask & lvl)) \ + printk(KERN_INFO DRV_NAME ": %s: " fmt "\n", \ + __func__, ## args); \ } /* -- cgit v1.2.3-71-gd317 From d15a613ba01ff2b209ecad7a38ccbb23b3b06c92 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:21 +0100 Subject: ide: remove IDE_ARCH_INTR (v2) This micro-optimization is not worth it. Just always check for existence of ->ack_intr method in ide_intr() and ide_timer_expiry(). v2: Fix brown-paper-bag bug spotted by David D. Kilzer. Cc: Geert Uytterhoeven Cc: Michael Schmitz Cc: "David D. Kilzer" Signed-off-by: Bartlomiej Zolnierkiewicz --- arch/m68k/include/asm/ide.h | 3 --- drivers/ide/ide-io.c | 5 +++-- include/linux/ide.h | 5 ----- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/m68k/include/asm/ide.h b/arch/m68k/include/asm/ide.h index b996a3c8cff5..9f95f06eebe2 100644 --- a/arch/m68k/include/asm/ide.h +++ b/arch/m68k/include/asm/ide.h @@ -123,8 +123,5 @@ ide_get_lock(irq_handler_t handler, void *data) } #endif /* CONFIG_BLK_DEV_FALCON_IDE */ -#define IDE_ARCH_ACK_INTR -#define ide_ack_intr(hwif) ((hwif)->ack_intr ? 
(hwif)->ack_intr(hwif) : 1) - #endif /* __KERNEL__ */ #endif /* _M68K_IDE_H */ diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 2e92497b58aa..e85060164203 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -739,7 +739,8 @@ void ide_timer_expiry (unsigned long data) } else if (drive_is_ready(drive)) { if (drive->waiting_for_dma) hwif->dma_ops->dma_lost_irq(drive); - (void)ide_ack_intr(hwif); + if (hwif->ack_intr) + hwif->ack_intr(hwif); printk(KERN_WARNING "%s: lost interrupt\n", drive->name); startstop = handler(drive); @@ -854,7 +855,7 @@ irqreturn_t ide_intr (int irq, void *dev_id) spin_lock_irqsave(&hwif->lock, flags); - if (!ide_ack_intr(hwif)) + if (hwif->ack_intr && hwif->ack_intr(hwif) == 0) goto out; handler = hwif->handler; diff --git a/include/linux/ide.h b/include/linux/ide.h index 9a386501b9c1..cda80b5779a4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -202,11 +202,6 @@ static inline void ide_std_init_ports(hw_regs_t *hw, #define MAX_HWIFS 10 -/* Currently only m68k, apus and m8xx need it */ -#ifndef IDE_ARCH_ACK_INTR -# define ide_ack_intr(hwif) (1) -#endif - /* Currently only Atari needs it */ #ifndef IDE_ARCH_LOCK # define ide_release_lock() do {} while (0) -- cgit v1.2.3-71-gd317 From e354c1d8033d97a97a38a1b2cffa1bc285b92ad4 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:22 +0100 Subject: ide: remove IDE_ARCH_LOCK (v2) * Add ->{get,release}_lock methods to struct ide_port_info and struct ide_host. * Convert core IDE code, m68k IDE code and falconide support to use ->{get,release}_lock methods instead of ide_{get,release}_lock(). * Remove IDE_ARCH_LOCK. v2: * Build fix from Geert updating ide_{get,release}_lock() callers in falconide.c. Cc: Geert Uytterhoeven Cc: Michael Schmitz Signed-off-by: Bartlomiej Zolnierkiewicz --- arch/m68k/include/asm/ide.h | 36 ------------------------------------ drivers/ide/falconide.c | 29 +++++++++++++++++++++++++---- drivers/ide/ide-io.c | 8 ++++---- drivers/ide/ide-probe.c | 2 ++ include/linux/ide.h | 17 +++++++++++------ 5 files changed, 42 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/arch/m68k/include/asm/ide.h b/arch/m68k/include/asm/ide.h index 9f95f06eebe2..4e6e77759f88 100644 --- a/arch/m68k/include/asm/ide.h +++ b/arch/m68k/include/asm/ide.h @@ -36,11 +36,6 @@ #include #include -#ifdef CONFIG_ATARI -#include -#include -#endif - #ifdef CONFIG_MAC #include #endif @@ -92,36 +87,5 @@ #define outsw_swapw(port, addr, n) raw_outsw_swapw((u16 *)port, addr, n) #endif -#ifdef CONFIG_BLK_DEV_FALCON_IDE -#define IDE_ARCH_LOCK - -extern int falconide_intr_lock; - -static __inline__ void ide_release_lock (void) -{ - if (MACH_IS_ATARI) { - if (falconide_intr_lock == 0) { - printk("ide_release_lock: bug\n"); - return; - } - falconide_intr_lock = 0; - stdma_release(); - } -} - -static __inline__ void -ide_get_lock(irq_handler_t handler, void *data) -{ - if (MACH_IS_ATARI) { - if (falconide_intr_lock == 0) { - if (in_interrupt() > 0) - panic( "Falcon IDE hasn't ST-DMA lock in interrupt" ); - stdma_lock(handler, data); - falconide_intr_lock = 1; - } - } -} -#endif /* CONFIG_BLK_DEV_FALCON_IDE */ - #endif /* __KERNEL__ */ #endif /* _M68K_IDE_H */ diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index a638e952d67a..d4d7ff1a3516 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -40,8 +40,27 @@ * which is shared between several drivers. 
*/ -int falconide_intr_lock; -EXPORT_SYMBOL(falconide_intr_lock); +static int falconide_intr_lock; + +static void falconide_release_lock(void) +{ + if (falconide_intr_lock == 0) { + printk(KERN_ERR "%s: bug\n", __func__); + return; + } + falconide_intr_lock = 0; + stdma_release(); +} + +static void falconide_get_lock(irq_handler_t handler, void *data) +{ + if (falconide_intr_lock == 0) { + if (in_interrupt() > 0) + panic("Falcon IDE hasn't ST-DMA lock in interrupt"); + stdma_lock(handler, data); + falconide_intr_lock = 1; + } +} static void falconide_input_data(ide_drive_t *drive, struct request *rq, void *buf, unsigned int len) @@ -81,6 +100,8 @@ static const struct ide_tp_ops falconide_tp_ops = { }; static const struct ide_port_info falconide_port_info = { + .get_lock = falconide_get_lock, + .release_lock = falconide_release_lock, .tp_ops = &falconide_tp_ops, .host_flags = IDE_HFLAG_NO_DMA | IDE_HFLAG_SERIALIZE, }; @@ -132,9 +153,9 @@ static int __init falconide_init(void) goto err; } - ide_get_lock(NULL, NULL); + falconide_get_lock(NULL, NULL); rc = ide_host_register(host, &falconide_port_info, hws); - ide_release_lock(); + falconide_release_lock(); if (rc) goto err_free; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index e85060164203..030b0ea1a1e1 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -501,8 +501,8 @@ static inline int ide_lock_host(struct ide_host *host, ide_hwif_t *hwif) if (host->host_flags & IDE_HFLAG_SERIALIZE) { rc = test_and_set_bit_lock(IDE_HOST_BUSY, &host->host_busy); if (rc == 0) { - /* for atari only */ - ide_get_lock(ide_intr, hwif); + if (host->get_lock) + host->get_lock(ide_intr, hwif); } } return rc; @@ -511,8 +511,8 @@ static inline int ide_lock_host(struct ide_host *host, ide_hwif_t *hwif) static inline void ide_unlock_host(struct ide_host *host) { if (host->host_flags & IDE_HFLAG_SERIALIZE) { - /* for atari only */ - ide_release_lock(); + if (host->release_lock) + host->release_lock(); clear_bit_unlock(IDE_HOST_BUSY, &host->host_busy); } } diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index b0510b033d78..a3edbb5d0452 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1325,6 +1325,8 @@ struct ide_host *ide_host_alloc(const struct ide_port_info *d, hw_regs_t **hws) if (d) { host->init_chipset = d->init_chipset; + host->get_lock = d->get_lock; + host->release_lock = d->release_lock; host->host_flags = d->host_flags; } diff --git a/include/linux/ide.h b/include/linux/ide.h index cda80b5779a4..b7d95f09cc2e 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -202,12 +202,6 @@ static inline void ide_std_init_ports(hw_regs_t *hw, #define MAX_HWIFS 10 -/* Currently only Atari needs it */ -#ifndef IDE_ARCH_LOCK -# define ide_release_lock() do {} while (0) -# define ide_get_lock(hdlr, data) do {} while (0) -#endif /* IDE_ARCH_LOCK */ - /* * Now for the data we need to maintain per-drive: ide_drive_t */ @@ -845,8 +839,14 @@ struct ide_host { ide_hwif_t *ports[MAX_HOST_PORTS + 1]; unsigned int n_ports; struct device *dev[2]; + int (*init_chipset)(struct pci_dev *); + + void (*get_lock)(irq_handler_t, void *); + void (*release_lock)(void); + irq_handler_t irq_handler; + unsigned long host_flags; void *host_priv; ide_hwif_t *cur_port; /* for hosts requiring serialization */ @@ -1358,7 +1358,12 @@ enum { struct ide_port_info { char *name; + int (*init_chipset)(struct pci_dev *); + + void (*get_lock)(irq_handler_t, void *); + void (*release_lock)(void); + void (*init_iops)(ide_hwif_t *); void 
(*init_hwif)(ide_hwif_t *); int (*init_dma)(ide_hwif_t *, -- cgit v1.2.3-71-gd317 From 15a453a955f89f6545118770c669b52e925368bd Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:26 +0100 Subject: ide: include only when needed Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io-std.c | 7 +++++++ drivers/ide/tx4938ide.c | 2 ++ drivers/ide/tx4939ide.c | 2 ++ include/linux/ide.h | 7 ------- 4 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 45b43dd49cda..9a8da6744d93 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -2,6 +2,13 @@ #include #include +#if defined(CONFIG_ARM) || defined(CONFIG_M68K) || defined(CONFIG_MIPS) || \ + defined(CONFIG_PARISC) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) +#include +#else +#include +#endif + /* * Conventional PIO operations for ATA devices */ diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index d9095345f7ca..efade9e898b3 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -15,6 +15,8 @@ #include #include #include + +#include #include static void tx4938ide_tune_ebusc(unsigned int ebus_ch, diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 40b0812a045c..fb037a5e70b3 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -18,6 +18,8 @@ #include #include +#include + #define MODNAME "tx4939ide" /* ATA Shadow Registers (8-bit except for Data which is 16-bit) */ diff --git a/include/linux/ide.h b/include/linux/ide.h index b7d95f09cc2e..47878719c56b 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -193,13 +193,6 @@ static inline void ide_std_init_ports(hw_regs_t *hw, hw->io_ports.ctl_addr = ctl_addr; } -#if defined(CONFIG_ARM) || defined(CONFIG_M68K) || defined(CONFIG_MIPS) || \ - defined(CONFIG_PARISC) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) -#include -#else -#include -#endif - #define MAX_HWIFS 10 /* -- cgit v1.2.3-71-gd317 From 69197ad70ef6b854988299c1377864f9755cd03d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:26 +0100 Subject: ide: fix memleak on failure in probe_for_drive() Always free drive->id in probe_for_drive() if device is not present. While at it: - remove dead IDE_DFLAG_DEAD flag - remove superfluous IDE_DFLAG_PRESENT check Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 22 +++++++++------------- include/linux/ide.h | 2 -- 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index a3edbb5d0452..4b00945cf7d1 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -228,15 +228,9 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) m[ATA_ID_PROD_LEN - 1] = '\0'; if (strstr(m, "E X A B Y T E N E S T")) - goto err_misc; - - drive->dev_flags |= IDE_DFLAG_PRESENT; - drive->dev_flags &= ~IDE_DFLAG_DEAD; - - return; -err_misc: - kfree(id); - drive->dev_flags &= ~IDE_DFLAG_PRESENT; + drive->dev_flags &= ~IDE_DFLAG_PRESENT; + else + drive->dev_flags |= IDE_DFLAG_PRESENT; } /** @@ -505,8 +499,7 @@ static u8 probe_for_drive(ide_drive_t *drive) } if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0) - /* drive not found */ - return 0; + goto out_free; /* identification failed? 
*/ if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) { @@ -530,7 +523,7 @@ static u8 probe_for_drive(ide_drive_t *drive) } if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0) - return 0; + goto out_free; /* The drive wasn't being helpful. Add generic info only */ if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) { @@ -543,7 +536,10 @@ static u8 probe_for_drive(ide_drive_t *drive) ide_disk_init_mult_count(drive); } - return !!(drive->dev_flags & IDE_DFLAG_PRESENT); + return 1; +out_free: + kfree(drive->id); + return 0; } static void hwif_release_dev(struct device *dev) diff --git a/include/linux/ide.h b/include/linux/ide.h index 47878719c56b..ab8ee4f32f52 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -494,8 +494,6 @@ enum { IDE_DFLAG_NICE1 = (1 << 5), /* device is physically present */ IDE_DFLAG_PRESENT = (1 << 6), - /* device ejected hint */ - IDE_DFLAG_DEAD = (1 << 7), /* id read from device (synthetic if not set) */ IDE_DFLAG_ID_READ = (1 << 8), IDE_DFLAG_NOPROBE = (1 << 9), -- cgit v1.2.3-71-gd317 From 255115fb35f80735c21a1cbe9809e9795a3af26e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:27 +0100 Subject: ide: allow host drivers to specify IRQ flags * Add ->irq_flags field to struct ide_port_info and struct ide_host. * Update host drivers and IDE PCI code to use ->irq_flags field. * Convert init_irq() and ide_intr() to use host->irq_flags. This fixes handling of shared IRQs for non-PCI hosts and removes ugly ifdeffery from core IDE code. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/buddha.c | 1 + drivers/ide/delkin_cb.c | 1 + drivers/ide/falconide.c | 1 + drivers/ide/gayle.c | 1 + drivers/ide/ide-cs.c | 1 + drivers/ide/ide-io.c | 15 ++++----------- drivers/ide/ide-probe.c | 13 +++---------- drivers/ide/macide.c | 1 + drivers/ide/q40ide.c | 1 + drivers/ide/scc_pata.c | 1 + drivers/ide/setup-pci.c | 4 ++++ drivers/ide/sgiioc4.c | 1 + include/linux/ide.h | 6 ++++++ 13 files changed, 26 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/buddha.c b/drivers/ide/buddha.c index 606c3320fa58..d028f8864bc1 100644 --- a/drivers/ide/buddha.c +++ b/drivers/ide/buddha.c @@ -145,6 +145,7 @@ static void __init buddha_setup_ports(hw_regs_t *hw, unsigned long base, static const struct ide_port_info buddha_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, + .irq_flags = IRQF_SHARED, }; /* diff --git a/drivers/ide/delkin_cb.c b/drivers/ide/delkin_cb.c index bacb1194c9c9..f153b95619bb 100644 --- a/drivers/ide/delkin_cb.c +++ b/drivers/ide/delkin_cb.c @@ -66,6 +66,7 @@ static const struct ide_port_info delkin_cb_port_info = { .port_ops = &delkin_cb_port_ops, .host_flags = IDE_HFLAG_IO_32BIT | IDE_HFLAG_UNMASK_IRQS | IDE_HFLAG_NO_DMA, + .irq_flags = IRQF_SHARED, .init_chipset = delkin_cb_init_chipset, }; diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index bb0c86e976e4..6085feb1fae8 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -105,6 +105,7 @@ static const struct ide_port_info falconide_port_info = { .tp_ops = &falconide_tp_ops, .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA, + .irq_flags = IRQF_SHARED, }; static void __init falconide_setup_ports(hw_regs_t *hw) diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c index dce01765adbc..dc778251cb05 100644 --- a/drivers/ide/gayle.c +++ b/drivers/ide/gayle.c @@ -120,6 +120,7 @@ static void __init gayle_setup_ports(hw_regs_t *hw, unsigned long base, static const struct ide_port_info 
gayle_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA, + .irq_flags = IRQF_SHARED, }; /* diff --git a/drivers/ide/ide-cs.c b/drivers/ide/ide-cs.c index f50210fe558f..9e47f3529d55 100644 --- a/drivers/ide/ide-cs.c +++ b/drivers/ide/ide-cs.c @@ -154,6 +154,7 @@ static const struct ide_port_ops idecs_port_ops = { static const struct ide_port_info idecs_port_info = { .port_ops = &idecs_port_ops, .host_flags = IDE_HFLAG_NO_DMA, + .irq_flags = IRQF_SHARED, }; static struct ide_host *idecs_register(unsigned long io, unsigned long ctl, diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 030b0ea1a1e1..7007c48e27ae 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -841,6 +841,7 @@ static void unexpected_intr(int irq, ide_hwif_t *hwif) irqreturn_t ide_intr (int irq, void *dev_id) { ide_hwif_t *hwif = (ide_hwif_t *)dev_id; + struct ide_host *host = hwif->host; ide_drive_t *uninitialized_var(drive); ide_handler_t *handler; unsigned long flags; @@ -848,8 +849,8 @@ irqreturn_t ide_intr (int irq, void *dev_id) irqreturn_t irq_ret = IRQ_NONE; int plug_device = 0; - if (hwif->host->host_flags & IDE_HFLAG_SERIALIZE) { - if (hwif != hwif->host->cur_port) + if (host->host_flags & IDE_HFLAG_SERIALIZE) { + if (hwif != host->cur_port) goto out_early; } @@ -872,27 +873,19 @@ irqreturn_t ide_intr (int irq, void *dev_id) * * For PCI, we cannot tell the difference, * so in that case we just ignore it and hope it goes away. - * - * FIXME: unexpected_intr should be hwif-> then we can - * remove all the ifdef PCI crap */ -#ifdef CONFIG_BLK_DEV_IDEPCI - if (hwif->chipset != ide_pci) -#endif /* CONFIG_BLK_DEV_IDEPCI */ - { + if ((host->irq_flags & IRQF_SHARED) == 0) { /* * Probably not a shared PCI interrupt, * so we can safely try to do something about it: */ unexpected_intr(irq, hwif); -#ifdef CONFIG_BLK_DEV_IDEPCI } else { /* * Whack the status register, just in case * we have a leftover pending IRQ. 
*/ (void)hwif->tp_ops->read_status(hwif); -#endif /* CONFIG_BLK_DEV_IDEPCI */ } goto out; } diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 4b00945cf7d1..f3a56595eb75 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -837,20 +837,13 @@ static int ide_port_setup_devices(ide_hwif_t *hwif) static int init_irq (ide_hwif_t *hwif) { struct ide_io_ports *io_ports = &hwif->io_ports; - irq_handler_t irq_handler; - int sa = 0; + struct ide_host *host = hwif->host; + irq_handler_t irq_handler = host->irq_handler; + int sa = host->irq_flags; - irq_handler = hwif->host->irq_handler; if (irq_handler == NULL) irq_handler = ide_intr; -#if defined(__mc68000__) - sa = IRQF_SHARED; -#endif /* __mc68000__ */ - - if (hwif->chipset == ide_pci) - sa = IRQF_SHARED; - if (io_ports->ctl_addr) hwif->tp_ops->set_irq(hwif, 1); diff --git a/drivers/ide/macide.c b/drivers/ide/macide.c index 56112ee9f5a8..4b1718e83283 100644 --- a/drivers/ide/macide.c +++ b/drivers/ide/macide.c @@ -82,6 +82,7 @@ static void __init macide_setup_ports(hw_regs_t *hw, unsigned long base, static const struct ide_port_info macide_port_info = { .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, + .irq_flags = IRQF_SHARED, }; static const char *mac_ide_name[] = diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index ebd576df2d84..32f669d656a6 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -112,6 +112,7 @@ static const struct ide_tp_ops q40ide_tp_ops = { static const struct ide_port_info q40ide_port_info = { .tp_ops = &q40ide_tp_ops, .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA, + .irq_flags = IRQF_SHARED, }; /* diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 540bc842f3ad..ae965da5dde0 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -891,6 +891,7 @@ static const struct ide_port_info scc_chipset __devinitdata = { .port_ops = &scc_port_ops, .dma_ops = &scc_dma_ops, .host_flags = IDE_HFLAG_SINGLE, + .irq_flags = IRQF_SHARED, .pio_mask = ATA_PIO4, }; diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 24bc884826fc..a19dbccd7617 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -558,6 +558,8 @@ int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d, host->host_priv = priv; + host->irq_flags = IRQF_SHARED; + pci_set_drvdata(dev, host); ret = do_ide_setup_pci_device(dev, d, 1); @@ -606,6 +608,8 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, host->host_priv = priv; + host->irq_flags = IRQF_SHARED; + pci_set_drvdata(pdev[0], host); pci_set_drvdata(pdev[1], host); diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index fdb9d7037694..1cffe70f385d 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -557,6 +557,7 @@ static const struct ide_port_info sgiioc4_port_info __devinitconst = { .port_ops = &sgiioc4_port_ops, .dma_ops = &sgiioc4_dma_ops, .host_flags = IDE_HFLAG_MMIO, + .irq_flags = IRQF_SHARED, .mwdma_mask = ATA_MWDMA2_ONLY, }; diff --git a/include/linux/ide.h b/include/linux/ide.h index ab8ee4f32f52..901d323c7bbe 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -839,6 +839,9 @@ struct ide_host { irq_handler_t irq_handler; unsigned long host_flags; + + int irq_flags; + void *host_priv; ide_hwif_t *cur_port; /* for hosts requiring serialization */ @@ -1371,6 +1374,9 @@ struct ide_port_info { u16 max_sectors; /* if < than the default one */ u32 host_flags; + + int irq_flags; + u8 pio_mask; u8 swdma_mask; u8 mwdma_mask; -- cgit v1.2.3-71-gd317 
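A minimal sketch of the ->irq_flags conversion above from the driver side (the driver name is invented; the flags, fields and headers are the ones touched by the patch): a non-PCI host now just declares how its interrupt line behaves in its ide_port_info, ide-probe.c is expected to carry that value into struct ide_host much like it already does for ->get_lock/->release_lock, and init_irq() reads host->irq_flags (the "sa" variable in the hunk above) instead of the removed __mc68000__ and ide_pci special cases.

	#include <linux/interrupt.h>
	#include <linux/ide.h>

	/* hypothetical non-PCI host driver whose IRQ line may be shared */
	static const struct ide_port_info example_port_info = {
		.name		= "example-ide",
		.host_flags	= IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA,
		.irq_flags	= IRQF_SHARED,
	};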
From 2787cb8ae5c68a6945eb82ccf96b5f2c4f238323 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:28 +0100 Subject: ide: add IDE_HFLAG_DTC2278 host flag Add IDE_HFLAG_DTC2278 host flag and use it instead of ide_dtc2278 chipset type in ide_init_port(). There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/dtc2278.c | 3 ++- drivers/ide/ide-probe.c | 2 +- include/linux/ide.h | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/dtc2278.c b/drivers/ide/dtc2278.c index 689b2e493413..c6b138122981 100644 --- a/drivers/ide/dtc2278.c +++ b/drivers/ide/dtc2278.c @@ -100,7 +100,8 @@ static const struct ide_port_info dtc2278_port_info __initdata = { IDE_HFLAG_IO_32BIT | /* disallow ->io_32bit changes */ IDE_HFLAG_NO_IO_32BIT | - IDE_HFLAG_NO_DMA, + IDE_HFLAG_NO_DMA | + IDE_HFLAG_DTC2278, .pio_mask = ATA_PIO4, }; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 75b79cc96339..62270f474681 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1061,7 +1061,7 @@ static void ide_init_port(ide_hwif_t *hwif, unsigned int port, hwif->tp_ops = d->tp_ops; /* ->set_pio_mode for DTC2278 is currently limited to port 0 */ - if (hwif->chipset != ide_dtc2278 || hwif->channel == 0) + if ((hwif->host_flags & IDE_HFLAG_DTC2278) == 0 || hwif->channel == 0) hwif->port_ops = d->port_ops; hwif->swdma_mask = d->swdma_mask; diff --git a/include/linux/ide.h b/include/linux/ide.h index 901d323c7bbe..732a05f3de08 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1325,6 +1325,8 @@ enum { IDE_HFLAG_ERROR_STOPS_FIFO = (1 << 19), /* serialize ports */ IDE_HFLAG_SERIALIZE = (1 << 20), + /* host is DTC2278 */ + IDE_HFLAG_DTC2278 = (1 << 21), /* host is TRM290 */ IDE_HFLAG_TRM290 = (1 << 23), /* use 32-bit I/O ops */ -- cgit v1.2.3-71-gd317 From c094ea0774d6881598da430ea0916a597162f8df Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:28 +0100 Subject: ide: add IDE_HFLAG_4DRIVES host flag Add IDE_HFLAG_4DRIVES host flag and use it instead of ide_4drives chipset type in ide_init_port(). There should be no functional changes caused by this patch. 
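The pattern mirrors the IDE_HFLAG_DTC2278 change above; as a rough sketch of how such a flag is consumed (the helper function is invented for illustration, while the flag and the host_flags field come from the diff below), core code tests a bit on the port instead of comparing hwif->chipset against a dedicated chipset type:

	#include <linux/ide.h>

	static bool example_port_is_4drives(ide_hwif_t *hwif)
	{
		/* quirk advertised via ->host_flags, not via a chipset type */
		return (hwif->host_flags & IDE_HFLAG_4DRIVES) != 0;
	}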
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-4drives.c | 3 ++- drivers/ide/ide-probe.c | 4 ++-- include/linux/ide.h | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-4drives.c b/drivers/ide/ide-4drives.c index 9e85b1ec9607..78aca75a2c48 100644 --- a/drivers/ide/ide-4drives.c +++ b/drivers/ide/ide-4drives.c @@ -23,7 +23,8 @@ static const struct ide_port_ops ide_4drives_port_ops = { static const struct ide_port_info ide_4drives_port_info = { .port_ops = &ide_4drives_port_ops, - .host_flags = IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA, + .host_flags = IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA | + IDE_HFLAG_4DRIVES, }; static int __init ide_4drives_init(void) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 62270f474681..335322f40c5a 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1381,8 +1381,8 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d, if (ide_probe_port(hwif) == 0) hwif->present = 1; - if (hwif->chipset != ide_4drives || !hwif->mate || - !hwif->mate->present) { + if ((hwif->host_flags & IDE_HFLAG_4DRIVES) == 0 || + hwif->mate == NULL || hwif->mate->present == 0) { if (ide_register_port(hwif)) { ide_disable_port(hwif); continue; diff --git a/include/linux/ide.h b/include/linux/ide.h index 732a05f3de08..3d4ba8e95d4a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1327,6 +1327,8 @@ enum { IDE_HFLAG_SERIALIZE = (1 << 20), /* host is DTC2278 */ IDE_HFLAG_DTC2278 = (1 << 21), + /* 4 devices on a single set of I/O ports */ + IDE_HFLAG_4DRIVES = (1 << 22), /* host is TRM290 */ IDE_HFLAG_TRM290 = (1 << 23), /* use 32-bit I/O ops */ -- cgit v1.2.3-71-gd317 From 19710d25d50ae0be05eebe4231ed8918b1092d82 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:28 +0100 Subject: ide: add "flagged" taskfile flags to struct ide_taskfile (v2) * Add ->ftf_flags field to struct ide_taskfile and convert flags for TASKFILE ioctl to use it. * Rename "flagged" taskfile flags: - IDE_TFLAG_FLAGGED -> IDE_FTFLAG_FLAGGED - IDE_TFLAG_FLAGGED_SET_IN_FLAGS -> IDE_FTFLAG_SET_IN_FLAGS - IDE_TFLAG_{OUT,IN}_DATA -> IDE_FTFLAG_{OUT,IN}_DATA v2: * Remember to fully update ide-h8300.c, scc_pata.c and tx493{8,9}ide.c (thanks to Stephen Rothwell for noticing). There should be no functional changes caused by this patch. Cc: Stephen Rothwell Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 6 ++--- drivers/ide/ide-h8300.c | 6 ++--- drivers/ide/ide-io-std.c | 6 ++--- drivers/ide/ide-taskfile.c | 12 ++++----- drivers/ide/ns87415.c | 2 +- drivers/ide/scc_pata.c | 6 ++--- drivers/ide/tx4938ide.c | 6 ++--- drivers/ide/tx4939ide.c | 6 ++--- include/linux/ide.h | 66 ++++++++++++++++++++++++---------------------- 9 files changed, 60 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 4bd46d2d6b64..6eabf9e31290 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -192,10 +192,10 @@ static void at91_ide_tf_load(ide_drive_t *drive, ide_task_t *task) struct ide_taskfile *tf = &task->tf; u8 HIHI = (task->tf_flags & IDE_TFLAG_LBA48) ? 
0xE0 : 0xEF; - if (task->tf_flags & IDE_TFLAG_FLAGGED) + if (task->tf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->tf_flags & IDE_TFLAG_OUT_DATA) { + if (task->tf_flags & IDE_FTFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; at91_ide_output_data(drive, NULL, &data, 2); @@ -233,7 +233,7 @@ static void at91_ide_tf_read(ide_drive_t *drive, ide_task_t *task) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &task->tf; - if (task->tf_flags & IDE_TFLAG_IN_DATA) { + if (task->tf_flags & IDE_FTFLAG_IN_DATA) { u16 data; at91_ide_input_data(drive, NULL, &data, 2); diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 9270d3255ee0..11e937485bff 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -51,10 +51,10 @@ static void h8300_tf_load(ide_drive_t *drive, ide_task_t *task) struct ide_taskfile *tf = &task->tf; u8 HIHI = (task->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; - if (task->tf_flags & IDE_TFLAG_FLAGGED) + if (task->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->tf_flags & IDE_TFLAG_OUT_DATA) + if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) mm_outw((tf->hob_data << 8) | tf->data, io_ports->data_addr); if (task->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) @@ -90,7 +90,7 @@ static void h8300_tf_read(ide_drive_t *drive, ide_task_t *task) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &task->tf; - if (task->tf_flags & IDE_TFLAG_IN_DATA) { + if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data = mm_inw(io_ports->data_addr); tf->data = data & 0xff; diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 9a8da6744d93..cad59f0bfbce 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -96,10 +96,10 @@ void ide_tf_load(ide_drive_t *drive, ide_task_t *task) else tf_outb = ide_outb; - if (task->tf_flags & IDE_TFLAG_FLAGGED) + if (task->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->tf_flags & IDE_TFLAG_OUT_DATA) { + if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; if (mmio) @@ -153,7 +153,7 @@ void ide_tf_read(ide_drive_t *drive, ide_task_t *task) tf_inb = ide_inb; } - if (task->tf_flags & IDE_TFLAG_IN_DATA) { + if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data; if (mmio) diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 2461245820b7..02240a3ee0fb 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -73,8 +73,8 @@ ide_startstop_t do_rw_taskfile (ide_drive_t *drive, ide_task_t *task) } } - if (task->tf_flags & IDE_TFLAG_FLAGGED) - task->tf_flags |= IDE_TFLAG_FLAGGED_SET_IN_FLAGS; + if (task->ftf_flags & IDE_FTFLAG_FLAGGED) + task->ftf_flags |= IDE_FTFLAG_SET_IN_FLAGS; memcpy(&hwif->task, task, sizeof(*task)); @@ -551,10 +551,10 @@ int ide_taskfile_ioctl (ide_drive_t *drive, unsigned int cmd, unsigned long arg) args.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_IN_HOB); if (req_task->out_flags.all) { - args.tf_flags |= IDE_TFLAG_FLAGGED; + args.ftf_flags |= IDE_FTFLAG_FLAGGED; if (req_task->out_flags.b.data) - args.tf_flags |= IDE_TFLAG_OUT_DATA; + args.ftf_flags |= IDE_FTFLAG_OUT_DATA; if (req_task->out_flags.b.nsector_hob) args.tf_flags |= IDE_TFLAG_OUT_HOB_NSECT; @@ -582,7 +582,7 @@ int ide_taskfile_ioctl (ide_drive_t *drive, unsigned int cmd, unsigned long arg) } if (req_task->in_flags.b.data) - args.tf_flags |= IDE_TFLAG_IN_DATA; + args.ftf_flags |= IDE_FTFLAG_IN_DATA; switch(req_task->data_phase) { case TASKFILE_MULTI_OUT: @@ -647,7 +647,7 @@ int ide_taskfile_ioctl 
(ide_drive_t *drive, unsigned int cmd, unsigned long arg) memcpy(req_task->hob_ports, &args.tf_array[0], HDIO_DRIVE_HOB_HDR_SIZE - 2); memcpy(req_task->io_ports, &args.tf_array[6], HDIO_DRIVE_TASK_HDR_SIZE); - if ((args.tf_flags & IDE_TFLAG_FLAGGED_SET_IN_FLAGS) && + if ((args.ftf_flags & IDE_FTFLAG_SET_IN_FLAGS) && req_task->in_flags.all == 0) { req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS; if (drive->dev_flags & IDE_DFLAG_LBA48) diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index ea48a3ee8063..159eb39c7932 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -66,7 +66,7 @@ static void superio_tf_read(ide_drive_t *drive, ide_task_t *task) struct ide_io_ports *io_ports = &drive->hwif->io_ports; struct ide_taskfile *tf = &task->tf; - if (task->tf_flags & IDE_TFLAG_IN_DATA) { + if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data = inw(io_ports->data_addr); tf->data = data & 0xff; diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index ae965da5dde0..82929c725d82 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -672,10 +672,10 @@ static void scc_tf_load(ide_drive_t *drive, ide_task_t *task) struct ide_taskfile *tf = &task->tf; u8 HIHI = (task->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; - if (task->tf_flags & IDE_TFLAG_FLAGGED) + if (task->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->tf_flags & IDE_TFLAG_OUT_DATA) + if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) out_be32((void *)io_ports->data_addr, (tf->hob_data << 8) | tf->data); @@ -711,7 +711,7 @@ static void scc_tf_read(ide_drive_t *drive, ide_task_t *task) struct ide_io_ports *io_ports = &drive->hwif->io_ports; struct ide_taskfile *tf = &task->tf; - if (task->tf_flags & IDE_TFLAG_IN_DATA) { + if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data = (u16)in_be32((void *)io_ports->data_addr); tf->data = data & 0xff; diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index efade9e898b3..6b51e0c58af7 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -89,10 +89,10 @@ static void tx4938ide_tf_load(ide_drive_t *drive, ide_task_t *task) struct ide_taskfile *tf = &task->tf; u8 HIHI = task->tf_flags & IDE_TFLAG_LBA48 ? 0xE0 : 0xEF; - if (task->tf_flags & IDE_TFLAG_FLAGGED) + if (task->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->tf_flags & IDE_TFLAG_OUT_DATA) { + if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; /* no endian swap */ @@ -132,7 +132,7 @@ static void tx4938ide_tf_read(ide_drive_t *drive, ide_task_t *task) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &task->tf; - if (task->tf_flags & IDE_TFLAG_IN_DATA) { + if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data; /* no endian swap */ diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index fb037a5e70b3..f0033eb2e885 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -474,10 +474,10 @@ static void tx4939ide_tf_load(ide_drive_t *drive, ide_task_t *task) struct ide_taskfile *tf = &task->tf; u8 HIHI = task->tf_flags & IDE_TFLAG_LBA48 ? 
0xE0 : 0xEF; - if (task->tf_flags & IDE_TFLAG_FLAGGED) + if (task->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->tf_flags & IDE_TFLAG_OUT_DATA) { + if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; /* no endian swap */ @@ -519,7 +519,7 @@ static void tx4939ide_tf_read(ide_drive_t *drive, ide_task_t *task) struct ide_io_ports *io_ports = &hwif->io_ports; struct ide_taskfile *tf = &task->tf; - if (task->tf_flags & IDE_TFLAG_IN_DATA) { + if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data; /* no endian swap */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 3d4ba8e95d4a..675d4363ece4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -234,56 +234,52 @@ typedef enum { enum { IDE_TFLAG_LBA48 = (1 << 0), - IDE_TFLAG_FLAGGED = (1 << 2), - IDE_TFLAG_OUT_DATA = (1 << 3), - IDE_TFLAG_OUT_HOB_FEATURE = (1 << 4), - IDE_TFLAG_OUT_HOB_NSECT = (1 << 5), - IDE_TFLAG_OUT_HOB_LBAL = (1 << 6), - IDE_TFLAG_OUT_HOB_LBAM = (1 << 7), - IDE_TFLAG_OUT_HOB_LBAH = (1 << 8), + IDE_TFLAG_OUT_HOB_FEATURE = (1 << 1), + IDE_TFLAG_OUT_HOB_NSECT = (1 << 2), + IDE_TFLAG_OUT_HOB_LBAL = (1 << 3), + IDE_TFLAG_OUT_HOB_LBAM = (1 << 4), + IDE_TFLAG_OUT_HOB_LBAH = (1 << 5), IDE_TFLAG_OUT_HOB = IDE_TFLAG_OUT_HOB_FEATURE | IDE_TFLAG_OUT_HOB_NSECT | IDE_TFLAG_OUT_HOB_LBAL | IDE_TFLAG_OUT_HOB_LBAM | IDE_TFLAG_OUT_HOB_LBAH, - IDE_TFLAG_OUT_FEATURE = (1 << 9), - IDE_TFLAG_OUT_NSECT = (1 << 10), - IDE_TFLAG_OUT_LBAL = (1 << 11), - IDE_TFLAG_OUT_LBAM = (1 << 12), - IDE_TFLAG_OUT_LBAH = (1 << 13), + IDE_TFLAG_OUT_FEATURE = (1 << 6), + IDE_TFLAG_OUT_NSECT = (1 << 7), + IDE_TFLAG_OUT_LBAL = (1 << 8), + IDE_TFLAG_OUT_LBAM = (1 << 9), + IDE_TFLAG_OUT_LBAH = (1 << 10), IDE_TFLAG_OUT_TF = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL | IDE_TFLAG_OUT_LBAM | IDE_TFLAG_OUT_LBAH, - IDE_TFLAG_OUT_DEVICE = (1 << 14), - IDE_TFLAG_WRITE = (1 << 15), - IDE_TFLAG_FLAGGED_SET_IN_FLAGS = (1 << 16), - IDE_TFLAG_IN_DATA = (1 << 17), - IDE_TFLAG_CUSTOM_HANDLER = (1 << 18), - IDE_TFLAG_DMA_PIO_FALLBACK = (1 << 19), - IDE_TFLAG_IN_HOB_FEATURE = (1 << 20), - IDE_TFLAG_IN_HOB_NSECT = (1 << 21), - IDE_TFLAG_IN_HOB_LBAL = (1 << 22), - IDE_TFLAG_IN_HOB_LBAM = (1 << 23), - IDE_TFLAG_IN_HOB_LBAH = (1 << 24), + IDE_TFLAG_OUT_DEVICE = (1 << 11), + IDE_TFLAG_WRITE = (1 << 12), + IDE_TFLAG_CUSTOM_HANDLER = (1 << 13), + IDE_TFLAG_DMA_PIO_FALLBACK = (1 << 14), + IDE_TFLAG_IN_HOB_FEATURE = (1 << 15), + IDE_TFLAG_IN_HOB_NSECT = (1 << 16), + IDE_TFLAG_IN_HOB_LBAL = (1 << 17), + IDE_TFLAG_IN_HOB_LBAM = (1 << 18), + IDE_TFLAG_IN_HOB_LBAH = (1 << 19), IDE_TFLAG_IN_HOB_LBA = IDE_TFLAG_IN_HOB_LBAL | IDE_TFLAG_IN_HOB_LBAM | IDE_TFLAG_IN_HOB_LBAH, IDE_TFLAG_IN_HOB = IDE_TFLAG_IN_HOB_FEATURE | IDE_TFLAG_IN_HOB_NSECT | IDE_TFLAG_IN_HOB_LBA, - IDE_TFLAG_IN_FEATURE = (1 << 1), - IDE_TFLAG_IN_NSECT = (1 << 25), - IDE_TFLAG_IN_LBAL = (1 << 26), - IDE_TFLAG_IN_LBAM = (1 << 27), - IDE_TFLAG_IN_LBAH = (1 << 28), + IDE_TFLAG_IN_FEATURE = (1 << 20), + IDE_TFLAG_IN_NSECT = (1 << 21), + IDE_TFLAG_IN_LBAL = (1 << 22), + IDE_TFLAG_IN_LBAM = (1 << 23), + IDE_TFLAG_IN_LBAH = (1 << 24), IDE_TFLAG_IN_LBA = IDE_TFLAG_IN_LBAL | IDE_TFLAG_IN_LBAM | IDE_TFLAG_IN_LBAH, IDE_TFLAG_IN_TF = IDE_TFLAG_IN_NSECT | IDE_TFLAG_IN_LBA, - IDE_TFLAG_IN_DEVICE = (1 << 29), + IDE_TFLAG_IN_DEVICE = (1 << 25), IDE_TFLAG_HOB = IDE_TFLAG_OUT_HOB | IDE_TFLAG_IN_HOB, IDE_TFLAG_TF = IDE_TFLAG_OUT_TF | @@ -291,9 +287,16 @@ enum { IDE_TFLAG_DEVICE = IDE_TFLAG_OUT_DEVICE | IDE_TFLAG_IN_DEVICE, /* force 16-bit I/O operations */ - 
IDE_TFLAG_IO_16BIT = (1 << 30), + IDE_TFLAG_IO_16BIT = (1 << 26), /* ide_task_t was allocated using kmalloc() */ - IDE_TFLAG_DYN = (1 << 31), + IDE_TFLAG_DYN = (1 << 27), +}; + +enum { + IDE_FTFLAG_FLAGGED = (1 << 0), + IDE_FTFLAG_SET_IN_FLAGS = (1 << 1), + IDE_FTFLAG_OUT_DATA = (1 << 2), + IDE_FTFLAG_IN_DATA = (1 << 3), }; struct ide_taskfile { @@ -330,6 +333,7 @@ typedef struct ide_task_s { struct ide_taskfile tf; u8 tf_array[14]; }; + u8 ftf_flags; /* for TASKFILE ioctl */ u32 tf_flags; int data_phase; struct request *rq; /* copy of request */ -- cgit v1.2.3-71-gd317 From 3616b6536a74ff1c56029c17cbb3575c69c0a574 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:29 +0100 Subject: ide: complete power step in ide_complete_pm_request() * Complete power step in ide_complete_pm_request(). * Rename ide_complete_pm_request() to ide_complete_pm_rq(). There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 8 ++------ drivers/ide/ide-pm.c | 9 +++++++-- include/linux/ide.h | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 7007c48e27ae..49b098de367c 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -178,11 +178,7 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err) kfree(task); } } else if (blk_pm_request(rq)) { - struct request_pm_state *pm = rq->data; - - ide_complete_power_step(drive, rq); - if (pm->pm_step == IDE_PM_COMPLETED) - ide_complete_pm_request(drive, rq); + ide_complete_pm_rq(drive, rq); return; } @@ -438,7 +434,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) startstop = ide_start_power_step(drive, rq); if (startstop == ide_stopped && pm->pm_step == IDE_PM_COMPLETED) - ide_complete_pm_request(drive, rq); + ide_complete_pm_rq(drive, rq); return startstop; } else if (!rq->rq_disk && blk_special_request(rq)) /* diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 60538d9c84ee..74c7c2bbe0fd 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -169,18 +169,23 @@ out_do_tf: } /** - * ide_complete_pm_request - end the current Power Management request + * ide_complete_pm_rq - end the current Power Management request * @drive: target drive * @rq: request * * This function cleans up the current PM request and stops the queue * if necessary. */ -void ide_complete_pm_request(ide_drive_t *drive, struct request *rq) +void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; + struct request_pm_state *pm = rq->data; unsigned long flags; + ide_complete_power_step(drive, rq); + if (pm->pm_step != IDE_PM_COMPLETED) + return; + #ifdef DEBUG_PM printk("%s: completing PM request, %s\n", drive->name, blk_pm_suspend_request(rq) ? 
"suspend" : "resume"); diff --git a/include/linux/ide.h b/include/linux/ide.h index 675d4363ece4..c5ac19e43fc0 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1081,7 +1081,7 @@ int generic_ide_resume(struct device *); void ide_complete_power_step(ide_drive_t *, struct request *); ide_startstop_t ide_start_power_step(ide_drive_t *, struct request *); -void ide_complete_pm_request(ide_drive_t *, struct request *); +void ide_complete_pm_rq(ide_drive_t *, struct request *); void ide_check_pm_state(ide_drive_t *, struct request *); /* -- cgit v1.2.3-71-gd317 From a09485df9cda49fbde2766c86eb18a9cae585162 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:31 +0100 Subject: ide: move request type specific code from ide_end_drive_cmd() to callers (v3) * Move request type specific code from ide_end_drive_cmd() to callers. * Remove stale ide_end_drive_cmd() documentation and drop no longer used 'stat' argument. Then rename the function to ide_complete_rq(). v2: * Fix handling of blk_pm_request() requests in task_no_data_intr(). v3: * Some ide_no_data_taskfile() users (HPA code and HDIO_DRIVE_* ioctls handlers) access original command later so we need to update it in ide_complete_task(). There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-eh.c | 11 ++++++++++- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 40 ++++++++++------------------------------ drivers/ide/ide-tape.c | 2 +- drivers/ide/ide-taskfile.c | 26 +++++++++++++++++++------- include/linux/ide.h | 3 ++- 6 files changed, 43 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 1231b5e486f2..e2c04886616f 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -124,7 +124,16 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) /* retry only "normal" I/O: */ if (!blk_fs_request(rq)) { rq->errors = 1; - ide_end_drive_cmd(drive, stat, err); + if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { + ide_task_t *task = rq->special; + + if (task) + ide_complete_task(drive, task, stat, err); + } else if (blk_pm_request(rq)) { + ide_complete_pm_rq(drive, rq); + return ide_stopped; + } + ide_complete_rq(drive, err); return ide_stopped; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index d1a79e8e0d69..39e7fda37c5f 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -101,7 +101,7 @@ static int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs) } rq->errors = error; /* fixme: need to move this local also */ - ide_end_drive_cmd(drive, 0, 0); + ide_complete_rq(drive, 0); return 0; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index b8426e9c0906..4a97a97e56c4 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -144,49 +144,28 @@ int ide_end_dequeued_request(ide_drive_t *drive, struct request *rq, } EXPORT_SYMBOL_GPL(ide_end_dequeued_request); -static void ide_complete_task(ide_drive_t *drive, ide_task_t *task, - u8 stat, u8 err) +void ide_complete_task(ide_drive_t *drive, ide_task_t *task, u8 stat, u8 err) { struct ide_taskfile *tf = &task->tf; + struct request *rq = task->rq; tf->error = err; tf->status = stat; drive->hwif->tp_ops->tf_read(drive, task); + if (rq && rq->cmd_type == REQ_TYPE_ATA_TASKFILE) + memcpy(rq->special, task, sizeof(*task)); + if (task->tf_flags & IDE_TFLAG_DYN) kfree(task); } -/** - * ide_end_drive_cmd - end an explicit drive command - * 
@drive: command - * @stat: status bits - * @err: error bits - * - * Clean up after success/failure of an explicit drive command. - * These get thrown onto the queue so they are synchronized with - * real I/O operations on the drive. - * - * In LBA48 mode we have to read the register set twice to get - * all the extra information out. - */ - -void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err) +void ide_complete_rq(ide_drive_t *drive, u8 err) { ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; - if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { - ide_task_t *task = (ide_task_t *)rq->special; - - if (task) - ide_complete_task(drive, task, stat, err); - } else if (blk_pm_request(rq)) { - ide_complete_pm_rq(drive, rq); - return; - } - hwif->rq = NULL; rq->errors = err; @@ -195,7 +174,7 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err) blk_rq_bytes(rq)))) BUG(); } -EXPORT_SYMBOL(ide_end_drive_cmd); +EXPORT_SYMBOL(ide_complete_rq); void ide_kill_rq(ide_drive_t *drive, struct request *rq) { @@ -358,8 +337,9 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, #ifdef DEBUG printk("%s: DRIVE_CMD (null)\n", drive->name); #endif - ide_end_drive_cmd(drive, hwif->tp_ops->read_status(hwif), - ide_read_error(drive)); + (void)hwif->tp_ops->read_status(hwif); + + ide_complete_rq(drive, ide_read_error(drive)); return ide_stopped; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 4e6181c7bbda..de2d926e66c2 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -502,7 +502,7 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects) spin_lock_irqsave(&tape->lock, flags); - ide_end_drive_cmd(drive, 0, 0); + ide_complete_rq(drive, 0); spin_unlock_irqrestore(&tape->lock, flags); return 0; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 02240a3ee0fb..297cf6f4c723 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -147,12 +147,9 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) } } return ide_error(drive, "task_no_data_intr", stat); - /* calls ide_end_drive_cmd */ } - if (!custom) - ide_end_drive_cmd(drive, stat, ide_read_error(drive)); - else if (tf->command == ATA_CMD_IDLEIMMEDIATE) { + if (custom && tf->command == ATA_CMD_IDLEIMMEDIATE) { hwif->tp_ops->tf_read(drive, task); if (tf->lbal != 0xc4) { printk(KERN_ERR "%s: head unload failed!\n", @@ -160,10 +157,22 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) ide_tf_dump(drive->name, tf); } else drive->dev_flags |= IDE_DFLAG_PARKED; - ide_end_drive_cmd(drive, stat, ide_read_error(drive)); - } else if (tf->command == ATA_CMD_SET_MULTI) + } else if (custom && tf->command == ATA_CMD_SET_MULTI) drive->mult_count = drive->mult_req; + if (custom == 0 || tf->command == ATA_CMD_IDLEIMMEDIATE) { + struct request *rq = hwif->rq; + u8 err = ide_read_error(drive); + + if (blk_pm_request(rq)) + ide_complete_pm_rq(drive, rq); + else { + if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) + ide_complete_task(drive, task, stat, err); + ide_complete_rq(drive, err); + } + } + return ide_stopped; } @@ -321,9 +330,12 @@ static ide_startstop_t task_error(ide_drive_t *drive, struct request *rq, void task_end_request(ide_drive_t *drive, struct request *rq, u8 stat) { if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { + ide_task_t *task = rq->special; u8 err = ide_read_error(drive); - ide_end_drive_cmd(drive, stat, err); + if (task) + ide_complete_task(drive, task, stat, err); + ide_complete_rq(drive, err); return; } diff 
--git a/include/linux/ide.h b/include/linux/ide.h index c5ac19e43fc0..83bed2f4378a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1158,7 +1158,8 @@ extern ide_startstop_t ide_do_reset (ide_drive_t *); extern int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, int arg); -extern void ide_end_drive_cmd(ide_drive_t *, u8, u8); +void ide_complete_task(ide_drive_t *, ide_task_t *, u8, u8); +void ide_complete_rq(ide_drive_t *, u8); void ide_tf_dump(const char *, struct ide_taskfile *); -- cgit v1.2.3-71-gd317 From e3d9a73a83d98fc466dabdcfe4f4e7e4419e3f8e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:32 +0100 Subject: ide: remove ->data_phase field from ide_hwif_t * Always use hwif->task->data_phase and remove ->data_phase field from ide_hwif_t. * Remove superfluous REQ_TYPE_ATA_TASKFILE check from ide_pio_datablock() while at it. There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-disk.c | 3 --- drivers/ide/ide-io.c | 5 +---- drivers/ide/ide-park.c | 2 +- drivers/ide/ide-taskfile.c | 18 ++++++++---------- include/linux/ide.h | 3 --- 5 files changed, 10 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 806760d24cef..0f196e5fcff3 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -160,8 +160,6 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, task.tf_flags |= IDE_TFLAG_WRITE; ide_tf_set_cmd(drive, &task, dma); - if (!dma) - hwif->data_phase = task.data_phase; task.rq = rq; rc = do_rw_taskfile(drive, &task); @@ -170,7 +168,6 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, /* fallback to PIO */ task.tf_flags |= IDE_TFLAG_DMA_PIO_FALLBACK; ide_tf_set_cmd(drive, &task, 0); - hwif->data_phase = task.data_phase; ide_init_sg_cmd(drive, rq); rc = do_rw_taskfile(drive, &task); } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 45fc18ff73cb..38076169b893 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -310,13 +310,10 @@ EXPORT_SYMBOL_GPL(ide_init_sg_cmd); static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, struct request *rq) { - ide_hwif_t *hwif = drive->hwif; ide_task_t *task = rq->special; if (task) { - hwif->data_phase = task->data_phase; - - switch (hwif->data_phase) { + switch (task->data_phase) { case TASKFILE_MULTI_OUT: case TASKFILE_OUT: case TASKFILE_MULTI_IN: diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index f30e52152fcb..cddc7c778760 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -81,7 +81,7 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq) task.tf_flags |= IDE_TFLAG_TF | IDE_TFLAG_DEVICE; task.rq = rq; - drive->hwif->data_phase = task.data_phase = TASKFILE_NO_DATA; + task.data_phase = TASKFILE_NO_DATA; return do_rw_taskfile(drive, &task); } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 297cf6f4c723..7237e1547b1f 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -265,21 +265,18 @@ static void ide_pio_multi(ide_drive_t *drive, struct request *rq, static void ide_pio_datablock(ide_drive_t *drive, struct request *rq, unsigned int write) { + ide_task_t *task = &drive->hwif->task; u8 saved_io_32bit = drive->io_32bit; if (rq->bio) /* fs request */ rq->errors = 0; - if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { - ide_task_t *task = 
rq->special; - - if (task->tf_flags & IDE_TFLAG_IO_16BIT) - drive->io_32bit = 0; - } + if (task->tf_flags & IDE_TFLAG_IO_16BIT) + drive->io_32bit = 0; touch_softlockup_watchdog(); - switch (drive->hwif->data_phase) { + switch (task->data_phase) { case TASKFILE_MULTI_IN: case TASKFILE_MULTI_OUT: ide_pio_multi(drive, rq, write); @@ -297,9 +294,10 @@ static ide_startstop_t task_error(ide_drive_t *drive, struct request *rq, { if (rq->bio) { ide_hwif_t *hwif = drive->hwif; + ide_task_t *task = &hwif->task; int sectors = hwif->nsect - hwif->nleft; - switch (hwif->data_phase) { + switch (task->data_phase) { case TASKFILE_IN: if (hwif->nleft) break; @@ -431,14 +429,14 @@ static ide_startstop_t task_out_intr (ide_drive_t *drive) static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, struct request *rq) { - ide_hwif_t *hwif = drive->hwif; + ide_task_t *task = &drive->hwif->task; ide_startstop_t startstop; if (ide_wait_stat(&startstop, drive, ATA_DRQ, drive->bad_wstat, WAIT_DRQ)) { printk(KERN_ERR "%s: no DRQ after issuing %sWRITE%s\n", drive->name, - hwif->data_phase == TASKFILE_MULTI_OUT ? "MULT" : "", + task->data_phase == TASKFILE_MULTI_OUT ? "MULT" : "", (drive->dev_flags & IDE_DFLAG_LBA48) ? "_EXT" : ""); return startstop; } diff --git a/include/linux/ide.h b/include/linux/ide.h index 83bed2f4378a..146b07a9b649 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -768,9 +768,6 @@ typedef struct hwif_s { int orig_sg_nents; int sg_dma_direction; /* dma transfer direction */ - /* data phase of the active command (currently only valid for PIO/DMA) */ - int data_phase; - struct ide_task_s task; /* current command */ unsigned int nsect; -- cgit v1.2.3-71-gd317 From c7016e95a556098db6dc4d9096a6189be9e18266 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:33 +0100 Subject: ide: remove no longer needed PC_FLAG_TIMEDOUT packet command flag There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 5 ----- include/linux/ide.h | 2 -- 2 files changed, 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 6adc5b4a4406..f44474b0adae 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -336,11 +336,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) timeout = (drive->media == ide_floppy) ? WAIT_FLOPPY_CMD : WAIT_TAPE_CMD; - if (pc->flags & PC_FLAG_TIMEDOUT) { - drive->pc_callback(drive, 0); - return ide_stopped; - } - /* Clear the interrupt */ stat = tp_ops->read_status(hwif); diff --git a/include/linux/ide.h b/include/linux/ide.h index 146b07a9b649..8b0ea43884c0 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -350,8 +350,6 @@ enum { PC_FLAG_DMA_IN_PROGRESS = (1 << 4), PC_FLAG_DMA_ERROR = (1 << 5), PC_FLAG_WRITING = (1 << 6), - /* command timed out */ - PC_FLAG_TIMEDOUT = (1 << 7), }; /* -- cgit v1.2.3-71-gd317 From 5e2040fd0a97888952b37243b5868872bbe0f6ac Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:34 +0100 Subject: ide: move ->failed_pc to ide_drive_t Move ->failed_pc from struct ide_{disk,tape}_obj to ide_drive_t. There should be no functional changes caused by this patch. 
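
As a rough standalone sketch of the refactoring described above (not part of the patch itself; the demo_* names are hypothetical stand-ins for ide_drive_t, struct ide_atapi_pc and the per-driver objects), hoisting the duplicated field into the shared drive structure lets generic code clear the retry state without knowing which driver owns the device:

#include <stdio.h>

struct demo_pc { int opcode; };          /* stands in for struct ide_atapi_pc */

struct demo_drive {                      /* stands in for ide_drive_t */
	const char *name;
	struct demo_pc *failed_pc;       /* previously duplicated in the floppy and tape objects */
	void *driver_data;               /* driver-private object stays separate */
};

/* generic error path: clear the retry state for any media type */
static void demo_drop_failed_pc(struct demo_drive *drive)
{
	drive->failed_pc = NULL;
	printf("%s: dropped failed packet command\n", drive->name);
}

int main(void)
{
	struct demo_pc pc = { .opcode = 0x03 };
	struct demo_drive drive = { .name = "hdc", .failed_pc = &pc, .driver_data = NULL };

	demo_drop_failed_pc(&drive);
	return drive.failed_pc != NULL;  /* 0 on success */
}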
Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 21 ++++++++++----------- drivers/ide/ide-gd.h | 2 -- drivers/ide/ide-tape.c | 29 ++++++++++------------------- include/linux/ide.h | 3 +++ 4 files changed, 23 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 6dda0fba017b..f9ad4b3021ee 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -70,7 +70,6 @@ */ static int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs) { - struct ide_disk_obj *floppy = drive->driver_data; struct request *rq = drive->hwif->rq; int error; @@ -90,7 +89,7 @@ static int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs) } if (error) - floppy->failed_pc = NULL; + drive->failed_pc = NULL; if (!blk_special_request(rq)) { /* our real local end request function */ @@ -121,8 +120,8 @@ static void ide_floppy_callback(ide_drive_t *drive, int dsc) ide_debug_log(IDE_DBG_FUNC, "enter"); - if (floppy->failed_pc == pc) - floppy->failed_pc = NULL; + if (drive->failed_pc == pc) + drive->failed_pc = NULL; if (pc->c[0] == GPCMD_READ_10 || pc->c[0] == GPCMD_WRITE_10 || (pc->rq && blk_pc_request(pc->rq))) @@ -137,9 +136,9 @@ static void ide_floppy_callback(ide_drive_t *drive, int dsc) floppy->progress_indication = buf[15] & 0x80 ? (u16)get_unaligned((u16 *)&buf[16]) : 0x10000; - if (floppy->failed_pc) + if (drive->failed_pc) ide_debug_log(IDE_DBG_PC, "pc = %x", - floppy->failed_pc->c[0]); + drive->failed_pc->c[0]); ide_debug_log(IDE_DBG_SENSE, "sense key = %x, asc = %x," "ascq = %x", floppy->sense_key, @@ -173,9 +172,9 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, { struct ide_disk_obj *floppy = drive->driver_data; - if (floppy->failed_pc == NULL && + if (drive->failed_pc == NULL && pc->c[0] != GPCMD_REQUEST_SENSE) - floppy->failed_pc = pc; + drive->failed_pc = pc; /* Set the current packet command */ drive->pc = pc; @@ -186,7 +185,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, /* Giving up */ pc->error = IDEFLOPPY_ERROR_GENERAL; - floppy->failed_pc = NULL; + drive->failed_pc = NULL; drive->pc_callback(drive, 0); return ide_stopped; } @@ -290,8 +289,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, : "dev?")); if (rq->errors >= ERROR_MAX) { - if (floppy->failed_pc) - ide_floppy_report_error(floppy, floppy->failed_pc); + if (drive->failed_pc) + ide_floppy_report_error(floppy, drive->failed_pc); else printk(KERN_ERR PFX "%s: I/O error\n", drive->name); diff --git a/drivers/ide/ide-gd.h b/drivers/ide/ide-gd.h index 70b43765327d..55970772bd04 100644 --- a/drivers/ide/ide-gd.h +++ b/drivers/ide/ide-gd.h @@ -20,8 +20,6 @@ struct ide_disk_obj { struct device dev; unsigned int openers; /* protected by BKL for now */ - /* Last failed packet command */ - struct ide_atapi_pc *failed_pc; /* used for blk_{fs,pc}_request() requests */ struct ide_atapi_pc queued_pc; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 72b4350bfeb6..d6555984ee88 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -171,14 +171,6 @@ typedef struct ide_tape_obj { struct gendisk *disk; struct device dev; - /* - * failed_pc points to the last failed packet command, or contains - * NULL if we do not need to retry any packet command. This is - * required since an additional packet command is needed before the - * retry, to get detailed information on what went wrong. 
- */ - /* Last failed packet command */ - struct ide_atapi_pc *failed_pc; /* used by REQ_IDETAPE_{READ,WRITE} requests */ struct ide_atapi_pc queued_pc; @@ -397,7 +389,7 @@ static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc) static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) { idetape_tape_t *tape = drive->driver_data; - struct ide_atapi_pc *pc = tape->failed_pc; + struct ide_atapi_pc *pc = drive->failed_pc; tape->sense_key = sense[2] & 0xF; tape->asc = sense[12]; @@ -477,7 +469,6 @@ static void ide_tape_kfree_buffer(idetape_tape_t *tape) static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects) { struct request *rq = drive->hwif->rq; - idetape_tape_t *tape = drive->driver_data; int error; debug_log(DBG_PROCS, "Enter %s\n", __func__); @@ -489,7 +480,7 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects) } rq->errors = error; if (error) - tape->failed_pc = NULL; + drive->failed_pc = NULL; if (!blk_special_request(rq)) { ide_end_request(drive, uptodate, nr_sects); @@ -514,8 +505,8 @@ static void ide_tape_callback(ide_drive_t *drive, int dsc) if (dsc) ide_tape_handle_dsc(drive); - if (tape->failed_pc == pc) - tape->failed_pc = NULL; + if (drive->failed_pc == pc) + drive->failed_pc = NULL; if (pc->c[0] == REQUEST_SENSE) { if (uptodate) @@ -653,8 +644,8 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, "Two request sense in serial were issued\n"); } - if (tape->failed_pc == NULL && pc->c[0] != REQUEST_SENSE) - tape->failed_pc = pc; + if (drive->failed_pc == NULL && pc->c[0] != REQUEST_SENSE) + drive->failed_pc = pc; /* Set the current packet command */ drive->pc = pc; @@ -680,7 +671,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, /* Giving up */ pc->error = IDETAPE_ERROR_GENERAL; } - tape->failed_pc = NULL; + drive->failed_pc = NULL; drive->pc_callback(drive, 0); return ide_stopped; } @@ -740,7 +731,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) pc->error = 0; } else { pc->error = IDETAPE_ERROR_GENERAL; - tape->failed_pc = NULL; + drive->failed_pc = NULL; } drive->pc_callback(drive, 0); return ide_stopped; @@ -799,8 +790,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, } /* Retry a failed packet command */ - if (tape->failed_pc && drive->pc->c[0] == REQUEST_SENSE) { - pc = tape->failed_pc; + if (drive->failed_pc && drive->pc->c[0] == REQUEST_SENSE) { + pc = drive->failed_pc; goto out; } diff --git a/include/linux/ide.h b/include/linux/ide.h index 8b0ea43884c0..4a904681f3e4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -609,6 +609,9 @@ struct ide_drive_s { /* current packet command */ struct ide_atapi_pc *pc; + /* last failed packet command */ + struct ide_atapi_pc *failed_pc; + /* callback for packet commands */ void (*pc_callback)(struct ide_drive_s *, int); -- cgit v1.2.3-71-gd317 From c152cc1a90f9680cefa74d9ff9ce36038081ba72 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:34 +0100 Subject: ide: use ->end_request only for private device driver requests * Move IDE{FLOPPY,TAPE}_ERROR_* defines to and rename them to IDE_DRV_ERROR_*. * Handle ->end_request special cases for floppy/tape media in ide_kill_rq(). * Call ->end_request only for private device driver requests. There should be no functional changes caused by this patch. 
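
As a rough standalone sketch (not part of the patch; the demo_* names are hypothetical simplifications), the dispatch that ide_kill_rq() performs after this change works as follows: tape requests are tagged with the shared IDE_DRV_ERROR_GENERAL code, and only private driver requests are handed to the driver's own ->end_request hook, everything else going through the generic path:

#include <stdbool.h>
#include <stdio.h>

enum { DEMO_DRV_ERROR_GENERAL = 101 };   /* mirrors IDE_DRV_ERROR_GENERAL */

enum demo_media { DEMO_DISK, DEMO_FLOPPY, DEMO_TAPE };

struct demo_rq {
	bool private_req;                /* blk_special_request(rq) && rq->rq_disk */
	int errors;
};

static void demo_kill_rq(enum demo_media media, struct demo_rq *rq)
{
	if (media == DEMO_TAPE)
		rq->errors = DEMO_DRV_ERROR_GENERAL;

	if (rq->private_req)
		printf("driver ->end_request, errors=%d\n", rq->errors);
	else
		printf("generic ide_end_request, errors=%d\n", rq->errors);
}

int main(void)
{
	struct demo_rq rq = { .private_req = true, .errors = 0 };

	demo_kill_rq(DEMO_TAPE, &rq);    /* private tape request -> errors=101 */
	return 0;
}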
Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 7 ++----- drivers/ide/ide-io.c | 7 ++++++- drivers/ide/ide-tape.c | 19 +++++++------------ drivers/ide/ide-taskfile.c | 16 +++------------- include/linux/ide.h | 7 +++++++ 5 files changed, 25 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index f9ad4b3021ee..fb235641da33 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -61,9 +61,6 @@ */ #define IDEFLOPPY_PC_DELAY (HZ/20) /* default delay for ZIP 100 (50ms) */ -/* Error code returned in rq->errors to the higher part of the driver. */ -#define IDEFLOPPY_ERROR_GENERAL 101 - /* * Used to finish servicing a request. For read/write requests, we will call * ide_end_request to pass to the next buffer. @@ -77,7 +74,7 @@ static int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs) switch (uptodate) { case 0: - error = IDEFLOPPY_ERROR_GENERAL; + error = IDE_DRV_ERROR_GENERAL; break; case 1: @@ -183,7 +180,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, if (!(pc->flags & PC_FLAG_SUPPRESS_ERROR)) ide_floppy_report_error(floppy, pc); /* Giving up */ - pc->error = IDEFLOPPY_ERROR_GENERAL; + pc->error = IDE_DRV_ERROR_GENERAL; drive->failed_pc = NULL; drive->pc_callback(drive, 0); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 38076169b893..da2f97dfa8f8 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -178,7 +178,12 @@ EXPORT_SYMBOL(ide_complete_rq); void ide_kill_rq(ide_drive_t *drive, struct request *rq) { - if (rq->rq_disk) { + drive->failed_pc = NULL; + + if (drive->media == ide_tape) + rq->errors = IDE_DRV_ERROR_GENERAL; + + if (blk_special_request(rq) && rq->rq_disk) { struct ide_driver *drv; drv = *(struct ide_driver **)rq->rq_disk->private_data; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d6555984ee88..e3b4c1c39d37 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -152,11 +152,6 @@ struct idetape_bh { #define IDETAPE_LU_RETENSION_MASK 2 #define IDETAPE_LU_EOT_MASK 4 -/* Error codes returned in rq->errors to the higher part of the driver. */ -#define IDETAPE_ERROR_GENERAL 101 -#define IDETAPE_ERROR_FILEMARK 102 -#define IDETAPE_ERROR_EOD 103 - /* Structures related to the SELECT SENSE / MODE SENSE packet commands. 
*/ #define IDETAPE_BLOCK_DESCRIPTOR 0 #define IDETAPE_CAPABILITIES_PAGE 0x2a @@ -422,19 +417,19 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) } } if (pc->c[0] == READ_6 && (sense[2] & 0x80)) { - pc->error = IDETAPE_ERROR_FILEMARK; + pc->error = IDE_DRV_ERROR_FILEMARK; pc->flags |= PC_FLAG_ABORT; } if (pc->c[0] == WRITE_6) { if ((sense[2] & 0x40) || (tape->sense_key == 0xd && tape->asc == 0x0 && tape->ascq == 0x2)) { - pc->error = IDETAPE_ERROR_EOD; + pc->error = IDE_DRV_ERROR_EOD; pc->flags |= PC_FLAG_ABORT; } } if (pc->c[0] == READ_6 || pc->c[0] == WRITE_6) { if (tape->sense_key == 8) { - pc->error = IDETAPE_ERROR_EOD; + pc->error = IDE_DRV_ERROR_EOD; pc->flags |= PC_FLAG_ABORT; } if (!(pc->flags & PC_FLAG_ABORT) && @@ -474,7 +469,7 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects) debug_log(DBG_PROCS, "Enter %s\n", __func__); switch (uptodate) { - case 0: error = IDETAPE_ERROR_GENERAL; break; + case 0: error = IDE_DRV_ERROR_GENERAL; break; case 1: error = 0; break; default: error = uptodate; } @@ -669,7 +664,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, tape->ascq); } /* Giving up */ - pc->error = IDETAPE_ERROR_GENERAL; + pc->error = IDE_DRV_ERROR_GENERAL; } drive->failed_pc = NULL; drive->pc_callback(drive, 0); @@ -730,7 +725,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) } pc->error = 0; } else { - pc->error = IDETAPE_ERROR_GENERAL; + pc->error = IDE_DRV_ERROR_GENERAL; drive->failed_pc = NULL; } drive->pc_callback(drive, 0); @@ -1210,7 +1205,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, if (tape->merge_bh) idetape_init_merge_buffer(tape); - if (errors == IDETAPE_ERROR_GENERAL) + if (errors == IDE_DRV_ERROR_GENERAL) return -EIO; return ret; } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 7237e1547b1f..f85b7f21a617 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -315,12 +315,8 @@ static ide_startstop_t task_error(ide_drive_t *drive, struct request *rq, break; } - if (sectors > 0) { - struct ide_driver *drv; - - drv = *(struct ide_driver **)rq->rq_disk->private_data; - drv->end_request(drive, 1, sectors); - } + if (sectors > 0) + ide_end_request(drive, 1, sectors); } return ide_error(drive, s, stat); } @@ -337,13 +333,7 @@ void task_end_request(ide_drive_t *drive, struct request *rq, u8 stat) return; } - if (rq->rq_disk) { - struct ide_driver *drv; - - drv = *(struct ide_driver **)rq->rq_disk->private_data;; - drv->end_request(drive, 1, rq->nr_sectors); - } else - ide_end_request(drive, 1, rq->nr_sectors); + ide_end_request(drive, 1, rq->nr_sectors); } /* diff --git a/include/linux/ide.h b/include/linux/ide.h index 4a904681f3e4..aece06a4930f 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -40,6 +40,13 @@ #define ERROR_RESET 3 /* Reset controller every 4th retry */ #define ERROR_RECAL 1 /* Recalibrate every 2nd retry */ +/* Error codes returned in rq->errors to the higher part of the driver. */ +enum { + IDE_DRV_ERROR_GENERAL = 101, + IDE_DRV_ERROR_FILEMARK = 102, + IDE_DRV_ERROR_EOD = 103, +}; + /* * Definitions for accessing IDE controller registers */ -- cgit v1.2.3-71-gd317 From 3ee38302ffc63da93eb0313053a990bb3466e275 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:36 +0100 Subject: ide: remove ->end_request method * Handle completion of private driver requests explicitly for ide_floppy and ide_tape media in ide_kill_rq(). 
* Remove no longer needed ->end_request method. There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 1 - drivers/ide/ide-disk.c | 1 - drivers/ide/ide-floppy.c | 20 -------------------- drivers/ide/ide-gd.c | 6 ------ drivers/ide/ide-io.c | 14 +++++++------- drivers/ide/ide-tape.c | 17 ----------------- include/linux/ide.h | 2 -- 7 files changed, 7 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 4528e25f2bbb..bb804ae57bc5 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1834,7 +1834,6 @@ static struct ide_driver ide_cdrom_driver = { .remove = ide_cd_remove, .version = IDECD_VERSION, .do_request = ide_cd_do_request, - .end_request = ide_end_request, #ifdef CONFIG_IDE_PROC_FS .proc_entries = ide_cd_proc_entries, .proc_devsets = ide_cd_proc_devsets, diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 0f196e5fcff3..912be155a8c1 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -734,6 +734,5 @@ const struct ide_disk_ops ide_ata_disk_ops = { .init_media = ide_disk_init_media, .set_doorlock = ide_disk_set_doorlock, .do_request = ide_do_rw_disk, - .end_request = ide_end_request, .ioctl = ide_disk_ioctl, }; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index bdd8f8e2df6d..ab870a08d62b 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -61,25 +61,6 @@ */ #define IDEFLOPPY_PC_DELAY (HZ/20) /* default delay for ZIP 100 (50ms) */ -/* - * Used to finish servicing a private request. - */ -static int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs) -{ - struct request *rq = drive->hwif->rq; - - ide_debug_log(IDE_DBG_FUNC, "enter"); - - if (uptodate == 0) - drive->failed_pc = NULL; - - rq->errors = uptodate ? 
0 : IDE_DRV_ERROR_GENERAL; - - ide_complete_rq(drive, 0); - - return 0; -} - static void idefloppy_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc) { @@ -560,6 +541,5 @@ const struct ide_disk_ops ide_atapi_disk_ops = { .init_media = ide_floppy_init_media, .set_doorlock = ide_set_media_lock, .do_request = ide_floppy_do_request, - .end_request = ide_floppy_end_request, .ioctl = ide_floppy_ioctl, }; diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index c51a35093ae2..1aebdf1a4f58 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -145,11 +145,6 @@ static ide_startstop_t ide_gd_do_request(ide_drive_t *drive, return drive->disk_ops->do_request(drive, rq, sector); } -static int ide_gd_end_request(ide_drive_t *drive, int uptodate, int nrsecs) -{ - return drive->disk_ops->end_request(drive, uptodate, nrsecs); -} - static struct ide_driver ide_gd_driver = { .gen_driver = { .owner = THIS_MODULE, @@ -162,7 +157,6 @@ static struct ide_driver ide_gd_driver = { .shutdown = ide_gd_shutdown, .version = IDE_GD_VERSION, .do_request = ide_gd_do_request, - .end_request = ide_gd_end_request, #ifdef CONFIG_IDE_PROC_FS .proc_entries = ide_disk_proc_entries, .proc_devsets = ide_disk_proc_devsets, diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index da2f97dfa8f8..6eee41beec73 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -178,17 +178,17 @@ EXPORT_SYMBOL(ide_complete_rq); void ide_kill_rq(ide_drive_t *drive, struct request *rq) { + u8 drv_req = blk_special_request(rq) && rq->rq_disk; + u8 media = drive->media; + drive->failed_pc = NULL; - if (drive->media == ide_tape) + if ((media == ide_floppy && drv_req) || media == ide_tape) rq->errors = IDE_DRV_ERROR_GENERAL; - if (blk_special_request(rq) && rq->rq_disk) { - struct ide_driver *drv; - - drv = *(struct ide_driver **)rq->rq_disk->private_data; - drv->end_request(drive, 0, 0); - } else + if ((media == ide_floppy || media == ide_tape) && drv_req) + ide_complete_rq(drive, 0); + else ide_end_request(drive, 0, 0); } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 35469f3069a2..fc61bbef3bb9 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -461,22 +461,6 @@ static void ide_tape_kfree_buffer(idetape_tape_t *tape) } } -static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects) -{ - struct request *rq = drive->hwif->rq; - - debug_log(DBG_PROCS, "Enter %s\n", __func__); - - rq->errors = uptodate ? 
0 : IDE_DRV_ERROR_GENERAL; - - if (uptodate == 0) - drive->failed_pc = NULL; - - ide_complete_rq(drive, 0); - - return 0; -} - static void ide_tape_handle_dsc(ide_drive_t *); static void ide_tape_callback(ide_drive_t *drive, int dsc) @@ -2306,7 +2290,6 @@ static struct ide_driver idetape_driver = { .remove = ide_tape_remove, .version = IDETAPE_VERSION, .do_request = idetape_do_request, - .end_request = idetape_end_request, #ifdef CONFIG_IDE_PROC_FS .proc_entries = ide_tape_proc_entries, .proc_devsets = ide_tape_proc_devsets, diff --git a/include/linux/ide.h b/include/linux/ide.h index aece06a4930f..c2cdf7750185 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -427,7 +427,6 @@ struct ide_disk_ops { int); ide_startstop_t (*do_request)(struct ide_drive_s *, struct request *, sector_t); - int (*end_request)(struct ide_drive_s *, int, int); int (*ioctl)(struct ide_drive_s *, struct block_device *, fmode_t, unsigned int, unsigned long); }; @@ -1098,7 +1097,6 @@ void ide_check_pm_state(ide_drive_t *, struct request *); struct ide_driver { const char *version; ide_startstop_t (*do_request)(ide_drive_t *, struct request *, sector_t); - int (*end_request)(ide_drive_t *, int, int); struct device_driver gen_driver; int (*probe)(ide_drive_t *); void (*remove)(ide_drive_t *); -- cgit v1.2.3-71-gd317 From 03a2faaea8f44edfe583ddf1240948019becfbe4 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:36 +0100 Subject: ide: return request status from ->pc_callback method Make ->pc_callback method return request status and then move the request completion from ->pc_callback to ide_pc_intr(). There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 12 +++++++++++- drivers/ide/ide-floppy.c | 12 ++++-------- drivers/ide/ide-tape.c | 10 ++-------- include/linux/ide.h | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index f44474b0adae..f72b5a675435 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -357,6 +357,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) /* No more interrupts */ if ((stat & ATA_DRQ) == 0) { + int uptodate; + debug_log("Packet command completed, %d bytes transferred\n", pc->xferred); @@ -395,7 +397,15 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) dsc = 1; /* Command finished - Call the callback function */ - drive->pc_callback(drive, dsc); + uptodate = drive->pc_callback(drive, dsc); + + if (uptodate == 0) + drive->failed_pc = NULL; + + if (blk_special_request(rq)) + ide_complete_rq(drive, 0); + else + ide_end_request(drive, uptodate, 0); return ide_stopped; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index ab870a08d62b..5625946739ad 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -71,7 +71,7 @@ static void idefloppy_update_buffers(ide_drive_t *drive, ide_end_request(drive, 1, 0); } -static void ide_floppy_callback(ide_drive_t *drive, int dsc) +static int ide_floppy_callback(ide_drive_t *drive, int dsc) { struct ide_disk_obj *floppy = drive->driver_data; struct ide_atapi_pc *pc = drive->pc; @@ -108,14 +108,10 @@ static void ide_floppy_callback(ide_drive_t *drive, int dsc) "Aborting request!\n"); } - if (uptodate == 0) - drive->failed_pc = NULL; - - if (blk_special_request(rq)) { + if (blk_special_request(rq)) rq->errors = uptodate ? 
0 : IDE_DRV_ERROR_GENERAL; - ide_complete_rq(drive, 0); - } else - ide_end_request(drive, uptodate, 0); + + return uptodate; } static void ide_floppy_report_error(struct ide_disk_obj *floppy, diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index fc61bbef3bb9..a42e49c6cc3f 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -463,7 +463,7 @@ static void ide_tape_kfree_buffer(idetape_tape_t *tape) static void ide_tape_handle_dsc(ide_drive_t *); -static void ide_tape_callback(ide_drive_t *drive, int dsc) +static int ide_tape_callback(ide_drive_t *drive, int dsc) { idetape_tape_t *tape = drive->driver_data; struct ide_atapi_pc *pc = drive->pc; @@ -530,13 +530,7 @@ static void ide_tape_callback(ide_drive_t *drive, int dsc) rq->errors = err; - if (uptodate == 0) - drive->failed_pc = NULL; - - if (blk_special_request(rq)) - ide_complete_rq(drive, 0); - else - ide_end_request(drive, uptodate, 0); + return uptodate; } /* diff --git a/include/linux/ide.h b/include/linux/ide.h index c2cdf7750185..9127d87cfa93 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -619,7 +619,7 @@ struct ide_drive_s { struct ide_atapi_pc *failed_pc; /* callback for packet commands */ - void (*pc_callback)(struct ide_drive_s *, int); + int (*pc_callback)(struct ide_drive_s *, int); void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *); int (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *, -- cgit v1.2.3-71-gd317 From e6830a86c260d73c6f370aa7ed17ee6c71e5ee05 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:37 +0100 Subject: ide: call ide_build_sglist() prior to ->dma_setup (v2) * Re-map sg table if needed in ide_build_sglist(). * Move ide_build_sglist() call from ->dma_setup to its users. * Un-export ide_build_sglist(). v2: * Build fix for CONFIG_BLK_DEV_IDEDMA=n (noticed by Randy Dunlap). There should be no functional changes caused by this patch. 
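
As a rough standalone sketch (not part of the patch; the demo_* names are hypothetical, where the real code uses ide_build_sglist() and the host's ->dma_setup method), the change moves the scatter/gather mapping out of the setup hook and into the callers, which fall back to PIO when the mapping yields nothing:

#include <stdbool.h>
#include <stdio.h>

/* pretend dma_map_sg() produced no entries when there is nothing to map */
static int demo_build_sglist(int nr_segments)
{
	return nr_segments > 0 ? nr_segments : 0;
}

/* host-specific programming of the DMA engine; assume it succeeds here */
static bool demo_dma_setup(void)
{
	return true;
}

/* after the patch: callers map the sglist first, then run the setup hook */
static const char *demo_issue(int nr_segments)
{
	if (demo_build_sglist(nr_segments) == 0 || !demo_dma_setup())
		return "PIO fallback";
	return "DMA";
}

int main(void)
{
	printf("4 segments -> %s\n", demo_issue(4));   /* DMA */
	printf("0 segments -> %s\n", demo_issue(0));   /* PIO fallback */
	return 0;
}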
Cc: Randy Dunlap Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/au1xxx-ide.c | 7 +------ drivers/ide/icside.c | 6 ------ drivers/ide/ide-atapi.c | 19 ++++++++++++++----- drivers/ide/ide-dma-sff.c | 4 ---- drivers/ide/ide-dma.c | 5 +++-- drivers/ide/ide-taskfile.c | 1 + drivers/ide/pmac.c | 7 +------ drivers/ide/sgiioc4.c | 10 ++-------- drivers/ide/tx4939ide.c | 4 ---- include/linux/ide.h | 2 ++ 10 files changed, 24 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 82f153810eb9..3fc3ced8192c 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -211,21 +211,16 @@ static void auide_set_dma_mode(ide_drive_t *drive, const u8 speed) #ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA static int auide_build_dmatable(ide_drive_t *drive) { - int i, iswrite, count = 0; ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; _auide_hwif *ahwif = &auide_hwif; struct scatterlist *sg; + int i = hwif->sg_nents, iswrite, count = 0; iswrite = (rq_data_dir(rq) == WRITE); /* Save for interrupt context */ ahwif->drive = drive; - hwif->sg_nents = i = ide_build_sglist(drive, rq); - - if (!i) - return 0; - /* fill the descriptors */ sg = hwif->sg_table; while (i && sg_dma_len(sg)) { diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index cf0522f937c1..78fc36f98d29 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -325,12 +325,6 @@ static int icside_dma_setup(ide_drive_t *drive) */ BUG_ON(dma_channel_active(ec->dma)); - hwif->sg_nents = ide_build_sglist(drive, rq); - if (hwif->sg_nents == 0) { - ide_map_sg(drive, rq); - return 1; - } - /* * Ensure that we have the right interrupt routed. */ diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index f72b5a675435..2b9ac2106674 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -631,18 +631,23 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive) struct ide_atapi_pc *pc; ide_hwif_t *hwif = drive->hwif; ide_expiry_t *expiry = NULL; + struct request *rq = hwif->rq; unsigned int timeout; u32 tf_flags; u16 bcount; if (dev_is_idecd(drive)) { tf_flags = IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL; - bcount = ide_cd_get_xferlen(hwif->rq); + bcount = ide_cd_get_xferlen(rq); expiry = ide_cd_expiry; timeout = ATAPI_WAIT_PC; - if (drive->dma) - drive->dma = !hwif->dma_ops->dma_setup(drive); + if (drive->dma) { + if (ide_build_sglist(drive, rq)) + drive->dma = !hwif->dma_ops->dma_setup(drive); + else + drive->dma = 0; + } } else { pc = drive->pc; @@ -661,8 +666,12 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive) } if ((pc->flags & PC_FLAG_DMA_OK) && - (drive->dev_flags & IDE_DFLAG_USING_DMA)) - drive->dma = !hwif->dma_ops->dma_setup(drive); + (drive->dev_flags & IDE_DFLAG_USING_DMA)) { + if (ide_build_sglist(drive, rq)) + drive->dma = !hwif->dma_ops->dma_setup(drive); + else + drive->dma = 0; + } if (!drive->dma) pc->flags &= ~PC_FLAG_DMA_OK; diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 123d393658af..22b3e751d19b 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -120,10 +120,6 @@ int ide_build_dmatable(ide_drive_t *drive, struct request *rq) struct scatterlist *sg; u8 is_trm290 = !!(hwif->host_flags & IDE_HFLAG_TRM290); - hwif->sg_nents = ide_build_sglist(drive, rq); - if (hwif->sg_nents == 0) - return 0; - for_each_sg(hwif->sg_table, sg, hwif->sg_nents, i) { u32 cur_addr, cur_len, xcount, bcount; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 
a878f4734f81..12c11b71402e 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -138,14 +138,15 @@ int ide_build_sglist(ide_drive_t *drive, struct request *rq) hwif->sg_dma_direction = DMA_TO_DEVICE; i = dma_map_sg(hwif->dev, sg, hwif->sg_nents, hwif->sg_dma_direction); - if (i) { + if (i == 0) + ide_map_sg(drive, rq); + else { hwif->orig_sg_nents = hwif->sg_nents; hwif->sg_nents = i; } return i; } -EXPORT_SYMBOL_GPL(ide_build_sglist); /** * ide_destroy_dmatable - clean up DMA mapping diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 15bbfc1dcd28..925fb9241893 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -103,6 +103,7 @@ ide_startstop_t do_rw_taskfile (ide_drive_t *drive, ide_task_t *task) return ide_started; default: if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || + ide_build_sglist(drive, hwif->rq) == 0 || dma_ops->dma_setup(drive)) return ide_stopped; dma_ops->dma_exec_cmd(drive, tf->command); diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 74625e821a43..904fb54668e8 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1429,10 +1429,10 @@ pmac_ide_build_dmatable(ide_drive_t *drive, struct request *rq) pmac_ide_hwif_t *pmif = (pmac_ide_hwif_t *)dev_get_drvdata(hwif->gendev.parent); struct dbdma_cmd *table; - int i, count = 0; volatile struct dbdma_regs __iomem *dma = pmif->dma_regs; struct scatterlist *sg; int wr = (rq_data_dir(rq) == WRITE); + int i = hwif->sg_nents, count = 0; /* DMA table is already aligned */ table = (struct dbdma_cmd *) pmif->dma_table_cpu; @@ -1442,11 +1442,6 @@ pmac_ide_build_dmatable(ide_drive_t *drive, struct request *rq) while (readl(&dma->status) & RUN) udelay(1); - hwif->sg_nents = i = ide_build_sglist(drive, rq); - - if (!i) - return 0; - /* Build DBDMA commands list */ sg = hwif->sg_table; while (i && sg_dma_len(sg)) { diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 1cffe70f385d..ab9433a7ad1f 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -429,15 +429,9 @@ sgiioc4_build_dma_table(ide_drive_t * drive, struct request *rq, int ddir) { ide_hwif_t *hwif = drive->hwif; unsigned int *table = hwif->dmatable_cpu; - unsigned int count = 0, i = 1; - struct scatterlist *sg; + unsigned int count = 0, i = hwif->sg_nents; + struct scatterlist *sg = hwif->sg_table; - hwif->sg_nents = i = ide_build_sglist(drive, rq); - - if (!i) - return 0; /* sglist of length Zero */ - - sg = hwif->sg_table; while (i && sg_dma_len(sg)) { dma_addr_t cur_addr; int cur_len; diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index f0033eb2e885..ee86688d8461 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -240,10 +240,6 @@ static int tx4939ide_build_dmatable(ide_drive_t *drive, struct request *rq) int i; struct scatterlist *sg; - hwif->sg_nents = ide_build_sglist(drive, rq); - if (hwif->sg_nents == 0) - return 0; - for_each_sg(hwif->sg_table, sg, hwif->sg_nents, i) { u32 cur_addr, cur_len, bcount; diff --git a/include/linux/ide.h b/include/linux/ide.h index 9127d87cfa93..2ee236d1f3ac 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1477,6 +1477,8 @@ static inline int ide_set_dma(ide_drive_t *drive) { return 1; } static inline void ide_check_dma_crc(ide_drive_t *drive) { ; } static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { return ide_stopped; } static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } +static inline int ide_build_sglist(ide_drive_t *drive, + struct request *rq) 
{ return 0; } #endif /* CONFIG_BLK_DEV_IDEDMA */ #ifdef CONFIG_BLK_DEV_IDEACPI -- cgit v1.2.3-71-gd317 From 22aa4b32a19b1f231d4ce7e9af6354b577a22a35 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:37 +0100 Subject: ide: remove ide_task_t typedef While at it: - rename struct ide_task_s to struct ide_cmd - remove stale comments from idedisk_{read_native,set}_max_address() - drop unused 'cmd' argument from ide_{cmd,task}_ioctl() - drop unused 'task' argument from tx4939ide_tf_load_fixup() - rename ide_complete_task() to ide_complete_cmd() - use consistent naming for struct ide_cmd variables There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 62 +++++++++---------- drivers/ide/ide-acpi.c | 10 +-- drivers/ide/ide-atapi.c | 42 ++++++------- drivers/ide/ide-disk.c | 121 ++++++++++++++++++------------------ drivers/ide/ide-disk_proc.c | 23 +++---- drivers/ide/ide-eh.c | 6 +- drivers/ide/ide-h8300.c | 62 +++++++++---------- drivers/ide/ide-io-std.c | 62 +++++++++---------- drivers/ide/ide-io.c | 40 ++++++------ drivers/ide/ide-ioctls.c | 44 ++++++------- drivers/ide/ide-iops.c | 30 ++++----- drivers/ide/ide-lib.c | 20 +++--- drivers/ide/ide-park.c | 17 ++--- drivers/ide/ide-pm.c | 32 +++++----- drivers/ide/ide-probe.c | 18 +++--- drivers/ide/ide-proc.c | 16 ++--- drivers/ide/ide-taskfile.c | 148 +++++++++++++++++++++++--------------------- drivers/ide/ns87415.c | 30 ++++----- drivers/ide/scc_pata.c | 62 +++++++++---------- drivers/ide/tx4938ide.c | 62 +++++++++---------- drivers/ide/tx4939ide.c | 75 +++++++++++----------- include/linux/ide.h | 26 ++++---- 22 files changed, 511 insertions(+), 497 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 6eabf9e31290..6be7d87382ab 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -185,55 +185,55 @@ static void ide_mm_outb(u8 value, unsigned long port) writeb(value, (void __iomem *) port); } -static void at91_ide_tf_load(ide_drive_t *drive, ide_task_t *task) +static void at91_ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; - u8 HIHI = (task->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; + struct ide_taskfile *tf = &cmd->tf; + u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 
0xE0 : 0xEF; - if (task->tf_flags & IDE_FTFLAG_FLAGGED) + if (cmd->tf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->tf_flags & IDE_FTFLAG_OUT_DATA) { + if (cmd->tf_flags & IDE_FTFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; at91_ide_output_data(drive, NULL, &data, 2); } - if (task->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) ide_mm_outb(tf->hob_feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) ide_mm_outb(tf->hob_nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) ide_mm_outb(tf->hob_lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) ide_mm_outb(tf->hob_lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) ide_mm_outb(tf->hob_lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_FEATURE) ide_mm_outb(tf->feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_NSECT) ide_mm_outb(tf->nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAL) ide_mm_outb(tf->lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAM) ide_mm_outb(tf->lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAH) ide_mm_outb(tf->lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) ide_mm_outb((tf->device & HIHI) | drive->select, io_ports->device_addr); } -static void at91_ide_tf_read(ide_drive_t *drive, ide_task_t *task) +static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; + struct ide_taskfile *tf = &cmd->tf; - if (task->tf_flags & IDE_FTFLAG_IN_DATA) { + if (cmd->tf_flags & IDE_FTFLAG_IN_DATA) { u16 data; at91_ide_input_data(drive, NULL, &data, 2); @@ -244,31 +244,31 @@ static void at91_ide_tf_read(ide_drive_t *drive, ide_task_t *task) /* be sure we're looking at the low order bits */ ide_mm_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = ide_mm_inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = ide_mm_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) tf->lbal = ide_mm_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) tf->lbam = ide_mm_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) tf->lbah = ide_mm_inb(io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_IN_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_IN_DEVICE) tf->device = ide_mm_inb(io_ports->device_addr); - if (task->tf_flags & IDE_TFLAG_LBA48) { + if (cmd->tf_flags & IDE_TFLAG_LBA48) { ide_mm_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = ide_mm_inb(io_ports->feature_addr); - 
if (task->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) tf->hob_nsect = ide_mm_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) tf->hob_lbal = ide_mm_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) tf->hob_lbam = ide_mm_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) tf->hob_lbah = ide_mm_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 5b704f1ea90c..12f436951bff 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -304,7 +304,7 @@ static int do_drive_set_taskfiles(ide_drive_t *drive, /* send all taskfile registers (0x1f1-0x1f7) *in*that*order* */ for (ix = 0; ix < gtf_count; ix++) { u8 *gtf = (u8 *)(gtf_address + ix * REGS_PER_GTF); - ide_task_t task; + struct ide_cmd cmd; DEBPRINT("(0x1f1-1f7): " "hex: %02x %02x %02x %02x %02x %02x %02x\n", @@ -317,11 +317,11 @@ static int do_drive_set_taskfiles(ide_drive_t *drive, } /* convert GTF to taskfile */ - memset(&task, 0, sizeof(ide_task_t)); - memcpy(&task.tf_array[7], gtf, REGS_PER_GTF); - task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + memset(&cmd, 0, sizeof(cmd)); + memcpy(&cmd.tf_array[7], gtf, REGS_PER_GTF); + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - err = ide_no_data_taskfile(drive, &task); + err = ide_no_data_taskfile(drive, &cmd); if (err) { printk(KERN_ERR "%s: ide_no_data_taskfile failed: %u\n", __func__, err); diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2b9ac2106674..92c6ef6feb57 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -302,16 +302,16 @@ EXPORT_SYMBOL_GPL(ide_cd_get_xferlen); void ide_read_bcount_and_ireason(ide_drive_t *drive, u16 *bcount, u8 *ireason) { - ide_task_t task; + struct ide_cmd cmd; - memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_IN_LBAH | IDE_TFLAG_IN_LBAM | - IDE_TFLAG_IN_NSECT; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf_flags = IDE_TFLAG_IN_LBAH | IDE_TFLAG_IN_LBAM | + IDE_TFLAG_IN_NSECT; - drive->hwif->tp_ops->tf_read(drive, &task); + drive->hwif->tp_ops->tf_read(drive, &cmd); - *bcount = (task.tf.lbah << 8) | task.tf.lbam; - *ireason = task.tf.nsect & 3; + *bcount = (cmd.tf.lbah << 8) | cmd.tf.lbam; + *ireason = cmd.tf.nsect & 3; } EXPORT_SYMBOL_GPL(ide_read_bcount_and_ireason); @@ -482,32 +482,32 @@ next_irq: static void ide_pktcmd_tf_load(ide_drive_t *drive, u32 tf_flags, u16 bcount) { ide_hwif_t *hwif = drive->hwif; - ide_task_t task; + struct ide_cmd cmd; u8 dma = drive->dma; - memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_OUT_LBAH | IDE_TFLAG_OUT_LBAM | - IDE_TFLAG_OUT_FEATURE | tf_flags; - task.tf.feature = dma; /* Use PIO/DMA */ - task.tf.lbam = bcount & 0xff; - task.tf.lbah = (bcount >> 8) & 0xff; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf_flags = IDE_TFLAG_OUT_LBAH | IDE_TFLAG_OUT_LBAM | + IDE_TFLAG_OUT_FEATURE | tf_flags; + cmd.tf.feature = dma; /* Use PIO/DMA */ + cmd.tf.lbam = bcount & 0xff; + cmd.tf.lbah = (bcount >> 8) & 0xff; - ide_tf_dump(drive->name, &task.tf); + ide_tf_dump(drive->name, &cmd.tf); hwif->tp_ops->set_irq(hwif, 1); SELECT_MASK(drive, 0); - hwif->tp_ops->tf_load(drive, &task); + hwif->tp_ops->tf_load(drive, &cmd); } static u8 ide_read_ireason(ide_drive_t *drive) { - ide_task_t task; + struct ide_cmd cmd; - memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_IN_NSECT; + memset(&cmd, 0, 
sizeof(cmd)); + cmd.tf_flags = IDE_TFLAG_IN_NSECT; - drive->hwif->tp_ops->tf_read(drive, &task); + drive->hwif->tp_ops->tf_read(drive, &cmd); - return task.tf.nsect & 3; + return cmd.tf.nsect & 3; } static u8 ide_wait_ireason(ide_drive_t *drive, u8 ireason) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 912be155a8c1..6647cb8bd910 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -62,24 +62,24 @@ static const u8 ide_data_phases[] = { TASKFILE_OUT_DMA, }; -static void ide_tf_set_cmd(ide_drive_t *drive, ide_task_t *task, u8 dma) +static void ide_tf_set_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 dma) { u8 index, lba48, write; - lba48 = (task->tf_flags & IDE_TFLAG_LBA48) ? 2 : 0; - write = (task->tf_flags & IDE_TFLAG_WRITE) ? 1 : 0; + lba48 = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 2 : 0; + write = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 0; if (dma) index = 8; else index = drive->mult_count ? 0 : 4; - task->tf.command = ide_rw_cmds[index + lba48 + write]; + cmd->tf.command = ide_rw_cmds[index + lba48 + write]; if (dma) index = 8; /* fixup index */ - task->data_phase = ide_data_phases[index / 2 + write]; + cmd->data_phase = ide_data_phases[index / 2 + write]; } /* @@ -93,8 +93,8 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, u16 nsectors = (u16)rq->nr_sectors; u8 lba48 = !!(drive->dev_flags & IDE_DFLAG_LBA48); u8 dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); - ide_task_t task; - struct ide_taskfile *tf = &task.tf; + struct ide_cmd cmd; + struct ide_taskfile *tf = &cmd.tf; ide_startstop_t rc; if ((hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && lba48 && dma) { @@ -109,8 +109,8 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, ide_map_sg(drive, rq); } - memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; if (drive->dev_flags & IDE_DFLAG_LBA) { if (lba48) { @@ -129,7 +129,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, tf->lbam = (u8)(block >> 8); tf->lbah = (u8)(block >> 16); - task.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_HOB); + cmd.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_HOB); } else { tf->nsect = nsectors & 0xff; tf->lbal = block; @@ -157,19 +157,19 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, } if (rq_data_dir(rq)) - task.tf_flags |= IDE_TFLAG_WRITE; + cmd.tf_flags |= IDE_TFLAG_WRITE; - ide_tf_set_cmd(drive, &task, dma); - task.rq = rq; + ide_tf_set_cmd(drive, &cmd, dma); + cmd.rq = rq; - rc = do_rw_taskfile(drive, &task); + rc = do_rw_taskfile(drive, &cmd); if (rc == ide_stopped && dma) { /* fallback to PIO */ - task.tf_flags |= IDE_TFLAG_DMA_PIO_FALLBACK; - ide_tf_set_cmd(drive, &task, 0); + cmd.tf_flags |= IDE_TFLAG_DMA_PIO_FALLBACK; + ide_tf_set_cmd(drive, &cmd, 0); ide_init_sg_cmd(drive, rq); - rc = do_rw_taskfile(drive, &task); + rc = do_rw_taskfile(drive, &cmd); } return rc; @@ -213,22 +213,22 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq, */ static u64 idedisk_read_native_max_address(ide_drive_t *drive, int lba48) { - ide_task_t args; - struct ide_taskfile *tf = &args.tf; + struct ide_cmd cmd; + struct ide_taskfile *tf = &cmd.tf; u64 addr = 0; - /* Create IDE/ATA command request structure */ - memset(&args, 0, sizeof(ide_task_t)); + memset(&cmd, 0, sizeof(cmd)); if (lba48) tf->command = ATA_CMD_READ_NATIVE_MAX_EXT; else tf->command = ATA_CMD_READ_NATIVE_MAX; 
tf->device = ATA_LBA; - args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; if (lba48) - args.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_HOB); - /* submit command request */ - ide_no_data_taskfile(drive, &args); + cmd.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_HOB); + + ide_no_data_taskfile(drive, &cmd); /* if OK, compute maximum address value */ if ((tf->status & 0x01) == 0) @@ -243,13 +243,13 @@ static u64 idedisk_read_native_max_address(ide_drive_t *drive, int lba48) */ static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48) { - ide_task_t args; - struct ide_taskfile *tf = &args.tf; + struct ide_cmd cmd; + struct ide_taskfile *tf = &cmd.tf; u64 addr_set = 0; addr_req--; - /* Create IDE/ATA command request structure */ - memset(&args, 0, sizeof(ide_task_t)); + + memset(&cmd, 0, sizeof(cmd)); tf->lbal = (addr_req >> 0) & 0xff; tf->lbam = (addr_req >>= 8) & 0xff; tf->lbah = (addr_req >>= 8) & 0xff; @@ -263,11 +263,13 @@ static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48) tf->command = ATA_CMD_SET_MAX; } tf->device |= ATA_LBA; - args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; if (lba48) - args.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_HOB); - /* submit command request */ - ide_no_data_taskfile(drive, &args); + cmd.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_HOB); + + ide_no_data_taskfile(drive, &cmd); + /* if OK, compute maximum address value */ if ((tf->status & 0x01) == 0) addr_set = ide_get_lba_addr(tf, lba48) + 1; @@ -386,24 +388,24 @@ static int ide_disk_get_capacity(ide_drive_t *drive) static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) { ide_drive_t *drive = q->queuedata; - ide_task_t *task = kmalloc(sizeof(*task), GFP_ATOMIC); + struct ide_cmd *cmd = kmalloc(sizeof(*cmd), GFP_ATOMIC); /* FIXME: map struct ide_taskfile on rq->cmd[] */ - BUG_ON(task == NULL); + BUG_ON(cmd == NULL); - memset(task, 0, sizeof(*task)); + memset(cmd, 0, sizeof(*cmd)); if (ata_id_flush_ext_enabled(drive->id) && (drive->capacity64 >= (1UL << 28))) - task->tf.command = ATA_CMD_FLUSH_EXT; + cmd->tf.command = ATA_CMD_FLUSH_EXT; else - task->tf.command = ATA_CMD_FLUSH; - task->tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE | + cmd->tf.command = ATA_CMD_FLUSH; + cmd->tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE | IDE_TFLAG_DYN; - task->data_phase = TASKFILE_NO_DATA; + cmd->data_phase = TASKFILE_NO_DATA; rq->cmd_type = REQ_TYPE_ATA_TASKFILE; rq->cmd_flags |= REQ_SOFTBARRIER; - rq->special = task; + rq->special = cmd; } ide_devset_get(multcount, mult_count); @@ -453,15 +455,15 @@ static int set_nowerr(ide_drive_t *drive, int arg) static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect) { - ide_task_t task; + struct ide_cmd cmd; - memset(&task, 0, sizeof(task)); - task.tf.feature = feature; - task.tf.nsect = nsect; - task.tf.command = ATA_CMD_SET_FEATURES; - task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf.feature = feature; + cmd.tf.nsect = nsect; + cmd.tf.command = ATA_CMD_SET_FEATURES; + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - return ide_no_data_taskfile(drive, &task); + return ide_no_data_taskfile(drive, &cmd); } static void update_ordered(ide_drive_t *drive) @@ -528,15 +530,16 @@ static int set_wcache(ide_drive_t *drive, int arg) static int do_idedisk_flushcache(ide_drive_t *drive) { - ide_task_t args; + struct ide_cmd cmd; - memset(&args, 0, sizeof(ide_task_t)); + 
memset(&cmd, 0, sizeof(cmd)); if (ata_id_flush_ext_enabled(drive->id)) - args.tf.command = ATA_CMD_FLUSH_EXT; + cmd.tf.command = ATA_CMD_FLUSH_EXT; else - args.tf.command = ATA_CMD_FLUSH; - args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - return ide_no_data_taskfile(drive, &args); + cmd.tf.command = ATA_CMD_FLUSH; + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + + return ide_no_data_taskfile(drive, &cmd); } ide_devset_get(acoustic, acoustic); @@ -708,17 +711,17 @@ static int ide_disk_init_media(ide_drive_t *drive, struct gendisk *disk) static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk, int on) { - ide_task_t task; + struct ide_cmd cmd; int ret; if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) return 0; - memset(&task, 0, sizeof(task)); - task.tf.command = on ? ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK; - task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf.command = on ? ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK; + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - ret = ide_no_data_taskfile(drive, &task); + ret = ide_no_data_taskfile(drive, &cmd); if (ret) drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c index 5766c1f62ad2..afe4f47e9e19 100644 --- a/drivers/ide/ide-disk_proc.c +++ b/drivers/ide/ide-disk_proc.c @@ -6,33 +6,34 @@ static int smart_enable(ide_drive_t *drive) { - ide_task_t args; - struct ide_taskfile *tf = &args.tf; + struct ide_cmd cmd; + struct ide_taskfile *tf = &cmd.tf; - memset(&args, 0, sizeof(ide_task_t)); + memset(&cmd, 0, sizeof(cmd)); tf->feature = ATA_SMART_ENABLE; tf->lbam = ATA_SMART_LBAM_PASS; tf->lbah = ATA_SMART_LBAH_PASS; tf->command = ATA_CMD_SMART; - args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - return ide_no_data_taskfile(drive, &args); + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + + return ide_no_data_taskfile(drive, &cmd); } static int get_smart_data(ide_drive_t *drive, u8 *buf, u8 sub_cmd) { - ide_task_t args; - struct ide_taskfile *tf = &args.tf; + struct ide_cmd cmd; + struct ide_taskfile *tf = &cmd.tf; - memset(&args, 0, sizeof(ide_task_t)); + memset(&cmd, 0, sizeof(cmd)); tf->feature = sub_cmd; tf->nsect = 0x01; tf->lbam = ATA_SMART_LBAM_PASS; tf->lbah = ATA_SMART_LBAH_PASS; tf->command = ATA_CMD_SMART; - args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - args.data_phase = TASKFILE_IN; + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.data_phase = TASKFILE_IN; - return ide_raw_taskfile(drive, &args, buf, 1); + return ide_raw_taskfile(drive, &cmd, buf, 1); } static int proc_idedisk_read_cache diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index e2c04886616f..f6e1a82a3cc5 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -125,10 +125,10 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) if (!blk_fs_request(rq)) { rq->errors = 1; if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { - ide_task_t *task = rq->special; + struct ide_cmd *cmd = rq->special; - if (task) - ide_complete_task(drive, task, stat, err); + if (cmd) + ide_complete_cmd(drive, cmd, stat, err); } else if (blk_pm_request(rq)) { ide_complete_pm_rq(drive, rq); return ide_stopped; diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 11e937485bff..c7883f23c66a 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -44,53 +44,53 @@ static u16 mm_inw(unsigned long a) return r; } -static void h8300_tf_load(ide_drive_t *drive, ide_task_t *task) +static void 
h8300_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; - u8 HIHI = (task->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; + struct ide_taskfile *tf = &cmd->tf; + u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; - if (task->ftf_flags & IDE_FTFLAG_FLAGGED) + if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) + if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) mm_outw((tf->hob_data << 8) | tf->data, io_ports->data_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) outb(tf->hob_feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) outb(tf->hob_nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) outb(tf->hob_lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) outb(tf->hob_lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) outb(tf->hob_lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_FEATURE) outb(tf->feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_NSECT) outb(tf->nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAL) outb(tf->lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAM) outb(tf->lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAH) outb(tf->lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) outb((tf->device & HIHI) | drive->select, io_ports->device_addr); } -static void h8300_tf_read(ide_drive_t *drive, ide_task_t *task) +static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; + struct ide_taskfile *tf = &cmd->tf; - if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data = mm_inw(io_ports->data_addr); tf->data = data & 0xff; @@ -100,31 +100,31 @@ static void h8300_tf_read(ide_drive_t *drive, ide_task_t *task) /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) tf->lbal = inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) tf->lbam = inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) tf->lbah = inb(io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_IN_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_IN_DEVICE) tf->device = inb(io_ports->device_addr); - if (task->tf_flags & IDE_TFLAG_LBA48) { + if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_DEVCTL_OBS | 0x80, 
io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) tf->hob_nsect = inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) tf->hob_lbal = inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) tf->hob_lbam = inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) tf->hob_lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index cad59f0bfbce..570c0cc4514d 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -82,24 +82,24 @@ void ide_set_irq(ide_hwif_t *hwif, int on) } EXPORT_SYMBOL_GPL(ide_set_irq); -void ide_tf_load(ide_drive_t *drive, ide_task_t *task) +void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; + struct ide_taskfile *tf = &cmd->tf; void (*tf_outb)(u8 addr, unsigned long port); u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; - u8 HIHI = (task->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; + u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; if (mmio) tf_outb = ide_mm_outb; else tf_outb = ide_outb; - if (task->ftf_flags & IDE_FTFLAG_FLAGGED) + if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; if (mmio) @@ -108,39 +108,39 @@ void ide_tf_load(ide_drive_t *drive, ide_task_t *task) outw(data, io_ports->data_addr); } - if (task->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) tf_outb(tf->hob_feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) tf_outb(tf->hob_nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) tf_outb(tf->hob_lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) tf_outb(tf->hob_lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) tf_outb(tf->hob_lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_FEATURE) tf_outb(tf->feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_NSECT) tf_outb(tf->nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAL) tf_outb(tf->lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAM) tf_outb(tf->lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAH) tf_outb(tf->lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) tf_outb((tf->device & HIHI) | drive->select, io_ports->device_addr); } EXPORT_SYMBOL_GPL(ide_tf_load); -void ide_tf_read(ide_drive_t *drive, ide_task_t *task) +void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = 
drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; + struct ide_taskfile *tf = &cmd->tf; void (*tf_outb)(u8 addr, unsigned long port); u8 (*tf_inb)(unsigned long port); u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; @@ -153,7 +153,7 @@ void ide_tf_read(ide_drive_t *drive, ide_task_t *task) tf_inb = ide_inb; } - if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data; if (mmio) @@ -168,31 +168,31 @@ void ide_tf_read(ide_drive_t *drive, ide_task_t *task) /* be sure we're looking at the low order bits */ tf_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = tf_inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tf_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) tf->lbal = tf_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) tf->lbam = tf_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) tf->lbah = tf_inb(io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_IN_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_IN_DEVICE) tf->device = tf_inb(io_ports->device_addr); - if (task->tf_flags & IDE_TFLAG_LBA48) { + if (cmd->tf_flags & IDE_TFLAG_LBA48) { tf_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = tf_inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) tf->hob_nsect = tf_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) tf->hob_lbal = tf_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) tf->hob_lbam = tf_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) tf->hob_lbah = tf_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 6eee41beec73..2900271c6ddd 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -144,21 +144,21 @@ int ide_end_dequeued_request(ide_drive_t *drive, struct request *rq, } EXPORT_SYMBOL_GPL(ide_end_dequeued_request); -void ide_complete_task(ide_drive_t *drive, ide_task_t *task, u8 stat, u8 err) +void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) { - struct ide_taskfile *tf = &task->tf; - struct request *rq = task->rq; + struct ide_taskfile *tf = &cmd->tf; + struct request *rq = cmd->rq; tf->error = err; tf->status = stat; - drive->hwif->tp_ops->tf_read(drive, task); + drive->hwif->tp_ops->tf_read(drive, cmd); if (rq && rq->cmd_type == REQ_TYPE_ATA_TASKFILE) - memcpy(rq->special, task, sizeof(*task)); + memcpy(rq->special, cmd, sizeof(*cmd)); - if (task->tf_flags & IDE_TFLAG_DYN) - kfree(task); + if (cmd->tf_flags & IDE_TFLAG_DYN) + kfree(cmd); } void ide_complete_rq(ide_drive_t *drive, u8 err) @@ -217,20 +217,20 @@ static void ide_tf_set_setmult_cmd(ide_drive_t *drive, struct ide_taskfile *tf) static ide_startstop_t ide_disk_special(ide_drive_t *drive) { special_t *s = &drive->special; - ide_task_t args; + struct ide_cmd cmd; - memset(&args, 0, sizeof(ide_task_t)); - args.data_phase 
= TASKFILE_NO_DATA; + memset(&cmd, 0, sizeof(cmd)); + cmd.data_phase = TASKFILE_NO_DATA; if (s->b.set_geometry) { s->b.set_geometry = 0; - ide_tf_set_specify_cmd(drive, &args.tf); + ide_tf_set_specify_cmd(drive, &cmd.tf); } else if (s->b.recalibrate) { s->b.recalibrate = 0; - ide_tf_set_restore_cmd(drive, &args.tf); + ide_tf_set_restore_cmd(drive, &cmd.tf); } else if (s->b.set_multmode) { s->b.set_multmode = 0; - ide_tf_set_setmult_cmd(drive, &args.tf); + ide_tf_set_setmult_cmd(drive, &cmd.tf); } else if (s->all) { int special = s->all; s->all = 0; @@ -238,10 +238,10 @@ static ide_startstop_t ide_disk_special(ide_drive_t *drive) return ide_stopped; } - args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE | - IDE_TFLAG_CUSTOM_HANDLER; + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE | + IDE_TFLAG_CUSTOM_HANDLER; - do_rw_taskfile(drive, &args); + do_rw_taskfile(drive, &cmd); return ide_started; } @@ -315,10 +315,10 @@ EXPORT_SYMBOL_GPL(ide_init_sg_cmd); static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, struct request *rq) { - ide_task_t *task = rq->special; + struct ide_cmd *cmd = rq->special; - if (task) { - switch (task->data_phase) { + if (cmd) { + switch (cmd->data_phase) { case TASKFILE_MULTI_OUT: case TASKFILE_OUT: case TASKFILE_MULTI_IN: @@ -329,7 +329,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, break; } - return do_rw_taskfile(drive, task); + return do_rw_taskfile(drive, cmd); } /* diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 1be263eb9c07..4953028a13d4 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -111,13 +111,13 @@ static int ide_set_nice_ioctl(ide_drive_t *drive, unsigned long arg) return 0; } -static int ide_cmd_ioctl(ide_drive_t *drive, unsigned cmd, unsigned long arg) +static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) { u8 *buf = NULL; int bufsize = 0, err = 0; u8 args[4], xfer_rate = 0; - ide_task_t tfargs; - struct ide_taskfile *tf = &tfargs.tf; + struct ide_cmd cmd; + struct ide_taskfile *tf = &cmd.tf; u16 *id = drive->id; if (NULL == (void *) arg) { @@ -134,24 +134,24 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned cmd, unsigned long arg) if (copy_from_user(args, (void __user *)arg, 4)) return -EFAULT; - memset(&tfargs, 0, sizeof(ide_task_t)); + memset(&cmd, 0, sizeof(cmd)); tf->feature = args[2]; if (args[0] == ATA_CMD_SMART) { tf->nsect = args[3]; tf->lbal = args[1]; tf->lbam = 0x4f; tf->lbah = 0xc2; - tfargs.tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_IN_NSECT; + cmd.tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_IN_NSECT; } else { tf->nsect = args[1]; - tfargs.tf_flags = IDE_TFLAG_OUT_FEATURE | - IDE_TFLAG_OUT_NSECT | IDE_TFLAG_IN_NSECT; + cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT | + IDE_TFLAG_IN_NSECT; } tf->command = args[0]; - tfargs.data_phase = args[3] ? TASKFILE_IN : TASKFILE_NO_DATA; + cmd.data_phase = args[3] ? 
TASKFILE_IN : TASKFILE_NO_DATA; if (args[3]) { - tfargs.tf_flags |= IDE_TFLAG_IO_16BIT; + cmd.tf_flags |= IDE_TFLAG_IO_16BIT; bufsize = SECTOR_SIZE * args[3]; buf = kzalloc(bufsize, GFP_KERNEL); if (buf == NULL) @@ -172,7 +172,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned cmd, unsigned long arg) } } - err = ide_raw_taskfile(drive, &tfargs, buf, args[3]); + err = ide_raw_taskfile(drive, &cmd, buf, args[3]); args[0] = tf->status; args[1] = tf->error; @@ -194,25 +194,25 @@ abort: return err; } -static int ide_task_ioctl(ide_drive_t *drive, unsigned cmd, unsigned long arg) +static int ide_task_ioctl(ide_drive_t *drive, unsigned long arg) { void __user *p = (void __user *)arg; int err = 0; u8 args[7]; - ide_task_t task; + struct ide_cmd cmd; if (copy_from_user(args, p, 7)) return -EFAULT; - memset(&task, 0, sizeof(task)); - memcpy(&task.tf_array[7], &args[1], 6); - task.tf.command = args[0]; - task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + memset(&cmd, 0, sizeof(cmd)); + memcpy(&cmd.tf_array[7], &args[1], 6); + cmd.tf.command = args[0]; + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - err = ide_no_data_taskfile(drive, &task); + err = ide_no_data_taskfile(drive, &cmd); - args[0] = task.tf.command; - memcpy(&args[1], &task.tf_array[7], 6); + args[0] = cmd.tf.command; + memcpy(&args[1], &cmd.tf_array[7], 6); if (copy_to_user(p, args, 7)) err = -EFAULT; @@ -262,17 +262,17 @@ int generic_ide_ioctl(ide_drive_t *drive, struct block_device *bdev, if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) return -EACCES; if (drive->media == ide_disk) - return ide_taskfile_ioctl(drive, cmd, arg); + return ide_taskfile_ioctl(drive, arg); return -ENOMSG; #endif case HDIO_DRIVE_CMD: if (!capable(CAP_SYS_RAWIO)) return -EACCES; - return ide_cmd_ioctl(drive, cmd, arg); + return ide_cmd_ioctl(drive, arg); case HDIO_DRIVE_TASK: if (!capable(CAP_SYS_RAWIO)) return -EACCES; - return ide_task_ioctl(drive, cmd, arg); + return ide_task_ioctl(drive, arg); case HDIO_DRIVE_RESET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 317c5dadd7c0..c3023de7270c 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -31,15 +31,15 @@ void SELECT_DRIVE(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; const struct ide_port_ops *port_ops = hwif->port_ops; - ide_task_t task; + struct ide_cmd cmd; if (port_ops && port_ops->selectproc) port_ops->selectproc(drive); - memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_OUT_DEVICE; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf_flags = IDE_TFLAG_OUT_DEVICE; - drive->hwif->tp_ops->tf_load(drive, &task); + drive->hwif->tp_ops->tf_load(drive, &cmd); } void SELECT_MASK(ide_drive_t *drive, int mask) @@ -52,14 +52,14 @@ void SELECT_MASK(ide_drive_t *drive, int mask) u8 ide_read_error(ide_drive_t *drive) { - ide_task_t task; + struct ide_cmd cmd; - memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_IN_FEATURE; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf_flags = IDE_TFLAG_IN_FEATURE; - drive->hwif->tp_ops->tf_read(drive, &task); + drive->hwif->tp_ops->tf_read(drive, &cmd); - return task.tf.error; + return cmd.tf.error; } EXPORT_SYMBOL_GPL(ide_read_error); @@ -329,7 +329,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) u16 *id = drive->id, i; int error = 0; u8 stat; - ide_task_t task; + struct ide_cmd cmd; #ifdef CONFIG_BLK_DEV_IDEDMA if (hwif->dma_ops) /* check if host supports DMA */ @@ -361,12 +361,12 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) udelay(1); 
tp_ops->set_irq(hwif, 0); - memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT; - task.tf.feature = SETFEATURES_XFER; - task.tf.nsect = speed; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT; + cmd.tf.feature = SETFEATURES_XFER; + cmd.tf.nsect = speed; - tp_ops->tf_load(drive, &task); + tp_ops->tf_load(drive, &cmd); tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES); diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index f6c683dd2987..217b7fdf2b17 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -34,19 +34,19 @@ void ide_toggle_bounce(ide_drive_t *drive, int on) static void ide_dump_opcode(ide_drive_t *drive) { struct request *rq = drive->hwif->rq; - ide_task_t *task = NULL; + struct ide_cmd *cmd = NULL; if (!rq) return; if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) - task = rq->special; + cmd = rq->special; printk(KERN_ERR "ide: failed opcode was: "); - if (task == NULL) + if (cmd == NULL) printk(KERN_CONT "unknown\n"); else - printk(KERN_CONT "0x%02x\n", task->tf.command); + printk(KERN_CONT "0x%02x\n", cmd->tf.command); } u64 ide_get_lba_addr(struct ide_taskfile *tf, int lba48) @@ -66,18 +66,18 @@ EXPORT_SYMBOL_GPL(ide_get_lba_addr); static void ide_dump_sector(ide_drive_t *drive) { - ide_task_t task; - struct ide_taskfile *tf = &task.tf; + struct ide_cmd cmd; + struct ide_taskfile *tf = &cmd.tf; u8 lba48 = !!(drive->dev_flags & IDE_DFLAG_LBA48); - memset(&task, 0, sizeof(task)); + memset(&cmd, 0, sizeof(cmd)); if (lba48) - task.tf_flags = IDE_TFLAG_IN_LBA | IDE_TFLAG_IN_HOB_LBA | + cmd.tf_flags = IDE_TFLAG_IN_LBA | IDE_TFLAG_IN_HOB_LBA | IDE_TFLAG_LBA48; else - task.tf_flags = IDE_TFLAG_IN_LBA | IDE_TFLAG_IN_DEVICE; + cmd.tf_flags = IDE_TFLAG_IN_LBA | IDE_TFLAG_IN_DEVICE; - drive->hwif->tp_ops->tf_read(drive, &task); + drive->hwif->tp_ops->tf_read(drive, &cmd); if (lba48 || (tf->device & ATA_LBA)) printk(KERN_CONT ", LBAsect=%llu", diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index cddc7c778760..63c77f99a726 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -63,10 +63,10 @@ out: ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq) { - ide_task_t task; - struct ide_taskfile *tf = &task.tf; + struct ide_cmd cmd; + struct ide_taskfile *tf = &cmd.tf; - memset(&task, 0, sizeof(task)); + memset(&cmd, 0, sizeof(cmd)); if (rq->cmd[0] == REQ_PARK_HEADS) { drive->sleep = *(unsigned long *)rq->special; drive->dev_flags |= IDE_DFLAG_SLEEPING; @@ -75,14 +75,15 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq) tf->lbal = 0x4c; tf->lbam = 0x4e; tf->lbah = 0x55; - task.tf_flags |= IDE_TFLAG_CUSTOM_HANDLER; + cmd.tf_flags |= IDE_TFLAG_CUSTOM_HANDLER; } else /* cmd == REQ_UNPARK_HEADS */ tf->command = ATA_CMD_CHK_POWER; - task.tf_flags |= IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - task.rq = rq; - task.data_phase = TASKFILE_NO_DATA; - return do_rw_taskfile(drive, &task); + cmd.tf_flags |= IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.rq = rq; + cmd.data_phase = TASKFILE_NO_DATA; + + return do_rw_taskfile(drive, &cmd); } ssize_t ide_park_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 74c7c2bbe0fd..5c9fc20f95b5 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -8,7 +8,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) ide_hwif_t *hwif = drive->hwif; struct request *rq; struct request_pm_state rqpm; - ide_task_t args; + struct 
ide_cmd cmd; int ret; /* call ACPI _GTM only once */ @@ -16,10 +16,10 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) ide_acpi_get_timing(hwif); memset(&rqpm, 0, sizeof(rqpm)); - memset(&args, 0, sizeof(args)); + memset(&cmd, 0, sizeof(cmd)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_PM_SUSPEND; - rq->special = &args; + rq->special = &cmd; rq->data = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; if (mesg.event == PM_EVENT_PRETHAW) @@ -42,7 +42,7 @@ int generic_ide_resume(struct device *dev) ide_hwif_t *hwif = drive->hwif; struct request *rq; struct request_pm_state rqpm; - ide_task_t args; + struct ide_cmd cmd; int err; /* call ACPI _PS0 / _STM only once */ @@ -54,11 +54,11 @@ int generic_ide_resume(struct device *dev) ide_acpi_exec_tfs(drive); memset(&rqpm, 0, sizeof(rqpm)); - memset(&args, 0, sizeof(args)); + memset(&cmd, 0, sizeof(cmd)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_PM_RESUME; rq->cmd_flags |= REQ_PREEMPT; - rq->special = &args; + rq->special = &cmd; rq->data = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; @@ -109,9 +109,9 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq) ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) { struct request_pm_state *pm = rq->data; - ide_task_t *args = rq->special; + struct ide_cmd *cmd = rq->special; - memset(args, 0, sizeof(*args)); + memset(cmd, 0, sizeof(*cmd)); switch (pm->pm_step) { case IDE_PM_FLUSH_CACHE: /* Suspend step 1 (flush cache) */ @@ -124,12 +124,12 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) return ide_stopped; } if (ata_id_flush_ext_enabled(drive->id)) - args->tf.command = ATA_CMD_FLUSH_EXT; + cmd->tf.command = ATA_CMD_FLUSH_EXT; else - args->tf.command = ATA_CMD_FLUSH; + cmd->tf.command = ATA_CMD_FLUSH; goto out_do_tf; case IDE_PM_STANDBY: /* Suspend step 2 (standby) */ - args->tf.command = ATA_CMD_STANDBYNOW1; + cmd->tf.command = ATA_CMD_STANDBYNOW1; goto out_do_tf; case IDE_PM_RESTORE_PIO: /* Resume step 1 (restore PIO) */ ide_set_max_pio(drive); @@ -142,7 +142,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) ide_complete_power_step(drive, rq); return ide_stopped; case IDE_PM_IDLE: /* Resume step 2 (idle) */ - args->tf.command = ATA_CMD_IDLEIMMEDIATE; + cmd->tf.command = ATA_CMD_IDLEIMMEDIATE; goto out_do_tf; case IDE_PM_RESTORE_DMA: /* Resume step 3 (restore DMA) */ /* @@ -160,12 +160,14 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) } pm->pm_step = IDE_PM_COMPLETED; + return ide_stopped; out_do_tf: - args->tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - args->data_phase = TASKFILE_NO_DATA; - return do_rw_taskfile(drive, args); + cmd->tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd->data_phase = TASKFILE_NO_DATA; + + return do_rw_taskfile(drive, cmd); } /** diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 335322f40c5a..548864510ba9 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -283,13 +283,13 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) * identify command to be sure of reply */ if (cmd == ATA_CMD_ID_ATAPI) { - ide_task_t task; + struct ide_cmd cmd; - memset(&task, 0, sizeof(task)); + memset(&cmd, 0, sizeof(cmd)); /* disable DMA & overlap */ - task.tf_flags = IDE_TFLAG_OUT_FEATURE; + cmd.tf_flags = IDE_TFLAG_OUT_FEATURE; - tp_ops->tf_load(drive, &task); + tp_ops->tf_load(drive, &cmd); } /* ask drive for 
ID */ @@ -337,14 +337,14 @@ int ide_busy_sleep(ide_hwif_t *hwif, unsigned long timeout, int altstatus) static u8 ide_read_device(ide_drive_t *drive) { - ide_task_t task; + struct ide_cmd cmd; - memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_IN_DEVICE; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf_flags = IDE_TFLAG_IN_DEVICE; - drive->hwif->tp_ops->tf_read(drive, &task); + drive->hwif->tp_ops->tf_read(drive, &cmd); - return task.tf.device; + return cmd.tf.device; } /** diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index 417cde56eafd..10a88bf3eefa 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -194,20 +194,20 @@ ide_devset_get(xfer_rate, current_speed); static int set_xfer_rate (ide_drive_t *drive, int arg) { - ide_task_t task; + struct ide_cmd cmd; int err; if (arg < XFER_PIO_0 || arg > XFER_UDMA_6) return -EINVAL; - memset(&task, 0, sizeof(task)); - task.tf.command = ATA_CMD_SET_FEATURES; - task.tf.feature = SETFEATURES_XFER; - task.tf.nsect = (u8)arg; - task.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT | - IDE_TFLAG_IN_NSECT; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf.command = ATA_CMD_SET_FEATURES; + cmd.tf.feature = SETFEATURES_XFER; + cmd.tf.nsect = (u8)arg; + cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT | + IDE_TFLAG_IN_NSECT; - err = ide_no_data_taskfile(drive, &task); + err = ide_no_data_taskfile(drive, &cmd); if (!err) { ide_set_xfer_rate(drive, (u8) arg); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 925fb9241893..2b85c137764a 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -39,33 +39,34 @@ void ide_tf_dump(const char *s, struct ide_taskfile *tf) int taskfile_lib_get_identify (ide_drive_t *drive, u8 *buf) { - ide_task_t args; + struct ide_cmd cmd; - memset(&args, 0, sizeof(ide_task_t)); - args.tf.nsect = 0x01; + memset(&cmd, 0, sizeof(cmd)); + cmd.tf.nsect = 0x01; if (drive->media == ide_disk) - args.tf.command = ATA_CMD_ID_ATA; + cmd.tf.command = ATA_CMD_ID_ATA; else - args.tf.command = ATA_CMD_ID_ATAPI; - args.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - args.data_phase = TASKFILE_IN; - return ide_raw_taskfile(drive, &args, buf, 1); + cmd.tf.command = ATA_CMD_ID_ATAPI; + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.data_phase = TASKFILE_IN; + + return ide_raw_taskfile(drive, &cmd, buf, 1); } static ide_startstop_t task_no_data_intr(ide_drive_t *); static ide_startstop_t pre_task_out_intr(ide_drive_t *, struct request *); static ide_startstop_t task_in_intr(ide_drive_t *); -ide_startstop_t do_rw_taskfile (ide_drive_t *drive, ide_task_t *task) +ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct ide_taskfile *tf = &task->tf; + struct ide_taskfile *tf = &cmd->tf; ide_handler_t *handler = NULL; const struct ide_tp_ops *tp_ops = hwif->tp_ops; const struct ide_dma_ops *dma_ops = hwif->dma_ops; - if (task->data_phase == TASKFILE_MULTI_IN || - task->data_phase == TASKFILE_MULTI_OUT) { + if (cmd->data_phase == TASKFILE_MULTI_IN || + cmd->data_phase == TASKFILE_MULTI_OUT) { if (!drive->mult_count) { printk(KERN_ERR "%s: multimode not set!\n", drive->name); @@ -73,24 +74,24 @@ ide_startstop_t do_rw_taskfile (ide_drive_t *drive, ide_task_t *task) } } - if (task->ftf_flags & IDE_FTFLAG_FLAGGED) - task->ftf_flags |= IDE_FTFLAG_SET_IN_FLAGS; + if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) + cmd->ftf_flags |= IDE_FTFLAG_SET_IN_FLAGS; - memcpy(&hwif->task, task, sizeof(*task)); + memcpy(&hwif->cmd, cmd, 
sizeof(*cmd)); - if ((task->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { + if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { ide_tf_dump(drive->name, tf); tp_ops->set_irq(hwif, 1); SELECT_MASK(drive, 0); - tp_ops->tf_load(drive, task); + tp_ops->tf_load(drive, cmd); } - switch (task->data_phase) { + switch (cmd->data_phase) { case TASKFILE_MULTI_OUT: case TASKFILE_OUT: tp_ops->exec_command(hwif, tf->command); ndelay(400); /* FIXME */ - return pre_task_out_intr(drive, task->rq); + return pre_task_out_intr(drive, cmd->rq); case TASKFILE_MULTI_IN: case TASKFILE_IN: handler = task_in_intr; @@ -119,9 +120,9 @@ EXPORT_SYMBOL_GPL(do_rw_taskfile); static ide_startstop_t task_no_data_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; - ide_task_t *task = &hwif->task; - struct ide_taskfile *tf = &task->tf; - int custom = (task->tf_flags & IDE_TFLAG_CUSTOM_HANDLER) ? 1 : 0; + struct ide_cmd *cmd = &hwif->cmd; + struct ide_taskfile *tf = &cmd->tf; + int custom = (cmd->tf_flags & IDE_TFLAG_CUSTOM_HANDLER) ? 1 : 0; int retries = (custom && tf->command == ATA_CMD_INIT_DEV_PARAMS) ? 5 : 1; u8 stat; @@ -151,7 +152,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) } if (custom && tf->command == ATA_CMD_IDLEIMMEDIATE) { - hwif->tp_ops->tf_read(drive, task); + hwif->tp_ops->tf_read(drive, cmd); if (tf->lbal != 0xc4) { printk(KERN_ERR "%s: head unload failed!\n", drive->name); @@ -169,7 +170,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) ide_complete_pm_rq(drive, rq); else { if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) - ide_complete_task(drive, task, stat, err); + ide_complete_cmd(drive, cmd, stat, err); ide_complete_rq(drive, err); } } @@ -266,18 +267,18 @@ static void ide_pio_multi(ide_drive_t *drive, struct request *rq, static void ide_pio_datablock(ide_drive_t *drive, struct request *rq, unsigned int write) { - ide_task_t *task = &drive->hwif->task; + struct ide_cmd *cmd = &drive->hwif->cmd; u8 saved_io_32bit = drive->io_32bit; if (blk_fs_request(rq)) rq->errors = 0; - if (task->tf_flags & IDE_TFLAG_IO_16BIT) + if (cmd->tf_flags & IDE_TFLAG_IO_16BIT) drive->io_32bit = 0; touch_softlockup_watchdog(); - switch (task->data_phase) { + switch (cmd->data_phase) { case TASKFILE_MULTI_IN: case TASKFILE_MULTI_OUT: ide_pio_multi(drive, rq, write); @@ -295,10 +296,10 @@ static ide_startstop_t task_error(ide_drive_t *drive, struct request *rq, { if (blk_fs_request(rq)) { ide_hwif_t *hwif = drive->hwif; - ide_task_t *task = &hwif->task; + struct ide_cmd *cmd = &hwif->cmd; int sectors = hwif->nsect - hwif->nleft; - switch (task->data_phase) { + switch (cmd->data_phase) { case TASKFILE_IN: if (hwif->nleft) break; @@ -325,11 +326,11 @@ static ide_startstop_t task_error(ide_drive_t *drive, struct request *rq, void task_end_request(ide_drive_t *drive, struct request *rq, u8 stat) { if (blk_fs_request(rq) == 0) { - ide_task_t *task = rq->special; + struct ide_cmd *cmd = rq->special; u8 err = ide_read_error(drive); - if (task) - ide_complete_task(drive, task, stat, err); + if (cmd) + ide_complete_cmd(drive, cmd, stat, err); ide_complete_rq(drive, err); return; } @@ -420,14 +421,14 @@ static ide_startstop_t task_out_intr (ide_drive_t *drive) static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, struct request *rq) { - ide_task_t *task = &drive->hwif->task; + struct ide_cmd *cmd = &drive->hwif->cmd; ide_startstop_t startstop; if (ide_wait_stat(&startstop, drive, ATA_DRQ, drive->bad_wstat, WAIT_DRQ)) { printk(KERN_ERR "%s: no DRQ after issuing %sWRITE%s\n", drive->name, - 
task->data_phase == TASKFILE_MULTI_OUT ? "MULT" : "", + cmd->data_phase == TASKFILE_MULTI_OUT ? "MULT" : "", (drive->dev_flags & IDE_DFLAG_LBA48) ? "_EXT" : ""); return startstop; } @@ -441,7 +442,8 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, struct request *rq) return ide_started; } -int ide_raw_taskfile(ide_drive_t *drive, ide_task_t *task, u8 *buf, u16 nsect) +int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, + u16 nsect) { struct request *rq; int error; @@ -459,11 +461,11 @@ int ide_raw_taskfile(ide_drive_t *drive, ide_task_t *task, u8 *buf, u16 nsect) rq->hard_nr_sectors = rq->nr_sectors = nsect; rq->hard_cur_sectors = rq->current_nr_sectors = nsect; - if (task->tf_flags & IDE_TFLAG_WRITE) + if (cmd->tf_flags & IDE_TFLAG_WRITE) rq->cmd_flags |= REQ_RW; - rq->special = task; - task->rq = rq; + rq->special = cmd; + cmd->rq = rq; error = blk_execute_rq(drive->queue, NULL, rq, 0); blk_put_request(rq); @@ -473,19 +475,19 @@ int ide_raw_taskfile(ide_drive_t *drive, ide_task_t *task, u8 *buf, u16 nsect) EXPORT_SYMBOL(ide_raw_taskfile); -int ide_no_data_taskfile(ide_drive_t *drive, ide_task_t *task) +int ide_no_data_taskfile(ide_drive_t *drive, struct ide_cmd *cmd) { - task->data_phase = TASKFILE_NO_DATA; + cmd->data_phase = TASKFILE_NO_DATA; - return ide_raw_taskfile(drive, task, NULL, 0); + return ide_raw_taskfile(drive, cmd, NULL, 0); } EXPORT_SYMBOL_GPL(ide_no_data_taskfile); #ifdef CONFIG_IDE_TASK_IOCTL -int ide_taskfile_ioctl (ide_drive_t *drive, unsigned int cmd, unsigned long arg) +int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) { ide_task_request_t *req_task; - ide_task_t args; + struct ide_cmd cmd; u8 *outbuf = NULL; u8 *inbuf = NULL; u8 *data_buf = NULL; @@ -539,51 +541,53 @@ int ide_taskfile_ioctl (ide_drive_t *drive, unsigned int cmd, unsigned long arg) } } - memset(&args, 0, sizeof(ide_task_t)); + memset(&cmd, 0, sizeof(cmd)); - memcpy(&args.tf_array[0], req_task->hob_ports, HDIO_DRIVE_HOB_HDR_SIZE - 2); - memcpy(&args.tf_array[6], req_task->io_ports, HDIO_DRIVE_TASK_HDR_SIZE); + memcpy(&cmd.tf_array[0], req_task->hob_ports, + HDIO_DRIVE_HOB_HDR_SIZE - 2); + memcpy(&cmd.tf_array[6], req_task->io_ports, + HDIO_DRIVE_TASK_HDR_SIZE); - args.data_phase = req_task->data_phase; + cmd.data_phase = req_task->data_phase; + cmd.tf_flags = IDE_TFLAG_IO_16BIT | IDE_TFLAG_DEVICE | + IDE_TFLAG_IN_TF; - args.tf_flags = IDE_TFLAG_IO_16BIT | IDE_TFLAG_DEVICE | - IDE_TFLAG_IN_TF; if (drive->dev_flags & IDE_DFLAG_LBA48) - args.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_IN_HOB); + cmd.tf_flags |= (IDE_TFLAG_LBA48 | IDE_TFLAG_IN_HOB); if (req_task->out_flags.all) { - args.ftf_flags |= IDE_FTFLAG_FLAGGED; + cmd.ftf_flags |= IDE_FTFLAG_FLAGGED; if (req_task->out_flags.b.data) - args.ftf_flags |= IDE_FTFLAG_OUT_DATA; + cmd.ftf_flags |= IDE_FTFLAG_OUT_DATA; if (req_task->out_flags.b.nsector_hob) - args.tf_flags |= IDE_TFLAG_OUT_HOB_NSECT; + cmd.tf_flags |= IDE_TFLAG_OUT_HOB_NSECT; if (req_task->out_flags.b.sector_hob) - args.tf_flags |= IDE_TFLAG_OUT_HOB_LBAL; + cmd.tf_flags |= IDE_TFLAG_OUT_HOB_LBAL; if (req_task->out_flags.b.lcyl_hob) - args.tf_flags |= IDE_TFLAG_OUT_HOB_LBAM; + cmd.tf_flags |= IDE_TFLAG_OUT_HOB_LBAM; if (req_task->out_flags.b.hcyl_hob) - args.tf_flags |= IDE_TFLAG_OUT_HOB_LBAH; + cmd.tf_flags |= IDE_TFLAG_OUT_HOB_LBAH; if (req_task->out_flags.b.error_feature) - args.tf_flags |= IDE_TFLAG_OUT_FEATURE; + cmd.tf_flags |= IDE_TFLAG_OUT_FEATURE; if (req_task->out_flags.b.nsector) - args.tf_flags |= IDE_TFLAG_OUT_NSECT; + cmd.tf_flags |= 
IDE_TFLAG_OUT_NSECT; if (req_task->out_flags.b.sector) - args.tf_flags |= IDE_TFLAG_OUT_LBAL; + cmd.tf_flags |= IDE_TFLAG_OUT_LBAL; if (req_task->out_flags.b.lcyl) - args.tf_flags |= IDE_TFLAG_OUT_LBAM; + cmd.tf_flags |= IDE_TFLAG_OUT_LBAM; if (req_task->out_flags.b.hcyl) - args.tf_flags |= IDE_TFLAG_OUT_LBAH; + cmd.tf_flags |= IDE_TFLAG_OUT_LBAH; } else { - args.tf_flags |= IDE_TFLAG_OUT_TF; - if (args.tf_flags & IDE_TFLAG_LBA48) - args.tf_flags |= IDE_TFLAG_OUT_HOB; + cmd.tf_flags |= IDE_TFLAG_OUT_TF; + if (cmd.tf_flags & IDE_TFLAG_LBA48) + cmd.tf_flags |= IDE_TFLAG_OUT_HOB; } if (req_task->in_flags.b.data) - args.ftf_flags |= IDE_FTFLAG_IN_DATA; + cmd.ftf_flags |= IDE_FTFLAG_IN_DATA; switch(req_task->data_phase) { case TASKFILE_MULTI_OUT: @@ -630,7 +634,7 @@ int ide_taskfile_ioctl (ide_drive_t *drive, unsigned int cmd, unsigned long arg) if (req_task->req_cmd == IDE_DRIVE_TASK_NO_DATA) nsect = 0; else if (!nsect) { - nsect = (args.tf.hob_nsect << 8) | args.tf.nsect; + nsect = (cmd.tf.hob_nsect << 8) | cmd.tf.nsect; if (!nsect) { printk(KERN_ERR "%s: in/out command without data\n", @@ -641,14 +645,16 @@ int ide_taskfile_ioctl (ide_drive_t *drive, unsigned int cmd, unsigned long arg) } if (req_task->req_cmd == IDE_DRIVE_TASK_RAW_WRITE) - args.tf_flags |= IDE_TFLAG_WRITE; + cmd.tf_flags |= IDE_TFLAG_WRITE; - err = ide_raw_taskfile(drive, &args, data_buf, nsect); + err = ide_raw_taskfile(drive, &cmd, data_buf, nsect); - memcpy(req_task->hob_ports, &args.tf_array[0], HDIO_DRIVE_HOB_HDR_SIZE - 2); - memcpy(req_task->io_ports, &args.tf_array[6], HDIO_DRIVE_TASK_HDR_SIZE); + memcpy(req_task->hob_ports, &cmd.tf_array[0], + HDIO_DRIVE_HOB_HDR_SIZE - 2); + memcpy(req_task->io_ports, &cmd.tf_array[6], + HDIO_DRIVE_TASK_HDR_SIZE); - if ((args.ftf_flags & IDE_FTFLAG_SET_IN_FLAGS) && + if ((cmd.ftf_flags & IDE_FTFLAG_SET_IN_FLAGS) && req_task->in_flags.all == 0) { req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS; if (drive->dev_flags & IDE_DFLAG_LBA48) diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 159eb39c7932..d93c80016326 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -61,12 +61,12 @@ static u8 superio_dma_sff_read_status(ide_hwif_t *hwif) return superio_ide_inb(hwif->dma_base + ATA_DMA_STATUS); } -static void superio_tf_read(ide_drive_t *drive, ide_task_t *task) +static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; - struct ide_taskfile *tf = &task->tf; + struct ide_taskfile *tf = &cmd->tf; - if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data = inw(io_ports->data_addr); tf->data = data & 0xff; @@ -76,31 +76,31 @@ static void superio_tf_read(ide_drive_t *drive, ide_task_t *task) /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) tf->lbal = inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) tf->lbam = inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) tf->lbah = inb(io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_IN_DEVICE) + if 
(cmd->tf_flags & IDE_TFLAG_IN_DEVICE) tf->device = superio_ide_inb(io_ports->device_addr); - if (task->tf_flags & IDE_TFLAG_LBA48) { + if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) tf->hob_nsect = inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) tf->hob_lbal = inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) tf->hob_lbam = inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) tf->hob_lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 82929c725d82..d6336753bd2c 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -666,52 +666,52 @@ static int __devinit init_setup_scc(struct pci_dev *dev, return rc; } -static void scc_tf_load(ide_drive_t *drive, ide_task_t *task) +static void scc_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; - struct ide_taskfile *tf = &task->tf; - u8 HIHI = (task->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; + struct ide_taskfile *tf = &cmd->tf; + u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; - if (task->ftf_flags & IDE_FTFLAG_FLAGGED) + if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) + if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) out_be32((void *)io_ports->data_addr, (tf->hob_data << 8) | tf->data); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) scc_ide_outb(tf->hob_feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) scc_ide_outb(tf->hob_nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) scc_ide_outb(tf->hob_lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) scc_ide_outb(tf->hob_lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) scc_ide_outb(tf->hob_lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_FEATURE) scc_ide_outb(tf->feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_NSECT) scc_ide_outb(tf->nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAL) scc_ide_outb(tf->lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAM) scc_ide_outb(tf->lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAH) scc_ide_outb(tf->lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) scc_ide_outb((tf->device & HIHI) | drive->select, io_ports->device_addr); } -static void scc_tf_read(ide_drive_t *drive, ide_task_t *task) +static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_io_ports *io_ports = &drive->hwif->io_ports; - struct 
ide_taskfile *tf = &task->tf; + struct ide_taskfile *tf = &cmd->tf; - if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data = (u16)in_be32((void *)io_ports->data_addr); tf->data = data & 0xff; @@ -721,31 +721,31 @@ static void scc_tf_read(ide_drive_t *drive, ide_task_t *task) /* be sure we're looking at the low order bits */ scc_ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = scc_ide_inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = scc_ide_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) tf->lbal = scc_ide_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) tf->lbam = scc_ide_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) tf->lbah = scc_ide_inb(io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_IN_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_IN_DEVICE) tf->device = scc_ide_inb(io_ports->device_addr); - if (task->tf_flags & IDE_TFLAG_LBA48) { + if (cmd->tf_flags & IDE_TFLAG_LBA48) { scc_ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = scc_ide_inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) tf->hob_lbal = scc_ide_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) tf->hob_lbam = scc_ide_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) tf->hob_lbah = scc_ide_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index 6b51e0c58af7..947596d3620c 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -82,57 +82,57 @@ static void tx4938ide_outb(u8 value, unsigned long port) __raw_writeb(value, (void __iomem *)port); } -static void tx4938ide_tf_load(ide_drive_t *drive, ide_task_t *task) +static void tx4938ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; - u8 HIHI = task->tf_flags & IDE_TFLAG_LBA48 ? 0xE0 : 0xEF; + struct ide_taskfile *tf = &cmd->tf; + u8 HIHI = cmd->tf_flags & IDE_TFLAG_LBA48 ? 
0xE0 : 0xEF; - if (task->ftf_flags & IDE_FTFLAG_FLAGGED) + if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; /* no endian swap */ __raw_writew(data, (void __iomem *)io_ports->data_addr); } - if (task->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) tx4938ide_outb(tf->hob_feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) tx4938ide_outb(tf->hob_nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) tx4938ide_outb(tf->hob_lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) tx4938ide_outb(tf->hob_lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) tx4938ide_outb(tf->hob_lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_FEATURE) tx4938ide_outb(tf->feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_NSECT) tx4938ide_outb(tf->nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAL) tx4938ide_outb(tf->lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAM) tx4938ide_outb(tf->lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAH) tx4938ide_outb(tf->lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) tx4938ide_outb((tf->device & HIHI) | drive->select, io_ports->device_addr); } -static void tx4938ide_tf_read(ide_drive_t *drive, ide_task_t *task) +static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; + struct ide_taskfile *tf = &cmd->tf; - if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data; /* no endian swap */ @@ -144,32 +144,32 @@ static void tx4938ide_tf_read(ide_drive_t *drive, ide_task_t *task) /* be sure we're looking at the low order bits */ tx4938ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = tx4938ide_inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tx4938ide_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) tf->lbal = tx4938ide_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) tf->lbam = tx4938ide_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) tf->lbah = tx4938ide_inb(io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_IN_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_IN_DEVICE) tf->device = tx4938ide_inb(io_ports->device_addr); - if (task->tf_flags & IDE_TFLAG_LBA48) { + if (cmd->tf_flags & IDE_TFLAG_LBA48) { tx4938ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) + if (cmd->tf_flags & 
IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = tx4938ide_inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) tf->hob_nsect = tx4938ide_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) tf->hob_lbal = tx4938ide_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) tf->hob_lbam = tx4938ide_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) tf->hob_lbah = tx4938ide_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index ee86688d8461..bf11791476f0 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -435,7 +435,7 @@ static int tx4939ide_init_dma(ide_hwif_t *hwif, const struct ide_port_info *d) return ide_allocate_dma_engine(hwif); } -static void tx4939ide_tf_load_fixup(ide_drive_t *drive, ide_task_t *task) +static void tx4939ide_tf_load_fixup(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; void __iomem *base = TX4939IDE_BASE(hwif); @@ -463,59 +463,59 @@ static void tx4939ide_outb(u8 value, unsigned long port) __raw_writeb(value, (void __iomem *)port); } -static void tx4939ide_tf_load(ide_drive_t *drive, ide_task_t *task) +static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; - u8 HIHI = task->tf_flags & IDE_TFLAG_LBA48 ? 0xE0 : 0xEF; + struct ide_taskfile *tf = &cmd->tf; + u8 HIHI = cmd->tf_flags & IDE_TFLAG_LBA48 ? 0xE0 : 0xEF; - if (task->ftf_flags & IDE_FTFLAG_FLAGGED) + if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) HIHI = 0xFF; - if (task->ftf_flags & IDE_FTFLAG_OUT_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; /* no endian swap */ __raw_writew(data, (void __iomem *)io_ports->data_addr); } - if (task->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) tx4939ide_outb(tf->hob_feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) tx4939ide_outb(tf->hob_nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) tx4939ide_outb(tf->hob_lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) tx4939ide_outb(tf->hob_lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) tx4939ide_outb(tf->hob_lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_OUT_FEATURE) tx4939ide_outb(tf->feature, io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_OUT_NSECT) + if (cmd->tf_flags & IDE_TFLAG_OUT_NSECT) tx4939ide_outb(tf->nsect, io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAL) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAL) tx4939ide_outb(tf->lbal, io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAM) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAM) tx4939ide_outb(tf->lbam, io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_OUT_LBAH) + if (cmd->tf_flags & IDE_TFLAG_OUT_LBAH) tx4939ide_outb(tf->lbah, io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_OUT_DEVICE) { + if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) { 
tx4939ide_outb((tf->device & HIHI) | drive->select, io_ports->device_addr); - tx4939ide_tf_load_fixup(drive, task); + tx4939ide_tf_load_fixup(drive); } } -static void tx4939ide_tf_read(ide_drive_t *drive, ide_task_t *task) +static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; - struct ide_taskfile *tf = &task->tf; + struct ide_taskfile *tf = &cmd->tf; - if (task->ftf_flags & IDE_FTFLAG_IN_DATA) { + if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) { u16 data; /* no endian swap */ @@ -527,32 +527,32 @@ static void tx4939ide_tf_read(ide_drive_t *drive, ide_task_t *task) /* be sure we're looking at the low order bits */ tx4939ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) tf->feature = tx4939ide_inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tx4939ide_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) tf->lbal = tx4939ide_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAM) tf->lbam = tx4939ide_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_LBAH) tf->lbah = tx4939ide_inb(io_ports->lbah_addr); - if (task->tf_flags & IDE_TFLAG_IN_DEVICE) + if (cmd->tf_flags & IDE_TFLAG_IN_DEVICE) tf->device = tx4939ide_inb(io_ports->device_addr); - if (task->tf_flags & IDE_TFLAG_LBA48) { + if (cmd->tf_flags & IDE_TFLAG_LBA48) { tx4939ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = tx4939ide_inb(io_ports->feature_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) tf->hob_nsect = tx4939ide_inb(io_ports->nsect_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) tf->hob_lbal = tx4939ide_inb(io_ports->lbal_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) tf->hob_lbam = tx4939ide_inb(io_ports->lbam_addr); - if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) tf->hob_lbah = tx4939ide_inb(io_ports->lbah_addr); } } @@ -599,11 +599,12 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { #else /* __LITTLE_ENDIAN */ -static void tx4939ide_tf_load(ide_drive_t *drive, ide_task_t *task) +static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { - ide_tf_load(drive, task); - if (task->tf_flags & IDE_TFLAG_OUT_DEVICE) - tx4939ide_tf_load_fixup(drive, task); + ide_tf_load(drive, cmd); + + if (cmd->tf_flags & IDE_TFLAG_OUT_DEVICE) + tx4939ide_tf_load_fixup(drive); } static const struct ide_tp_ops tx4939ide_tp_ops = { diff --git a/include/linux/ide.h b/include/linux/ide.h index 2ee236d1f3ac..f0e3618c7257 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -295,7 +295,7 @@ enum { IDE_TFLAG_IN_DEVICE, /* force 16-bit I/O operations */ IDE_TFLAG_IO_16BIT = (1 << 26), - /* ide_task_t was allocated using kmalloc() */ + /* struct ide_cmd was allocated using kmalloc() */ IDE_TFLAG_DYN = (1 << 27), }; @@ -335,7 +335,7 @@ struct ide_taskfile { }; }; -typedef struct ide_task_s { +struct ide_cmd { union { struct ide_taskfile tf; u8 tf_array[14]; @@ -345,7 +345,7 @@ typedef struct ide_task_s 
{ int data_phase; struct request *rq; /* copy of request */ void *special; /* valid_t generally */ -} ide_task_t; +}; /* ATAPI packet command flags */ enum { @@ -652,8 +652,8 @@ struct ide_tp_ops { void (*set_irq)(struct hwif_s *, int); - void (*tf_load)(ide_drive_t *, struct ide_task_s *); - void (*tf_read)(ide_drive_t *, struct ide_task_s *); + void (*tf_load)(ide_drive_t *, struct ide_cmd *); + void (*tf_read)(ide_drive_t *, struct ide_cmd *); void (*input_data)(ide_drive_t *, struct request *, void *, unsigned int); @@ -775,7 +775,7 @@ typedef struct hwif_s { int orig_sg_nents; int sg_dma_direction; /* dma transfer direction */ - struct ide_task_s task; /* current command */ + struct ide_cmd cmd; /* current command */ unsigned int nsect; unsigned int nleft; @@ -1161,7 +1161,7 @@ extern ide_startstop_t ide_do_reset (ide_drive_t *); extern int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, int arg); -void ide_complete_task(ide_drive_t *, ide_task_t *, u8, u8); +void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8); void ide_complete_rq(ide_drive_t *, u8); void ide_tf_dump(const char *, struct ide_taskfile *); @@ -1172,8 +1172,8 @@ u8 ide_read_altstatus(ide_hwif_t *); void ide_set_irq(ide_hwif_t *, int); -void ide_tf_load(ide_drive_t *, ide_task_t *); -void ide_tf_read(ide_drive_t *, ide_task_t *); +void ide_tf_load(ide_drive_t *, struct ide_cmd *); +void ide_tf_read(ide_drive_t *, struct ide_cmd *); void ide_input_data(ide_drive_t *, struct request *, void *, unsigned int); void ide_output_data(ide_drive_t *, struct request *, void *, unsigned int); @@ -1224,14 +1224,14 @@ int ide_cd_get_xferlen(struct request *); ide_startstop_t ide_issue_pc(ide_drive_t *); -ide_startstop_t do_rw_taskfile(ide_drive_t *, ide_task_t *); +ide_startstop_t do_rw_taskfile(ide_drive_t *, struct ide_cmd *); void task_end_request(ide_drive_t *, struct request *, u8); -int ide_raw_taskfile(ide_drive_t *, ide_task_t *, u8 *, u16); -int ide_no_data_taskfile(ide_drive_t *, ide_task_t *); +int ide_raw_taskfile(ide_drive_t *, struct ide_cmd *, u8 *, u16); +int ide_no_data_taskfile(ide_drive_t *, struct ide_cmd *); -int ide_taskfile_ioctl(ide_drive_t *, unsigned int, unsigned long); +int ide_taskfile_ioctl(ide_drive_t *, unsigned long); int ide_dev_read_id(ide_drive_t *, u8, u16 *); -- cgit v1.2.3-71-gd317 From adb1af9803d167091c2cb4de14014185054bfe2c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:38 +0100 Subject: ide: pass command instead of request to ide_pio_datablock() * Add IDE_TFLAG_FS taskfile flag and set it for REQ_TYPE_FS requests. * Convert ->{in,out}put_data methods to take command instead of request as an argument. Then convert pre_task_out_intr(), task_end_request(), task_error(), task_in_unexpected(), ide_pio_sector(), ide_pio_multi() and ide_pio_datablock() in similar way. * Rename task_end_request() to ide_finish_cmd(). There should be no functional changes caused by this patch. 
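As a rough sketch of the resulting interface (the "exampleide" name is a
placeholder, modelled on the falconide/q40ide conversions in the diff below),
a host driver's ->input_data hook now receives the command and tests the new
taskfile flag where it previously looked at rq->cmd_type:

	static void exampleide_input_data(ide_drive_t *drive, struct ide_cmd *cmd,
					  void *buf, unsigned int len)
	{
		unsigned long data_addr = drive->hwif->io_ports.data_addr;

		/* fs requests are now recognized via the command's taskfile flags */
		if (drive->media == ide_disk && cmd && (cmd->tf_flags & IDE_TFLAG_FS)) {
			insw(data_addr, buf, (len + 1) / 2);
			return;
		}

		/* non-fs transfers keep using the driver's byte-swapping helper */
		raw_insw_swapw((u16 *)data_addr, buf, (len + 1) / 2);
	}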
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 4 +-- drivers/ide/au1xxx-ide.c | 4 +-- drivers/ide/falconide.c | 8 ++--- drivers/ide/ide-disk.c | 2 ++ drivers/ide/ide-dma.c | 4 +-- drivers/ide/ide-h8300.c | 4 +-- drivers/ide/ide-io-std.c | 4 +-- drivers/ide/ide-taskfile.c | 88 ++++++++++++++++++++++------------------------ drivers/ide/q40ide.c | 8 ++--- drivers/ide/scc_pata.c | 4 +-- drivers/ide/tx4938ide.c | 4 +-- include/linux/ide.h | 17 ++++----- 12 files changed, 76 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 6be7d87382ab..27547121daff 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -143,7 +143,7 @@ static void apply_timings(const u8 chipselect, const u8 pio, set_smc_timings(chipselect, cycle, setup, pulse, data_float, use_iordy); } -static void at91_ide_input_data(ide_drive_t *drive, struct request *rq, +static void at91_ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { ide_hwif_t *hwif = drive->hwif; @@ -160,7 +160,7 @@ static void at91_ide_input_data(ide_drive_t *drive, struct request *rq, leave_16bit(chipselect, mode); } -static void at91_ide_output_data(ide_drive_t *drive, struct request *rq, +static void at91_ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { ide_hwif_t *hwif = drive->hwif; diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 3fc3ced8192c..72d7d615e1fc 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -86,13 +86,13 @@ void auide_outsw(unsigned long port, void *addr, u32 count) ctp->cur_ptr = au1xxx_ddma_get_nextptr_virt(dp); } -static void au1xxx_input_data(ide_drive_t *drive, struct request *rq, +static void au1xxx_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { auide_insw(drive->hwif->io_ports.data_addr, buf, (len + 1) / 2); } -static void au1xxx_output_data(ide_drive_t *drive, struct request *rq, +static void au1xxx_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { auide_outsw(drive->hwif->io_ports.data_addr, buf, (len + 1) / 2); diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index 6085feb1fae8..b368a5effc3a 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -62,23 +62,23 @@ static void falconide_get_lock(irq_handler_t handler, void *data) } } -static void falconide_input_data(ide_drive_t *drive, struct request *rq, +static void falconide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { unsigned long data_addr = drive->hwif->io_ports.data_addr; - if (drive->media == ide_disk && rq && rq->cmd_type == REQ_TYPE_FS) + if (drive->media == ide_disk && cmd && (cmd->tf_flags & IDE_TFLAG_FS)) return insw(data_addr, buf, (len + 1) / 2); raw_insw_swapw((u16 *)data_addr, buf, (len + 1) / 2); } -static void falconide_output_data(ide_drive_t *drive, struct request *rq, +static void falconide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { unsigned long data_addr = drive->hwif->io_ports.data_addr; - if (drive->media == ide_disk && rq && rq->cmd_type == REQ_TYPE_FS) + if (drive->media == ide_disk && cmd && (cmd->tf_flags & IDE_TFLAG_FS)) return outsw(data_addr, buf, (len + 1) / 2); raw_outsw_swapw((u16 *)data_addr, buf, (len + 1) / 2); diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 6647cb8bd910..f1555dd4e6a5 100644 --- a/drivers/ide/ide-disk.c +++ 
b/drivers/ide/ide-disk.c @@ -156,6 +156,8 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, tf->device = head; } + cmd.tf_flags |= IDE_TFLAG_FS; + if (rq_data_dir(rq)) cmd.tf_flags |= IDE_TFLAG_WRITE; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 12c11b71402e..54f17ae9225d 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -96,9 +96,9 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) { if (!dma_stat) { - struct request *rq = hwif->rq; + struct ide_cmd *cmd = &hwif->cmd; - task_end_request(drive, rq, stat); + ide_finish_cmd(drive, cmd, stat); return ide_stopped; } printk(KERN_ERR "%s: %s: bad DMA status (0x%02x)\n", diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index c7883f23c66a..ff8339ed59ab 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -143,13 +143,13 @@ static void mm_insw(unsigned long addr, void *buf, u32 len) *bp = bswap(*(volatile u16 *)addr); } -static void h8300_input_data(ide_drive_t *drive, struct request *rq, +static void h8300_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { mm_insw(drive->hwif->io_ports.data_addr, buf, (len + 1) / 2); } -static void h8300_output_data(ide_drive_t *drive, struct request *rq, +static void h8300_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { mm_outsw(drive->hwif->io_ports.data_addr, buf, (len + 1) / 2); diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 570c0cc4514d..2d9c6dc3f956 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -219,7 +219,7 @@ static void ata_vlb_sync(unsigned long port) * so if an odd len is specified, be sure that there's at least one * extra byte allocated for the buffer. 
*/ -void ide_input_data(ide_drive_t *drive, struct request *rq, void *buf, +void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { ide_hwif_t *hwif = drive->hwif; @@ -265,7 +265,7 @@ EXPORT_SYMBOL_GPL(ide_input_data); /* * This is used for most PIO data transfers *to* the IDE interface */ -void ide_output_data(ide_drive_t *drive, struct request *rq, void *buf, +void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { ide_hwif_t *hwif = drive->hwif; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 2b85c137764a..d3bd93afbf2b 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -54,19 +54,20 @@ int taskfile_lib_get_identify (ide_drive_t *drive, u8 *buf) } static ide_startstop_t task_no_data_intr(ide_drive_t *); -static ide_startstop_t pre_task_out_intr(ide_drive_t *, struct request *); +static ide_startstop_t pre_task_out_intr(ide_drive_t *, struct ide_cmd *); static ide_startstop_t task_in_intr(ide_drive_t *); -ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd) +ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) { ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; struct ide_taskfile *tf = &cmd->tf; ide_handler_t *handler = NULL; const struct ide_tp_ops *tp_ops = hwif->tp_ops; const struct ide_dma_ops *dma_ops = hwif->dma_ops; - if (cmd->data_phase == TASKFILE_MULTI_IN || - cmd->data_phase == TASKFILE_MULTI_OUT) { + if (orig_cmd->data_phase == TASKFILE_MULTI_IN || + orig_cmd->data_phase == TASKFILE_MULTI_OUT) { if (!drive->mult_count) { printk(KERN_ERR "%s: multimode not set!\n", drive->name); @@ -74,10 +75,10 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd) } } - if (cmd->ftf_flags & IDE_FTFLAG_FLAGGED) - cmd->ftf_flags |= IDE_FTFLAG_SET_IN_FLAGS; + if (orig_cmd->ftf_flags & IDE_FTFLAG_FLAGGED) + orig_cmd->ftf_flags |= IDE_FTFLAG_SET_IN_FLAGS; - memcpy(&hwif->cmd, cmd, sizeof(*cmd)); + memcpy(cmd, orig_cmd, sizeof(*cmd)); if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { ide_tf_dump(drive->name, tf); @@ -91,7 +92,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd) case TASKFILE_OUT: tp_ops->exec_command(hwif, tf->command); ndelay(400); /* FIXME */ - return pre_task_out_intr(drive, cmd->rq); + return pre_task_out_intr(drive, cmd); case TASKFILE_MULTI_IN: case TASKFILE_IN: handler = task_in_intr; @@ -203,7 +204,7 @@ static u8 wait_drive_not_busy(ide_drive_t *drive) return stat; } -static void ide_pio_sector(ide_drive_t *drive, struct request *rq, +static void ide_pio_sector(ide_drive_t *drive, struct ide_cmd *cmd, unsigned int write) { ide_hwif_t *hwif = drive->hwif; @@ -244,9 +245,9 @@ static void ide_pio_sector(ide_drive_t *drive, struct request *rq, /* do the actual data transfer */ if (write) - hwif->tp_ops->output_data(drive, rq, buf, SECTOR_SIZE); + hwif->tp_ops->output_data(drive, cmd, buf, SECTOR_SIZE); else - hwif->tp_ops->input_data(drive, rq, buf, SECTOR_SIZE); + hwif->tp_ops->input_data(drive, cmd, buf, SECTOR_SIZE); kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM @@ -254,24 +255,23 @@ static void ide_pio_sector(ide_drive_t *drive, struct request *rq, #endif } -static void ide_pio_multi(ide_drive_t *drive, struct request *rq, +static void ide_pio_multi(ide_drive_t *drive, struct ide_cmd *cmd, unsigned int write) { unsigned int nsect; nsect = min_t(unsigned int, drive->hwif->nleft, drive->mult_count); while (nsect--) - 
ide_pio_sector(drive, rq, write); + ide_pio_sector(drive, cmd, write); } -static void ide_pio_datablock(ide_drive_t *drive, struct request *rq, - unsigned int write) +static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, + unsigned int write) { - struct ide_cmd *cmd = &drive->hwif->cmd; u8 saved_io_32bit = drive->io_32bit; - if (blk_fs_request(rq)) - rq->errors = 0; + if (cmd->tf_flags & IDE_TFLAG_FS) + cmd->rq->errors = 0; if (cmd->tf_flags & IDE_TFLAG_IO_16BIT) drive->io_32bit = 0; @@ -281,22 +281,21 @@ static void ide_pio_datablock(ide_drive_t *drive, struct request *rq, switch (cmd->data_phase) { case TASKFILE_MULTI_IN: case TASKFILE_MULTI_OUT: - ide_pio_multi(drive, rq, write); + ide_pio_multi(drive, cmd, write); break; default: - ide_pio_sector(drive, rq, write); + ide_pio_sector(drive, cmd, write); break; } drive->io_32bit = saved_io_32bit; } -static ide_startstop_t task_error(ide_drive_t *drive, struct request *rq, +static ide_startstop_t task_error(ide_drive_t *drive, struct ide_cmd *cmd, const char *s, u8 stat) { - if (blk_fs_request(rq)) { + if (cmd->tf_flags & IDE_TFLAG_FS) { ide_hwif_t *hwif = drive->hwif; - struct ide_cmd *cmd = &hwif->cmd; int sectors = hwif->nsect - hwif->nleft; switch (cmd->data_phase) { @@ -323,19 +322,17 @@ static ide_startstop_t task_error(ide_drive_t *drive, struct request *rq, return ide_error(drive, s, stat); } -void task_end_request(ide_drive_t *drive, struct request *rq, u8 stat) +void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat) { - if (blk_fs_request(rq) == 0) { - struct ide_cmd *cmd = rq->special; + if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) { u8 err = ide_read_error(drive); - if (cmd) - ide_complete_cmd(drive, cmd, stat, err); + ide_complete_cmd(drive, cmd, stat, err); ide_complete_rq(drive, err); return; } - ide_end_request(drive, 1, rq->nr_sectors); + ide_end_request(drive, 1, cmd->rq->nr_sectors); } /* @@ -344,11 +341,12 @@ void task_end_request(ide_drive_t *drive, struct request *rq, u8 stat) * It might be a spurious irq (shared irq), but it might be a * command that had no output. */ -static ide_startstop_t task_in_unexpected(ide_drive_t *drive, struct request *rq, u8 stat) +static ide_startstop_t task_in_unexpected(ide_drive_t *drive, + struct ide_cmd *cmd, u8 stat) { /* Command all done? */ if (OK_STAT(stat, ATA_DRDY, ATA_BUSY)) { - task_end_request(drive, rq, stat); + ide_finish_cmd(drive, cmd, stat); return ide_stopped; } @@ -363,25 +361,25 @@ static ide_startstop_t task_in_unexpected(ide_drive_t *drive, struct request *rq static ide_startstop_t task_in_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; - struct request *rq = hwif->rq; + struct ide_cmd *cmd = &drive->hwif->cmd; u8 stat = hwif->tp_ops->read_status(hwif); /* Error? */ if (stat & ATA_ERR) - return task_error(drive, rq, __func__, stat); + return task_error(drive, cmd, __func__, stat); /* Didn't want any data? Odd. */ if ((stat & ATA_DRQ) == 0) - return task_in_unexpected(drive, rq, stat); + return task_in_unexpected(drive, cmd, stat); - ide_pio_datablock(drive, rq, 0); + ide_pio_datablock(drive, cmd, 0); /* Are we done? Check status and finish transfer. 
*/ if (!hwif->nleft) { stat = wait_drive_not_busy(drive); if (!OK_STAT(stat, 0, BAD_STAT)) - return task_error(drive, rq, __func__, stat); - task_end_request(drive, rq, stat); + return task_error(drive, cmd, __func__, stat); + ide_finish_cmd(drive, cmd, stat); return ide_stopped; } @@ -397,31 +395,31 @@ static ide_startstop_t task_in_intr(ide_drive_t *drive) static ide_startstop_t task_out_intr (ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; - struct request *rq = hwif->rq; + struct ide_cmd *cmd = &drive->hwif->cmd; u8 stat = hwif->tp_ops->read_status(hwif); if (!OK_STAT(stat, DRIVE_READY, drive->bad_wstat)) - return task_error(drive, rq, __func__, stat); + return task_error(drive, cmd, __func__, stat); /* Deal with unexpected ATA data phase. */ if (((stat & ATA_DRQ) == 0) ^ !hwif->nleft) - return task_error(drive, rq, __func__, stat); + return task_error(drive, cmd, __func__, stat); if (!hwif->nleft) { - task_end_request(drive, rq, stat); + ide_finish_cmd(drive, cmd, stat); return ide_stopped; } /* Still data left to transfer. */ - ide_pio_datablock(drive, rq, 1); + ide_pio_datablock(drive, cmd, 1); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); return ide_started; } -static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, struct request *rq) +static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, + struct ide_cmd *cmd) { - struct ide_cmd *cmd = &drive->hwif->cmd; ide_startstop_t startstop; if (ide_wait_stat(&startstop, drive, ATA_DRQ, @@ -437,7 +435,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, struct request *rq) local_irq_disable(); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); - ide_pio_datablock(drive, rq, 1); + ide_pio_datablock(drive, cmd, 1); return ide_started; } diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index 32f669d656a6..2a43a2f49633 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -72,23 +72,23 @@ static void q40_ide_setup_ports(hw_regs_t *hw, unsigned long base, hw->chipset = ide_generic; } -static void q40ide_input_data(ide_drive_t *drive, struct request *rq, +static void q40ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { unsigned long data_addr = drive->hwif->io_ports.data_addr; - if (drive->media == ide_disk && rq && rq->cmd_type == REQ_TYPE_FS) + if (drive->media == ide_disk && cmd && (cmd->tf_flags & IDE_TFLAG_FS)) return insw(data_addr, buf, (len + 1) / 2); raw_insw_swapw((u16 *)data_addr, buf, (len + 1) / 2); } -static void q40ide_output_data(ide_drive_t *drive, struct request *rq, +static void q40ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { unsigned long data_addr = drive->hwif->io_ports.data_addr; - if (drive->media == ide_disk && rq && rq->cmd_type == REQ_TYPE_FS) + if (drive->media == ide_disk && cmd && (cmd->tf_flags & IDE_TFLAG_FS)) return outsw(data_addr, buf, (len + 1) / 2); raw_outsw_swapw((u16 *)data_addr, buf, (len + 1) / 2); diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index d6336753bd2c..ada866744622 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -750,7 +750,7 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } } -static void scc_input_data(ide_drive_t *drive, struct request *rq, +static void scc_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { unsigned long data_addr = drive->hwif->io_ports.data_addr; @@ -766,7 +766,7 @@ static void scc_input_data(ide_drive_t *drive, struct request *rq, 
scc_ide_insw(data_addr, buf, len / 2); } -static void scc_output_data(ide_drive_t *drive, struct request *rq, +static void scc_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { unsigned long data_addr = drive->hwif->io_ports.data_addr; diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index 947596d3620c..657a61890b1c 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -174,7 +174,7 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } } -static void tx4938ide_input_data_swap(ide_drive_t *drive, struct request *rq, +static void tx4938ide_input_data_swap(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { unsigned long port = drive->hwif->io_ports.data_addr; @@ -186,7 +186,7 @@ static void tx4938ide_input_data_swap(ide_drive_t *drive, struct request *rq, __ide_flush_dcache_range((unsigned long)buf, roundup(len, 2)); } -static void tx4938ide_output_data_swap(ide_drive_t *drive, struct request *rq, +static void tx4938ide_output_data_swap(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned int len) { unsigned long port = drive->hwif->io_ports.data_addr; diff --git a/include/linux/ide.h b/include/linux/ide.h index f0e3618c7257..1785582e1f86 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -297,6 +297,7 @@ enum { IDE_TFLAG_IO_16BIT = (1 << 26), /* struct ide_cmd was allocated using kmalloc() */ IDE_TFLAG_DYN = (1 << 27), + IDE_TFLAG_FS = (1 << 28), }; enum { @@ -655,10 +656,10 @@ struct ide_tp_ops { void (*tf_load)(ide_drive_t *, struct ide_cmd *); void (*tf_read)(ide_drive_t *, struct ide_cmd *); - void (*input_data)(ide_drive_t *, struct request *, void *, - unsigned int); - void (*output_data)(ide_drive_t *, struct request *, void *, - unsigned int); + void (*input_data)(ide_drive_t *, struct ide_cmd *, + void *, unsigned int); + void (*output_data)(ide_drive_t *, struct ide_cmd *, + void *, unsigned int); }; extern const struct ide_tp_ops default_tp_ops; @@ -866,7 +867,7 @@ typedef ide_startstop_t (ide_handler_t)(ide_drive_t *); typedef int (ide_expiry_t)(ide_drive_t *); /* used by ide-cd, ide-floppy, etc. 
*/ -typedef void (xfer_func_t)(ide_drive_t *, struct request *rq, void *, unsigned); +typedef void (xfer_func_t)(ide_drive_t *, struct ide_cmd *, void *, unsigned); extern struct mutex ide_setting_mtx; @@ -1175,8 +1176,8 @@ void ide_set_irq(ide_hwif_t *, int); void ide_tf_load(ide_drive_t *, struct ide_cmd *); void ide_tf_read(ide_drive_t *, struct ide_cmd *); -void ide_input_data(ide_drive_t *, struct request *, void *, unsigned int); -void ide_output_data(ide_drive_t *, struct request *, void *, unsigned int); +void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); +void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); int ide_io_buffers(ide_drive_t *, struct ide_atapi_pc *, unsigned int, int); @@ -1226,7 +1227,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *); ide_startstop_t do_rw_taskfile(ide_drive_t *, struct ide_cmd *); -void task_end_request(ide_drive_t *, struct request *, u8); +void ide_finish_cmd(ide_drive_t *, struct ide_cmd *, u8); int ide_raw_taskfile(ide_drive_t *, struct ide_cmd *, u8 *, u16); int ide_no_data_taskfile(ide_drive_t *, struct ide_cmd *); -- cgit v1.2.3-71-gd317 From b6308ee0c55acd2e943d849773c9f0a49c516317 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:38 +0100 Subject: ide: move command related fields from ide_hwif_t to struct ide_cmd * Move command related fields from ide_hwif_t to struct ide_cmd. * Make ide_init_sg_cmd() take command and sectors number as arguments. There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/au1xxx-ide.c | 2 +- drivers/ide/icside.c | 2 +- drivers/ide/ide-disk.c | 12 ++++++------ drivers/ide/ide-dma-sff.c | 2 +- drivers/ide/ide-dma.c | 16 +++++++++------- drivers/ide/ide-floppy.c | 5 +++-- drivers/ide/ide-io.c | 24 ++++++++++-------------- drivers/ide/ide-taskfile.c | 33 ++++++++++++++++----------------- drivers/ide/pmac.c | 2 +- drivers/ide/sgiioc4.c | 2 +- drivers/ide/tx4939ide.c | 2 +- include/linux/ide.h | 20 +++++++++++--------- 12 files changed, 61 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 72d7d615e1fc..3ace0cda5452 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -215,7 +215,7 @@ static int auide_build_dmatable(ide_drive_t *drive) struct request *rq = hwif->rq; _auide_hwif *ahwif = &auide_hwif; struct scatterlist *sg; - int i = hwif->sg_nents, iswrite, count = 0; + int i = hwif->cmd.sg_nents, iswrite, count = 0; iswrite = (rq_data_dir(rq) == WRITE); /* Save for interrupt context */ diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 78fc36f98d29..bdfeb1222d52 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -344,7 +344,7 @@ static int icside_dma_setup(ide_drive_t *drive) * Tell the DMA engine about the SG table and * data direction. 
*/ - set_dma_sg(ec->dma, hwif->sg_table, hwif->sg_nents); + set_dma_sg(ec->dma, hwif->sg_table, hwif->cmd.sg_nents); set_dma_mode(ec->dma, dma_mode); drive->waiting_for_dma = 1; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index f1555dd4e6a5..d00d807c0f53 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -104,14 +104,14 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, lba48 = 0; } - if (!dma) { - ide_init_sg_cmd(drive, rq); - ide_map_sg(drive, rq); - } - memset(&cmd, 0, sizeof(cmd)); cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + if (dma == 0) { + ide_init_sg_cmd(&cmd, nsectors); + ide_map_sg(drive, rq); + } + if (drive->dev_flags & IDE_DFLAG_LBA) { if (lba48) { pr_debug("%s: LBA=0x%012llx\n", drive->name, @@ -170,7 +170,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, /* fallback to PIO */ cmd.tf_flags |= IDE_TFLAG_DMA_PIO_FALLBACK; ide_tf_set_cmd(drive, &cmd, 0); - ide_init_sg_cmd(drive, rq); + ide_init_sg_cmd(&cmd, nsectors); rc = do_rw_taskfile(drive, &cmd); } diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 22b3e751d19b..7bf28a9b6f65 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -120,7 +120,7 @@ int ide_build_dmatable(ide_drive_t *drive, struct request *rq) struct scatterlist *sg; u8 is_trm290 = !!(hwif->host_flags & IDE_HFLAG_TRM290); - for_each_sg(hwif->sg_table, sg, hwif->sg_nents, i) { + for_each_sg(hwif->sg_table, sg, hwif->cmd.sg_nents, i) { u32 cur_addr, cur_len, xcount, bcount; cur_addr = sg_dma_address(sg); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 54f17ae9225d..cba9fe585d87 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -128,21 +128,22 @@ int ide_build_sglist(ide_drive_t *drive, struct request *rq) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; + struct ide_cmd *cmd = &hwif->cmd; int i; ide_map_sg(drive, rq); if (rq_data_dir(rq) == READ) - hwif->sg_dma_direction = DMA_FROM_DEVICE; + cmd->sg_dma_direction = DMA_FROM_DEVICE; else - hwif->sg_dma_direction = DMA_TO_DEVICE; + cmd->sg_dma_direction = DMA_TO_DEVICE; - i = dma_map_sg(hwif->dev, sg, hwif->sg_nents, hwif->sg_dma_direction); + i = dma_map_sg(hwif->dev, sg, cmd->sg_nents, cmd->sg_dma_direction); if (i == 0) ide_map_sg(drive, rq); else { - hwif->orig_sg_nents = hwif->sg_nents; - hwif->sg_nents = i; + cmd->orig_sg_nents = cmd->sg_nents; + cmd->sg_nents = i; } return i; @@ -162,9 +163,10 @@ int ide_build_sglist(ide_drive_t *drive, struct request *rq) void ide_destroy_dmatable(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; - dma_unmap_sg(hwif->dev, hwif->sg_table, hwif->orig_sg_nents, - hwif->sg_dma_direction); + dma_unmap_sg(hwif->dev, hwif->sg_table, cmd->orig_sg_nents, + cmd->sg_dma_direction); } EXPORT_SYMBOL_GPL(ide_destroy_dmatable); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 5625946739ad..f56e9a918b99 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -244,6 +244,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, { struct ide_disk_obj *floppy = drive->driver_data; ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; struct ide_atapi_pc *pc; if (drive->debug_mask & IDE_DBG_RQ) @@ -285,12 +286,12 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, } if (blk_fs_request(rq) || pc->req_xfer) { - ide_init_sg_cmd(drive, rq); + ide_init_sg_cmd(cmd, 
rq->nr_sectors); ide_map_sg(drive, rq); } pc->sg = hwif->sg_table; - pc->sg_cnt = hwif->sg_nents; + pc->sg_cnt = cmd->sg_nents; pc->rq = rq; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 2900271c6ddd..7917fa09bf15 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -274,30 +274,26 @@ static ide_startstop_t do_special (ide_drive_t *drive) void ide_map_sg(ide_drive_t *drive, struct request *rq) { ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; struct scatterlist *sg = hwif->sg_table; if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { sg_init_one(sg, rq->buffer, rq->nr_sectors * SECTOR_SIZE); - hwif->sg_nents = 1; + cmd->sg_nents = 1; } else if (!rq->bio) { sg_init_one(sg, rq->data, rq->data_len); - hwif->sg_nents = 1; - } else { - hwif->sg_nents = blk_rq_map_sg(drive->queue, rq, sg); - } + cmd->sg_nents = 1; + } else + cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg); } - EXPORT_SYMBOL_GPL(ide_map_sg); -void ide_init_sg_cmd(ide_drive_t *drive, struct request *rq) +void ide_init_sg_cmd(struct ide_cmd *cmd, int nsect) { - ide_hwif_t *hwif = drive->hwif; - - hwif->nsect = hwif->nleft = rq->nr_sectors; - hwif->cursg_ofs = 0; - hwif->cursg = NULL; + cmd->nsect = cmd->nleft = nsect; + cmd->cursg_ofs = 0; + cmd->cursg = NULL; } - EXPORT_SYMBOL_GPL(ide_init_sg_cmd); /** @@ -323,7 +319,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, case TASKFILE_OUT: case TASKFILE_MULTI_IN: case TASKFILE_IN: - ide_init_sg_cmd(drive, rq); + ide_init_sg_cmd(cmd, rq->nr_sectors); ide_map_sg(drive, rq); default: break; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index d3bd93afbf2b..249a707f88a4 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -209,7 +209,7 @@ static void ide_pio_sector(ide_drive_t *drive, struct ide_cmd *cmd, { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; - struct scatterlist *cursg = hwif->cursg; + struct scatterlist *cursg = cmd->cursg; struct page *page; #ifdef CONFIG_HIGHMEM unsigned long flags; @@ -217,14 +217,14 @@ static void ide_pio_sector(ide_drive_t *drive, struct ide_cmd *cmd, unsigned int offset; u8 *buf; - cursg = hwif->cursg; + cursg = cmd->cursg; if (!cursg) { cursg = sg; - hwif->cursg = sg; + cmd->cursg = sg; } page = sg_page(cursg); - offset = cursg->offset + hwif->cursg_ofs * SECTOR_SIZE; + offset = cursg->offset + cmd->cursg_ofs * SECTOR_SIZE; /* get the current page and offset */ page = nth_page(page, (offset >> PAGE_SHIFT)); @@ -235,12 +235,12 @@ static void ide_pio_sector(ide_drive_t *drive, struct ide_cmd *cmd, #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; - hwif->nleft--; - hwif->cursg_ofs++; + cmd->nleft--; + cmd->cursg_ofs++; - if ((hwif->cursg_ofs * SECTOR_SIZE) == cursg->length) { - hwif->cursg = sg_next(hwif->cursg); - hwif->cursg_ofs = 0; + if ((cmd->cursg_ofs * SECTOR_SIZE) == cursg->length) { + cmd->cursg = sg_next(cmd->cursg); + cmd->cursg_ofs = 0; } /* do the actual data transfer */ @@ -260,7 +260,7 @@ static void ide_pio_multi(ide_drive_t *drive, struct ide_cmd *cmd, { unsigned int nsect; - nsect = min_t(unsigned int, drive->hwif->nleft, drive->mult_count); + nsect = min_t(unsigned int, cmd->nleft, drive->mult_count); while (nsect--) ide_pio_sector(drive, cmd, write); } @@ -295,19 +295,18 @@ static ide_startstop_t task_error(ide_drive_t *drive, struct ide_cmd *cmd, const char *s, u8 stat) { if (cmd->tf_flags & IDE_TFLAG_FS) { - ide_hwif_t *hwif = drive->hwif; - int sectors = hwif->nsect - hwif->nleft; + int sectors 
= cmd->nsect - cmd->nleft; switch (cmd->data_phase) { case TASKFILE_IN: - if (hwif->nleft) + if (cmd->nleft) break; /* fall through */ case TASKFILE_OUT: sectors--; break; case TASKFILE_MULTI_IN: - if (hwif->nleft) + if (cmd->nleft) break; /* fall through */ case TASKFILE_MULTI_OUT: @@ -375,7 +374,7 @@ static ide_startstop_t task_in_intr(ide_drive_t *drive) ide_pio_datablock(drive, cmd, 0); /* Are we done? Check status and finish transfer. */ - if (!hwif->nleft) { + if (cmd->nleft == 0) { stat = wait_drive_not_busy(drive); if (!OK_STAT(stat, 0, BAD_STAT)) return task_error(drive, cmd, __func__, stat); @@ -402,10 +401,10 @@ static ide_startstop_t task_out_intr (ide_drive_t *drive) return task_error(drive, cmd, __func__, stat); /* Deal with unexpected ATA data phase. */ - if (((stat & ATA_DRQ) == 0) ^ !hwif->nleft) + if (((stat & ATA_DRQ) == 0) ^ (cmd->nleft == 0)) return task_error(drive, cmd, __func__, stat); - if (!hwif->nleft) { + if (cmd->nleft == 0) { ide_finish_cmd(drive, cmd, stat); return ide_stopped; } diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 904fb54668e8..f5b85f4c1b65 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1432,7 +1432,7 @@ pmac_ide_build_dmatable(ide_drive_t *drive, struct request *rq) volatile struct dbdma_regs __iomem *dma = pmif->dma_regs; struct scatterlist *sg; int wr = (rq_data_dir(rq) == WRITE); - int i = hwif->sg_nents, count = 0; + int i = hwif->cmd.sg_nents, count = 0; /* DMA table is already aligned */ table = (struct dbdma_cmd *) pmif->dma_table_cpu; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index ab9433a7ad1f..b0769e96d32f 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -429,7 +429,7 @@ sgiioc4_build_dma_table(ide_drive_t * drive, struct request *rq, int ddir) { ide_hwif_t *hwif = drive->hwif; unsigned int *table = hwif->dmatable_cpu; - unsigned int count = 0, i = hwif->sg_nents; + unsigned int count = 0, i = hwif->cmd.sg_nents; struct scatterlist *sg = hwif->sg_table; while (i && sg_dma_len(sg)) { diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index bf11791476f0..8d155ec8cca9 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -240,7 +240,7 @@ static int tx4939ide_build_dmatable(ide_drive_t *drive, struct request *rq) int i; struct scatterlist *sg; - for_each_sg(hwif->sg_table, sg, hwif->sg_nents, i) { + for_each_sg(hwif->sg_table, sg, hwif->cmd.sg_nents, i) { u32 cur_addr, cur_len, bcount; cur_addr = sg_dma_address(sg); diff --git a/include/linux/ide.h b/include/linux/ide.h index 1785582e1f86..02128e9241d1 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -344,6 +344,16 @@ struct ide_cmd { u8 ftf_flags; /* for TASKFILE ioctl */ u32 tf_flags; int data_phase; + + int sg_nents; /* number of sg entries */ + int orig_sg_nents; + int sg_dma_direction; /* DMA transfer direction */ + + unsigned int nsect; + unsigned int nleft; + struct scatterlist *cursg; + unsigned int cursg_ofs; + struct request *rq; /* copy of request */ void *special; /* valid_t generally */ }; @@ -772,17 +782,9 @@ typedef struct hwif_s { /* Scatter-gather list used to build the above */ struct scatterlist *sg_table; int sg_max_nents; /* Maximum number of entries in it */ - int sg_nents; /* Current number of entries in it */ - int orig_sg_nents; - int sg_dma_direction; /* dma transfer direction */ struct ide_cmd cmd; /* current command */ - unsigned int nsect; - unsigned int nleft; - struct scatterlist *cursg; - unsigned int cursg_ofs; - int rqsize; /* max sectors per request */ int 
irq; /* our irq number */ @@ -1410,7 +1412,7 @@ int ide_pci_resume(struct pci_dev *); #endif void ide_map_sg(ide_drive_t *, struct request *); -void ide_init_sg_cmd(ide_drive_t *, struct request *); +void ide_init_sg_cmd(struct ide_cmd *, int); #define BAD_DMA_DRIVE 0 #define GOOD_DMA_DRIVE 1 -- cgit v1.2.3-71-gd317 From 0dfb991c6943c810175376b58d1c29cfe532541b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:39 +0100 Subject: ide: use ata_tf_protocols enums * Add IDE_TFLAG_MULTI_PIO taskfile flag and set it for commands using multi-PIO protocol. * Use ata_tf_protocols enums instead of TASKFILE_* defines to denote command's protocol and then rename ->data_phase field to ->protocol. * Remove no longer needed includes. There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-disk.c | 34 +++++++------------ drivers/ide/ide-disk_proc.c | 5 ++- drivers/ide/ide-io.c | 11 ++---- drivers/ide/ide-ioctls.c | 2 +- drivers/ide/ide-park.c | 4 +-- drivers/ide/ide-pm.c | 5 ++- drivers/ide/ide-taskfile.c | 83 +++++++++++++++++++-------------------------- include/linux/ide.h | 3 +- 8 files changed, 58 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index d00d807c0f53..dae9d988de10 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -53,15 +52,6 @@ static const u8 ide_rw_cmds[] = { ATA_CMD_WRITE_EXT, }; -static const u8 ide_data_phases[] = { - TASKFILE_MULTI_IN, - TASKFILE_MULTI_OUT, - TASKFILE_IN, - TASKFILE_OUT, - TASKFILE_IN_DMA, - TASKFILE_OUT_DMA, -}; - static void ide_tf_set_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 dma) { u8 index, lba48, write; @@ -69,17 +59,19 @@ static void ide_tf_set_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 dma) lba48 = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 2 : 0; write = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 0; - if (dma) + if (dma) { + cmd->protocol = ATA_PROT_DMA; index = 8; - else - index = drive->mult_count ? 
0 : 4; + } else { + cmd->protocol = ATA_PROT_PIO; + if (drive->mult_count) { + cmd->tf_flags |= IDE_TFLAG_MULTI_PIO; + index = 0; + } else + index = 4; + } cmd->tf.command = ide_rw_cmds[index + lba48 + write]; - - if (dma) - index = 8; /* fixup index */ - - cmd->data_phase = ide_data_phases[index / 2 + write]; } /* @@ -401,9 +393,9 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) cmd->tf.command = ATA_CMD_FLUSH_EXT; else cmd->tf.command = ATA_CMD_FLUSH; - cmd->tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE | - IDE_TFLAG_DYN; - cmd->data_phase = TASKFILE_NO_DATA; + cmd->tf_flags = IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE | + IDE_TFLAG_DYN; + cmd->protocol = ATA_PROT_NODATA; rq->cmd_type = REQ_TYPE_ATA_TASKFILE; rq->cmd_flags |= REQ_SOFTBARRIER; diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c index afe4f47e9e19..eaea3bef2073 100644 --- a/drivers/ide/ide-disk_proc.c +++ b/drivers/ide/ide-disk_proc.c @@ -1,6 +1,5 @@ #include #include -#include #include "ide-disk.h" @@ -30,8 +29,8 @@ static int get_smart_data(ide_drive_t *drive, u8 *buf, u8 sub_cmd) tf->lbam = ATA_SMART_LBAM_PASS; tf->lbah = ATA_SMART_LBAH_PASS; tf->command = ATA_CMD_SMART; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - cmd.data_phase = TASKFILE_IN; + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.protocol = ATA_PROT_PIO; return ide_raw_taskfile(drive, &cmd, buf, 1); } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 7917fa09bf15..c27eaab1ffcf 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -220,7 +219,7 @@ static ide_startstop_t ide_disk_special(ide_drive_t *drive) struct ide_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - cmd.data_phase = TASKFILE_NO_DATA; + cmd.protocol = ATA_PROT_NODATA; if (s->b.set_geometry) { s->b.set_geometry = 0; @@ -314,15 +313,9 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, struct ide_cmd *cmd = rq->special; if (cmd) { - switch (cmd->data_phase) { - case TASKFILE_MULTI_OUT: - case TASKFILE_OUT: - case TASKFILE_MULTI_IN: - case TASKFILE_IN: + if (cmd->protocol == ATA_PROT_PIO) { ide_init_sg_cmd(cmd, rq->nr_sectors); ide_map_sg(drive, rq); - default: - break; } return do_rw_taskfile(drive, cmd); diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 4953028a13d4..770142767437 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -148,7 +148,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) IDE_TFLAG_IN_NSECT; } tf->command = args[0]; - cmd.data_phase = args[3] ? TASKFILE_IN : TASKFILE_NO_DATA; + cmd.protocol = args[3] ? 
ATA_PROT_PIO : ATA_PROT_NODATA; if (args[3]) { cmd.tf_flags |= IDE_TFLAG_IO_16BIT; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 63c77f99a726..c575900d5596 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -1,6 +1,5 @@ #include #include -#include #include #include @@ -80,8 +79,9 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq) tf->command = ATA_CMD_CHK_POWER; cmd.tf_flags |= IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.protocol = ATA_PROT_NODATA; + cmd.rq = rq; - cmd.data_phase = TASKFILE_NO_DATA; return do_rw_taskfile(drive, &cmd); } diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 5c9fc20f95b5..ebf2d21ebdcb 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -1,6 +1,5 @@ #include #include -#include int generic_ide_suspend(struct device *dev, pm_message_t mesg) { @@ -164,8 +163,8 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) return ide_stopped; out_do_tf: - cmd->tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - cmd->data_phase = TASKFILE_NO_DATA; + cmd->tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd->protocol = ATA_PROT_NODATA; return do_rw_taskfile(drive, cmd); } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 647216c772d9..0c9d71485728 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -47,8 +47,8 @@ int taskfile_lib_get_identify (ide_drive_t *drive, u8 *buf) cmd.tf.command = ATA_CMD_ID_ATA; else cmd.tf.command = ATA_CMD_ID_ATAPI; - cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - cmd.data_phase = TASKFILE_IN; + cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; + cmd.protocol = ATA_PROT_PIO; return ide_raw_taskfile(drive, &cmd, buf, 1); } @@ -66,13 +66,11 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) const struct ide_tp_ops *tp_ops = hwif->tp_ops; const struct ide_dma_ops *dma_ops = hwif->dma_ops; - if (orig_cmd->data_phase == TASKFILE_MULTI_IN || - orig_cmd->data_phase == TASKFILE_MULTI_OUT) { - if (!drive->mult_count) { - printk(KERN_ERR "%s: multimode not set!\n", - drive->name); - return ide_stopped; - } + if (orig_cmd->protocol == ATA_PROT_PIO && + (orig_cmd->tf_flags & IDE_TFLAG_MULTI_PIO) && + drive->mult_count == 0) { + printk(KERN_ERR "%s: multimode not set!\n", drive->name); + return ide_stopped; } if (orig_cmd->ftf_flags & IDE_FTFLAG_FLAGGED) @@ -87,17 +85,16 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) tp_ops->tf_load(drive, cmd); } - switch (cmd->data_phase) { - case TASKFILE_MULTI_OUT: - case TASKFILE_OUT: - tp_ops->exec_command(hwif, tf->command); - ndelay(400); /* FIXME */ - return pre_task_out_intr(drive, cmd); - case TASKFILE_MULTI_IN: - case TASKFILE_IN: + switch (cmd->protocol) { + case ATA_PROT_PIO: + if (cmd->tf_flags & IDE_TFLAG_WRITE) { + tp_ops->exec_command(hwif, tf->command); + ndelay(400); /* FIXME */ + return pre_task_out_intr(drive, cmd); + } handler = task_in_intr; /* fall-through */ - case TASKFILE_NO_DATA: + case ATA_PROT_NODATA: if (handler == NULL) handler = task_no_data_intr; ide_execute_command(drive, tf->command, handler, @@ -115,9 +112,6 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) } EXPORT_SYMBOL_GPL(do_rw_taskfile); -/* - * Handler for commands without a data phase - */ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; @@ -278,15 +272,10 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, 
touch_softlockup_watchdog(); - switch (cmd->data_phase) { - case TASKFILE_MULTI_IN: - case TASKFILE_MULTI_OUT: + if (cmd->tf_flags & IDE_TFLAG_MULTI_PIO) ide_pio_multi(drive, cmd, write); - break; - default: + else ide_pio_sector(drive, cmd, write); - break; - } drive->io_32bit = saved_io_32bit; } @@ -297,22 +286,12 @@ static ide_startstop_t task_error(ide_drive_t *drive, struct ide_cmd *cmd, if (cmd->tf_flags & IDE_TFLAG_FS) { int sectors = cmd->nsect - cmd->nleft; - switch (cmd->data_phase) { - case TASKFILE_IN: - if (cmd->nleft) - break; - /* fall through */ - case TASKFILE_OUT: - sectors--; - break; - case TASKFILE_MULTI_IN: - if (cmd->nleft) - break; - /* fall through */ - case TASKFILE_MULTI_OUT: - sectors -= drive->mult_count; - default: - break; + if (cmd->protocol == ATA_PROT_PIO && + ((cmd->tf_flags & IDE_TFLAG_WRITE) || cmd->nleft == 0)) { + if (cmd->tf_flags & IDE_TFLAG_MULTI_PIO) + sectors -= drive->mult_count; + else + sectors--; } if (sectors > 0) @@ -425,7 +404,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, drive->bad_wstat, WAIT_DRQ)) { printk(KERN_ERR "%s: no DRQ after issuing %sWRITE%s\n", drive->name, - cmd->data_phase == TASKFILE_MULTI_OUT ? "MULT" : "", + (cmd->tf_flags & IDE_TFLAG_MULTI_PIO) ? "MULT" : "", (drive->dev_flags & IDE_DFLAG_LBA48) ? "_EXT" : ""); return startstop; } @@ -474,7 +453,7 @@ EXPORT_SYMBOL(ide_raw_taskfile); int ide_no_data_taskfile(ide_drive_t *drive, struct ide_cmd *cmd) { - cmd->data_phase = TASKFILE_NO_DATA; + cmd->protocol = ATA_PROT_NODATA; return ide_raw_taskfile(drive, cmd, NULL, 0); } @@ -545,7 +524,6 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) memcpy(&cmd.tf_array[6], req_task->io_ports, HDIO_DRIVE_TASK_HDR_SIZE); - cmd.data_phase = req_task->data_phase; cmd.tf_flags = IDE_TFLAG_IO_16BIT | IDE_TFLAG_DEVICE | IDE_TFLAG_IN_TF; @@ -590,10 +568,12 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) /* fixup data phase if needed */ if (req_task->data_phase == TASKFILE_IN_DMAQ || req_task->data_phase == TASKFILE_IN_DMA) - cmd.data_phase = TASKFILE_OUT_DMA; + cmd.tf_flags |= IDE_TFLAG_WRITE; } - switch (cmd.data_phase) { + cmd.protocol = ATA_PROT_DMA; + + switch (req_task->data_phase) { case TASKFILE_MULTI_OUT: if (!drive->mult_count) { /* (hs): give up if multcount is not set */ @@ -603,8 +583,10 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) err = -EPERM; goto abort; } + cmd.tf_flags |= IDE_TFLAG_MULTI_PIO; /* fall through */ case TASKFILE_OUT: + cmd.protocol = ATA_PROT_PIO; /* fall through */ case TASKFILE_OUT_DMAQ: case TASKFILE_OUT_DMA: @@ -621,8 +603,10 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) err = -EPERM; goto abort; } + cmd.tf_flags |= IDE_TFLAG_MULTI_PIO; /* fall through */ case TASKFILE_IN: + cmd.protocol = ATA_PROT_PIO; /* fall through */ case TASKFILE_IN_DMAQ: case TASKFILE_IN_DMA: @@ -630,6 +614,7 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) data_buf = inbuf; break; case TASKFILE_NO_DATA: + cmd.protocol = ATA_PROT_NODATA; break; default: err = -EFAULT; diff --git a/include/linux/ide.h b/include/linux/ide.h index 02128e9241d1..f9decc9852e9 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -298,6 +298,7 @@ enum { /* struct ide_cmd was allocated using kmalloc() */ IDE_TFLAG_DYN = (1 << 27), IDE_TFLAG_FS = (1 << 28), + IDE_TFLAG_MULTI_PIO = (1 << 29), }; enum { @@ -343,7 +344,7 @@ struct ide_cmd { }; u8 ftf_flags; /* for TASKFILE ioctl */ u32 tf_flags; - int data_phase; + int protocol; int sg_nents; /* 
number of sg entries */ int orig_sg_nents; -- cgit v1.2.3-71-gd317 From 1caf236dafb7291f9fdfe54b12dd945aec0dca03 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:42 +0100 Subject: ide: add ide_end_rq() (v2) * Move request dequeuing from __ide_end_request() to ide_end_request(). * Rename __ide_end_request() to ide_end_rq() and export it. * Fix ide_end_rq() to pass original blk_end_request() return value. * ide_end_dequeued_request() is used only in cdrom_end_request() so inline it there and then remove the function. v2: * Remove needless BUG_ON() while at it (start_request()'s one is enough). There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 4 ++-- drivers/ide/ide-io.c | 45 ++++++++++----------------------------------- include/linux/ide.h | 2 +- 3 files changed, 13 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index bb804ae57bc5..830fd570e760 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -272,8 +272,8 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate) * now end the failed request */ if (blk_fs_request(failed)) { - if (ide_end_dequeued_request(drive, failed, 0, - failed->hard_nr_sectors)) + if (ide_end_rq(drive, failed, 0, + failed->hard_nr_sectors << 9)) BUG(); } else { if (blk_end_request(failed, -EIO, diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 0873887194f7..e5fcb283702a 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -54,10 +54,9 @@ #include #include -static int __ide_end_request(ide_drive_t *drive, struct request *rq, - int uptodate, unsigned int nr_bytes, int dequeue) +int ide_end_rq(ide_drive_t *drive, struct request *rq, int uptodate, + unsigned int nr_bytes) { - int ret = 1; int error = 0; if (uptodate <= 0) @@ -83,14 +82,9 @@ static int __ide_end_request(ide_drive_t *drive, struct request *rq, ide_dma_on(drive); } - if (!blk_end_request(rq, error, nr_bytes)) - ret = 0; - - if (ret == 0 && dequeue) - drive->hwif->rq = NULL; - - return ret; + return blk_end_request(rq, error, nr_bytes); } +EXPORT_SYMBOL_GPL(ide_end_rq); /** * ide_end_request - complete an IDE I/O @@ -107,6 +101,7 @@ int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) { unsigned int nr_bytes = nr_sectors << 9; struct request *rq = drive->hwif->rq; + int rc; if (!nr_bytes) { if (blk_pc_request(rq)) @@ -115,33 +110,13 @@ int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) nr_bytes = rq->hard_cur_sectors << 9; } - return __ide_end_request(drive, rq, uptodate, nr_bytes, 1); -} -EXPORT_SYMBOL(ide_end_request); - -/** - * ide_end_dequeued_request - complete an IDE I/O - * @drive: IDE device for the I/O - * @uptodate: - * @nr_sectors: number of sectors completed - * - * Complete an I/O that is no longer on the request queue. This - * typically occurs when we pull the request and issue a REQUEST_SENSE. - * We must still finish the old request but we must not tamper with the - * queue in the meantime. - * - * NOTE: This path does not handle barrier, but barrier is not supported - * on ide-cd anyway. 
- */ - -int ide_end_dequeued_request(ide_drive_t *drive, struct request *rq, - int uptodate, int nr_sectors) -{ - BUG_ON(!blk_rq_started(rq)); + rc = ide_end_rq(drive, rq, uptodate, nr_bytes); + if (rc == 0) + drive->hwif->rq = NULL; - return __ide_end_request(drive, rq, uptodate, nr_sectors << 9, 0); + return rc; } -EXPORT_SYMBOL_GPL(ide_end_dequeued_request); +EXPORT_SYMBOL(ide_end_request); void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) { diff --git a/include/linux/ide.h b/include/linux/ide.h index f9decc9852e9..f910f4ccfaa0 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1131,8 +1131,8 @@ int generic_ide_ioctl(ide_drive_t *, struct block_device *, unsigned, unsigned l extern int ide_vlb_clk; extern int ide_pci_clk; +int ide_end_rq(ide_drive_t *, struct request *, int, unsigned int); int ide_end_request(ide_drive_t *, int, int); -int ide_end_dequeued_request(ide_drive_t *, struct request *, int, int); void ide_kill_rq(ide_drive_t *, struct request *); void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int, -- cgit v1.2.3-71-gd317 From 6902a5331256e1b9f4cef95a1e3622252113b260 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:43 +0100 Subject: ide: pass error value to ide_complete_rq() Set rq->errors at ide_complete_rq() call sites and then pass error value to ide_complete_rq(). [ Some rq->errors assignments look really wrong but this patch leaves them alone to not introduce too many changes at once. ] There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 5 +++-- drivers/ide/ide-eh.c | 5 +++-- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 19 +++++++++---------- drivers/ide/ide-tape.c | 2 +- drivers/ide/ide-taskfile.c | 4 +++- include/linux/ide.h | 2 +- 7 files changed, 21 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 92c6ef6feb57..5d57af29c4c8 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -402,9 +402,10 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (uptodate == 0) drive->failed_pc = NULL; - if (blk_special_request(rq)) + if (blk_special_request(rq)) { + rq->errors = 0; ide_complete_rq(drive, 0); - else + } else ide_end_request(drive, uptodate, 0); return ide_stopped; diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index f6e1a82a3cc5..d1385d332e94 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -123,17 +123,18 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) /* retry only "normal" I/O: */ if (!blk_fs_request(rq)) { - rq->errors = 1; if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { struct ide_cmd *cmd = rq->special; if (cmd) ide_complete_cmd(drive, cmd, stat, err); } else if (blk_pm_request(rq)) { + rq->errors = 1; ide_complete_pm_rq(drive, rq); return ide_stopped; } - ide_complete_rq(drive, err); + rq->errors = err; + ide_complete_rq(drive, err ? 
-EIO : 0); return ide_stopped; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index f56e9a918b99..407e4914dfd1 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -260,7 +260,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, printk(KERN_ERR PFX "%s: I/O error\n", drive->name); if (blk_special_request(rq)) { - rq->errors = IDE_DRV_ERROR_GENERAL; + rq->errors = 0; ide_complete_rq(drive, 0); return ide_stopped; } else diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c38426de6041..4cc2bb13f1d6 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -144,17 +144,14 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) kfree(cmd); } -void ide_complete_rq(ide_drive_t *drive, u8 err) +void ide_complete_rq(ide_drive_t *drive, int error) { ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; hwif->rq = NULL; - rq->errors = err; - - if (unlikely(blk_end_request(rq, (rq->errors ? -EIO : 0), - blk_rq_bytes(rq)))) + if (unlikely(blk_end_request(rq, error, blk_rq_bytes(rq)))) BUG(); } EXPORT_SYMBOL(ide_complete_rq); @@ -166,13 +163,14 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq) drive->failed_pc = NULL; - if ((media == ide_floppy && drv_req) || media == ide_tape) - rq->errors = IDE_DRV_ERROR_GENERAL; - - if ((media == ide_floppy || media == ide_tape) && drv_req) + if ((media == ide_floppy || media == ide_tape) && drv_req) { + rq->errors = 0; ide_complete_rq(drive, 0); - else + } else { + if (media == ide_tape) + rq->errors = IDE_DRV_ERROR_GENERAL; ide_end_request(drive, 0, 0); + } } static void ide_tf_set_specify_cmd(ide_drive_t *drive, struct ide_taskfile *tf) @@ -312,6 +310,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, #ifdef DEBUG printk("%s: DRIVE_CMD (null)\n", drive->name); #endif + rq->errors = 0; ide_complete_rq(drive, 0); return ide_stopped; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index a42e49c6cc3f..3bfcd7290ce0 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -774,8 +774,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, if (rq != postponed_rq) { printk(KERN_ERR "ide-tape: ide-tape.c bug - " "Two DSC requests were queued\n"); - rq->errors = IDE_DRV_ERROR_GENERAL; drive->failed_pc = NULL; + rq->errors = 0; ide_complete_rq(drive, 0); return ide_stopped; } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index f99a6aaad9eb..e9d008ef3f33 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -289,10 +289,12 @@ static void ide_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat) { + struct request *rq = drive->hwif->rq; u8 err = ide_read_error(drive); ide_complete_cmd(drive, cmd, stat, err); - ide_complete_rq(drive, err); + rq->errors = err; + ide_complete_rq(drive, err ? 
-EIO : 0); } /* diff --git a/include/linux/ide.h b/include/linux/ide.h index f910f4ccfaa0..32369d5797de 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1166,7 +1166,7 @@ extern int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, int arg); void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8); -void ide_complete_rq(ide_drive_t *, u8); +void ide_complete_rq(ide_drive_t *, int); void ide_tf_dump(const char *, struct ide_taskfile *); -- cgit v1.2.3-71-gd317 From a9587fd8c48415cc93fef7f4ba7748a5d3477e7b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:44 +0100 Subject: ide: remove BUG() from ide_complete_rq() It is no longer needed so remove it, also while at it dequeue the request only on blk_end_request() success and make ide_complete_rq() return an error value. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 10 ++++++---- include/linux/ide.h | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 28ac463dde1c..4a79d28600f5 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -141,15 +141,17 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) kfree(cmd); } -void ide_complete_rq(ide_drive_t *drive, int error) +int ide_complete_rq(ide_drive_t *drive, int error) { ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; + int rc; - hwif->rq = NULL; + rc = blk_end_request(rq, error, blk_rq_bytes(rq)); + if (rc == 0) + hwif->rq = NULL; - if (unlikely(blk_end_request(rq, error, blk_rq_bytes(rq)))) - BUG(); + return rc; } EXPORT_SYMBOL(ide_complete_rq); diff --git a/include/linux/ide.h b/include/linux/ide.h index 32369d5797de..bb62bfaf02e0 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1166,7 +1166,7 @@ extern int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, int arg); void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8); -void ide_complete_rq(ide_drive_t *, int); +int ide_complete_rq(ide_drive_t *, int); void ide_tf_dump(const char *, struct ide_taskfile *); -- cgit v1.2.3-71-gd317 From f974b196f58fe042c7b2b4c0ee15d5a6112dbf40 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:44 +0100 Subject: ide: pass number of bytes to complete to ide_complete_rq() There should be no functional changes caused by this patch. 
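The change at the call sites, sketched for a typical caller (error value as
in ide_finish_cmd()); most callers simply complete the whole request:

	/* before: the number of bytes was implied (blk_rq_bytes(rq)) */
	ide_complete_rq(drive, err ? -EIO : 0);

	/* after: the byte count is passed explicitly */
	ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq));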
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 2 +- drivers/ide/ide-eh.c | 2 +- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 8 ++++---- drivers/ide/ide-tape.c | 2 +- drivers/ide/ide-taskfile.c | 2 +- include/linux/ide.h | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 5504a84e9bd6..30156aa61016 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -404,7 +404,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (blk_special_request(rq)) { rq->errors = 0; - ide_complete_rq(drive, 0); + ide_complete_rq(drive, 0, blk_rq_bytes(rq)); } else { if (blk_fs_request(rq) == 0 && uptodate <= 0) { if (rq->errors == 0) diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 6ad419414f95..ccfd06ef5bb9 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -134,7 +134,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) return ide_stopped; } rq->errors = err; - ide_complete_rq(drive, err ? -EIO : 0); + ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 572aa9696dad..7ef2b90e530a 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -261,7 +261,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, if (blk_special_request(rq)) { rq->errors = 0; - ide_complete_rq(drive, 0); + ide_complete_rq(drive, 0, blk_rq_bytes(rq)); return ide_stopped; } else goto out_end; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 4a79d28600f5..a4aa4bf84738 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -141,13 +141,13 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) kfree(cmd); } -int ide_complete_rq(ide_drive_t *drive, int error) +int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes) { ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; int rc; - rc = blk_end_request(rq, error, blk_rq_bytes(rq)); + rc = blk_end_request(rq, error, nr_bytes); if (rc == 0) hwif->rq = NULL; @@ -164,7 +164,7 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq) if ((media == ide_floppy || media == ide_tape) && drv_req) { rq->errors = 0; - ide_complete_rq(drive, 0); + ide_complete_rq(drive, 0, blk_rq_bytes(rq)); } else { if (media == ide_tape) rq->errors = IDE_DRV_ERROR_GENERAL; @@ -312,7 +312,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, printk("%s: DRIVE_CMD (null)\n", drive->name); #endif rq->errors = 0; - ide_complete_rq(drive, 0); + ide_complete_rq(drive, 0, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 94f6fb8c147a..2df708927687 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -778,7 +778,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, "Two DSC requests were queued\n"); drive->failed_pc = NULL; rq->errors = 0; - ide_complete_rq(drive, 0); + ide_complete_rq(drive, 0, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index e9d008ef3f33..b9d7ba2c8a00 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -294,7 +294,7 @@ void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat) ide_complete_cmd(drive, cmd, stat, err); rq->errors = err; - ide_complete_rq(drive, err ? 
-EIO : 0); + ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq)); } /* diff --git a/include/linux/ide.h b/include/linux/ide.h index bb62bfaf02e0..cbfb64fdeda7 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1166,7 +1166,7 @@ extern int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, int arg); void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8); -int ide_complete_rq(ide_drive_t *, int); +int ide_complete_rq(ide_drive_t *, int, unsigned int); void ide_tf_dump(const char *, struct ide_taskfile *); -- cgit v1.2.3-71-gd317 From 130e886708d6e11f3d54e5d27c266578de56f343 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:45 +0100 Subject: ide: remove ide_end_request() * Add ide_rq_bytes() helper. * Add blk_noretry_request() quirk to ide_complete_rq() (currently only fs requests can be marked as "noretry" so there is no change in behavior). * Switch current ide_end_request() users to use ide_complete_rq(). [ No need to check for rq->nr_sectors == 0 in {ide_dma,task_pio}_intr(), nsectors == 0 in cdrom_end_request() and err == 0 in ide_do_devset(). ] * Remove no longer needed ide_end_request(). There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 6 +++-- drivers/ide/ide-cd.c | 7 +++--- drivers/ide/ide-devsets.c | 4 +-- drivers/ide/ide-disk.c | 2 +- drivers/ide/ide-dma.c | 3 ++- drivers/ide/ide-eh.c | 2 +- drivers/ide/ide-floppy.c | 4 +-- drivers/ide/ide-io.c | 61 ++++++++++++++-------------------------------- drivers/ide/ide-tape.c | 2 +- drivers/ide/ide-taskfile.c | 4 +-- include/linux/ide.h | 2 +- 11 files changed, 37 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 30156aa61016..c3fd528a1b4d 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -410,7 +410,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (rq->errors == 0) rq->errors = -EIO; } - ide_end_request(drive, uptodate, 0); + ide_complete_rq(drive, uptodate ? 0 : -EIO, + ide_rq_bytes(rq)); } return ide_stopped; @@ -469,7 +470,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) /* FIXME: don't do partial completions */ if (drive->media == ide_floppy) - ide_end_request(drive, 1, done >> 9); + ide_complete_rq(drive, 0, + done ? done : ide_rq_bytes(rq)); } else xferfunc(drive, NULL, pc->cur_pos, bcount); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index e4fa807fdcfa..2f698c6e913f 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -298,7 +298,7 @@ static void cdrom_end_request(ide_drive_t *drive, int uptodate) if (blk_fs_request(rq) == 0 && uptodate <= 0 && rq->errors == 0) rq->errors = -EIO; - ide_end_request(drive, uptodate, nsectors); + ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); } static void ide_dump_status_no_sense(ide_drive_t *drive, const char *msg, u8 st) @@ -793,10 +793,11 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (dma_error) return ide_error(drive, "dma error", stat); if (blk_fs_request(rq)) { - ide_end_request(drive, 1, rq->nr_sectors); + ide_complete_rq(drive, 0, rq->nr_sectors + ? 
(rq->nr_sectors << 9) : ide_rq_bytes(rq)); return ide_stopped; } else if (rq->cmd_type == REQ_TYPE_ATA_PC && !rq->bio) { - ide_end_request(drive, 1, 1); + ide_complete_rq(drive, 0, 512); return ide_stopped; } goto end_request; diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index 7c3953414d47..5bf958e5b1d5 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -183,8 +183,6 @@ ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq) err = setfunc(drive, *(int *)&rq->cmd[1]); if (err) rq->errors = err; - else - err = 1; - ide_end_request(drive, err, 0); + ide_complete_rq(drive, err, ide_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index ad9a3f54d21d..d8caa65ca7a5 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -186,7 +186,7 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq, blk_dump_rq_flags(rq, "ide_do_rw_disk - bad command"); if (rq->errors == 0) rq->errors = -EIO; - ide_end_request(drive, 0, 0); + ide_complete_rq(drive, -EIO, ide_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 820e5104ba47..8f5e32e692f0 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -101,7 +101,8 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else - ide_end_request(drive, 1, cmd->rq->nr_sectors); + ide_complete_rq(drive, 0, + cmd->rq->nr_sectors << 9); return ide_stopped; } printk(KERN_ERR "%s: %s: bad DMA status (0x%02x)\n", diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index ccfd06ef5bb9..aff1a9b04559 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -149,7 +149,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) if (rq && blk_special_request(rq) && rq->cmd[0] == REQ_DRIVE_RESET) { if (err <= 0 && rq->errors == 0) rq->errors = -EIO; - ide_end_request(drive, err ? err : 1, 0); + ide_complete_rq(drive, err ? err : 0, ide_rq_bytes(rq)); } } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 7ef2b90e530a..8c518c6a6477 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -68,7 +68,7 @@ static void idefloppy_update_buffers(ide_drive_t *drive, struct bio *bio = rq->bio; while ((bio = rq->bio) != NULL) - ide_end_request(drive, 1, 0); + ide_complete_rq(drive, 0, ide_rq_bytes(rq)); } static int ide_floppy_callback(ide_drive_t *drive, int dsc) @@ -300,7 +300,7 @@ out_end: drive->failed_pc = NULL; if (blk_fs_request(rq) == 0 && rq->errors == 0) rq->errors = -EIO; - ide_end_request(drive, 0, 0); + ide_complete_rq(drive, -EIO, ide_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 8e2868617a46..f59c709052d2 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -71,48 +71,6 @@ int ide_end_rq(ide_drive_t *drive, struct request *rq, int error, } EXPORT_SYMBOL_GPL(ide_end_rq); -/** - * ide_end_request - complete an IDE I/O - * @drive: IDE device for the I/O - * @uptodate: - * @nr_sectors: number of sectors completed - * - * This is our end_request wrapper function. We complete the I/O - * update random number input and dequeue the request, which if - * it was tagged may be out of order. 
- */ - -int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) -{ - unsigned int nr_bytes = nr_sectors << 9; - struct request *rq = drive->hwif->rq; - int rc, error = 0; - - if (!nr_bytes) { - if (blk_pc_request(rq)) - nr_bytes = rq->data_len; - else - nr_bytes = rq->hard_cur_sectors << 9; - } - - /* - * if failfast is set on a request, override number of sectors - * and complete the whole request right now - */ - if (blk_noretry_request(rq) && uptodate <= 0) - nr_bytes = rq->hard_nr_sectors << 9; - - if (uptodate <= 0) - error = uptodate ? uptodate : -EIO; - - rc = ide_end_rq(drive, rq, error, nr_bytes); - if (rc == 0) - drive->hwif->rq = NULL; - - return rc; -} -EXPORT_SYMBOL(ide_end_request); - void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) { struct ide_taskfile *tf = &cmd->tf; @@ -141,12 +99,29 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) kfree(cmd); } +/* obsolete, blk_rq_bytes() should be used instead */ +unsigned int ide_rq_bytes(struct request *rq) +{ + if (blk_pc_request(rq)) + return rq->data_len; + else + return rq->hard_cur_sectors << 9; +} +EXPORT_SYMBOL_GPL(ide_rq_bytes); + int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes) { ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; int rc; + /* + * if failfast is set on a request, override number of sectors + * and complete the whole request right now + */ + if (blk_noretry_request(rq) && error <= 0) + nr_bytes = rq->hard_nr_sectors << 9; + rc = ide_end_rq(drive, rq, error, nr_bytes); if (rc == 0) hwif->rq = NULL; @@ -170,7 +145,7 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq) rq->errors = IDE_DRV_ERROR_GENERAL; else if (blk_fs_request(rq) == 0 && rq->errors == 0) rq->errors = -EIO; - ide_end_request(drive, 0, 0); + ide_complete_rq(drive, -EIO, ide_rq_bytes(rq)); } } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2df708927687..853d047aa78f 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -762,7 +762,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, "request queue (%d)\n", drive->name, rq->cmd_type); if (blk_fs_request(rq) == 0 && rq->errors == 0) rq->errors = -EIO; - ide_end_request(drive, 0, 0); + ide_complete_rq(drive, -EIO, ide_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index b9d7ba2c8a00..db6d7821e45b 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -283,7 +283,7 @@ static void ide_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) } if (sectors > 0) - ide_end_request(drive, 1, sectors); + ide_complete_rq(drive, 0, sectors << 9); } } @@ -352,7 +352,7 @@ out_end: if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else - ide_end_request(drive, 1, cmd->rq->nr_sectors); + ide_complete_rq(drive, 0, cmd->rq->nr_sectors << 9); return ide_stopped; out_err: ide_error_cmd(drive, cmd); diff --git a/include/linux/ide.h b/include/linux/ide.h index cbfb64fdeda7..b6142171baf0 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1131,8 +1131,8 @@ int generic_ide_ioctl(ide_drive_t *, struct block_device *, unsigned, unsigned l extern int ide_vlb_clk; extern int ide_pci_clk; +unsigned int ide_rq_bytes(struct request *); int ide_end_rq(ide_drive_t *, struct request *, int, unsigned int); -int ide_end_request(ide_drive_t *, int, int); void ide_kill_rq(ide_drive_t *, struct request *); void __ide_set_handler(ide_drive_t *, ide_handler_t *, 
unsigned int, -- cgit v1.2.3-71-gd317 From 2298169418f43ba5e0919762a4bab95a1227872a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:46 +0100 Subject: ide: pass command to ide_map_sg() * Set IDE_TFLAG_WRITE flag and ->rq also for ATA_CMD_PACKET commands. * Pass command to ->dma_setup method and update all its implementations accordingly. * Pass command instead of request to ide_build_sglist(), *_build_dmatable() and ide_map_sg(). While at it: * Fix scc_dma_setup() documentation + use ATA_DMA_WR define. * Rename sgiioc4_build_dma_table() to sgiioc4_build_dmatable(), change return value type to 'int' and drop unused 'ddir' argument. * Do some minor cleanups in [tx4939]ide_dma_setup(). There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/alim15x3.c | 7 ++++--- drivers/ide/au1xxx-ide.c | 15 ++++++--------- drivers/ide/icside.c | 7 +++---- drivers/ide/ide-atapi.c | 16 ++++++++++++---- drivers/ide/ide-disk.c | 10 +++++----- drivers/ide/ide-dma-sff.c | 18 +++++++++--------- drivers/ide/ide-dma.c | 15 +++++++-------- drivers/ide/ide-floppy.c | 6 +++++- drivers/ide/ide-io.c | 6 +++--- drivers/ide/ide-taskfile.c | 4 ++-- drivers/ide/ns87415.c | 4 ++-- drivers/ide/pmac.c | 19 ++++++++----------- drivers/ide/scc_pata.c | 19 +++++++------------ drivers/ide/sgiioc4.c | 21 +++++++-------------- drivers/ide/trm290.c | 10 +++++----- drivers/ide/tx4939ide.c | 26 ++++++++++---------------- include/linux/ide.h | 12 ++++++------ 17 files changed, 101 insertions(+), 114 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index d3513b6b8530..e837fd9f196f 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -191,17 +191,18 @@ static void ali_set_dma_mode(ide_drive_t *drive, const u8 speed) /** * ali15x3_dma_setup - begin a DMA phase * @drive: target device + * @cmd: command * * Returns 1 if the DMA cannot be performed, zero on success. 
*/ -static int ali15x3_dma_setup(ide_drive_t *drive) +static int ali15x3_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { if (m5229_revision < 0xC2 && drive->media != ide_disk) { - if (rq_data_dir(drive->hwif->rq)) + if (cmd->tf_flags & IDE_TFLAG_WRITE) return 1; /* try PIO instead of DMA */ } - return ide_dma_setup(drive); + return ide_dma_setup(drive, cmd); } /** diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 3ace0cda5452..58485d6cb026 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -209,15 +209,14 @@ static void auide_set_dma_mode(ide_drive_t *drive, const u8 speed) */ #ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA -static int auide_build_dmatable(ide_drive_t *drive) +static int auide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct request *rq = hwif->rq; _auide_hwif *ahwif = &auide_hwif; struct scatterlist *sg; - int i = hwif->cmd.sg_nents, iswrite, count = 0; + int i = cmd->sg_nents, count = 0; + int iswrite = !!(cmd->tf_flags & IDE_TFLAG_WRITE); - iswrite = (rq_data_dir(rq) == WRITE); /* Save for interrupt context */ ahwif->drive = drive; @@ -298,12 +297,10 @@ static void auide_dma_exec_cmd(ide_drive_t *drive, u8 command) (2*WAIT_CMD), NULL); } -static int auide_dma_setup(ide_drive_t *drive) +static int auide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { - struct request *rq = drive->hwif->rq; - - if (!auide_build_dmatable(drive)) { - ide_map_sg(drive, rq); + if (auide_build_dmatable(drive, cmd) == 0) { + ide_map_sg(drive, cmd); return 1; } diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index bdfeb1222d52..3628b2147902 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -307,15 +307,14 @@ static void icside_dma_start(ide_drive_t *drive) enable_dma(ec->dma); } -static int icside_dma_setup(ide_drive_t *drive) +static int icside_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct expansion_card *ec = ECARD_DEV(hwif->dev); struct icside_state *state = ecard_get_drvdata(ec); - struct request *rq = hwif->rq; unsigned int dma_mode; - if (rq_data_dir(rq)) + if (cmd->tf_flags & IDE_TFLAG_WRITE) dma_mode = DMA_MODE_WRITE; else dma_mode = DMA_MODE_READ; @@ -344,7 +343,7 @@ static int icside_dma_setup(ide_drive_t *drive) * Tell the DMA engine about the SG table and * data direction. 
*/ - set_dma_sg(ec->dma, hwif->sg_table, hwif->cmd.sg_nents); + set_dma_sg(ec->dma, hwif->sg_table, cmd->sg_nents); set_dma_mode(ec->dma, dma_mode); drive->waiting_for_dma = 1; diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index c3fd528a1b4d..b56af49f876b 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -638,12 +638,20 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive) { struct ide_atapi_pc *pc; ide_hwif_t *hwif = drive->hwif; + const struct ide_dma_ops *dma_ops = hwif->dma_ops; + struct ide_cmd *cmd = &hwif->cmd; ide_expiry_t *expiry = NULL; struct request *rq = hwif->rq; unsigned int timeout; u32 tf_flags; u16 bcount; + if (drive->media != ide_floppy) { + if (rq_data_dir(rq)) + cmd->tf_flags |= IDE_TFLAG_WRITE; + cmd->rq = rq; + } + if (dev_is_idecd(drive)) { tf_flags = IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL; bcount = ide_cd_get_xferlen(rq); @@ -651,8 +659,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive) timeout = ATAPI_WAIT_PC; if (drive->dma) { - if (ide_build_sglist(drive, rq)) - drive->dma = !hwif->dma_ops->dma_setup(drive); + if (ide_build_sglist(drive, cmd)) + drive->dma = !dma_ops->dma_setup(drive, cmd); else drive->dma = 0; } @@ -675,8 +683,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive) if ((pc->flags & PC_FLAG_DMA_OK) && (drive->dev_flags & IDE_DFLAG_USING_DMA)) { - if (ide_build_sglist(drive, rq)) - drive->dma = !hwif->dma_ops->dma_setup(drive); + if (ide_build_sglist(drive, cmd)) + drive->dma = !dma_ops->dma_setup(drive, cmd); else drive->dma = 0; } diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index d8caa65ca7a5..4b32c4eb7b82 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -99,11 +99,6 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, memset(&cmd, 0, sizeof(cmd)); cmd.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - if (dma == 0) { - ide_init_sg_cmd(&cmd, nsectors); - ide_map_sg(drive, rq); - } - if (drive->dev_flags & IDE_DFLAG_LBA) { if (lba48) { pr_debug("%s: LBA=0x%012llx\n", drive->name, @@ -156,6 +151,11 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, ide_tf_set_cmd(drive, &cmd, dma); cmd.rq = rq; + if (dma == 0) { + ide_init_sg_cmd(&cmd, nsectors); + ide_map_sg(drive, &cmd); + } + rc = do_rw_taskfile(drive, &cmd); if (rc == ide_stopped && dma) { diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 7bf28a9b6f65..b7eb810c7b8f 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(ide_dma_host_set); * May also be invoked from trm290.c */ -int ide_build_dmatable(ide_drive_t *drive, struct request *rq) +int ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; __le32 *table = (__le32 *)hwif->dmatable_cpu; @@ -120,7 +120,7 @@ int ide_build_dmatable(ide_drive_t *drive, struct request *rq) struct scatterlist *sg; u8 is_trm290 = !!(hwif->host_flags & IDE_HFLAG_TRM290); - for_each_sg(hwif->sg_table, sg, hwif->cmd.sg_nents, i) { + for_each_sg(hwif->sg_table, sg, cmd->sg_nents, i) { u32 cur_addr, cur_len, xcount, bcount; cur_addr = sg_dma_address(sg); @@ -175,6 +175,7 @@ EXPORT_SYMBOL_GPL(ide_build_dmatable); /** * ide_dma_setup - begin a DMA phase * @drive: target device + * @cmd: command * * Build an IDE DMA PRD (IDE speak for scatter gather table) * and then set up the DMA transfer registers for a device @@ -185,17 +186,16 @@ EXPORT_SYMBOL_GPL(ide_build_dmatable); * is returned. 
*/ -int ide_dma_setup(ide_drive_t *drive) +int ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct request *rq = hwif->rq; - unsigned int reading = rq_data_dir(rq) ? 0 : ATA_DMA_WR; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; + u8 rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 0 : ATA_DMA_WR; u8 dma_stat; /* fall back to pio! */ - if (!ide_build_dmatable(drive, rq)) { - ide_map_sg(drive, rq); + if (ide_build_dmatable(drive, cmd) == 0) { + ide_map_sg(drive, cmd); return 1; } @@ -208,9 +208,9 @@ int ide_dma_setup(ide_drive_t *drive) /* specify r/w */ if (mmio) - writeb(reading, (void __iomem *)(hwif->dma_base + ATA_DMA_CMD)); + writeb(rw, (void __iomem *)(hwif->dma_base + ATA_DMA_CMD)); else - outb(reading, hwif->dma_base + ATA_DMA_CMD); + outb(rw, hwif->dma_base + ATA_DMA_CMD); /* read DMA status for INTR & ERROR flags */ dma_stat = hwif->dma_ops->dma_sff_read_status(hwif); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 8f5e32e692f0..ad4edab9b0a9 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -120,7 +120,7 @@ int ide_dma_good_drive(ide_drive_t *drive) /** * ide_build_sglist - map IDE scatter gather for DMA I/O * @drive: the drive to build the DMA table for - * @rq: the request holding the sg list + * @cmd: command * * Perform the DMA mapping magic necessary to access the source or * target buffers of a request via DMA. The lower layers of the @@ -128,23 +128,22 @@ int ide_dma_good_drive(ide_drive_t *drive) * operate in a portable fashion. */ -int ide_build_sglist(ide_drive_t *drive, struct request *rq) +int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; - struct ide_cmd *cmd = &hwif->cmd; int i; - ide_map_sg(drive, rq); + ide_map_sg(drive, cmd); - if (rq_data_dir(rq) == READ) - cmd->sg_dma_direction = DMA_FROM_DEVICE; - else + if (cmd->tf_flags & IDE_TFLAG_WRITE) cmd->sg_dma_direction = DMA_TO_DEVICE; + else + cmd->sg_dma_direction = DMA_FROM_DEVICE; i = dma_map_sg(hwif->dev, sg, cmd->sg_nents, cmd->sg_dma_direction); if (i == 0) - ide_map_sg(drive, rq); + ide_map_sg(drive, cmd); else { cmd->orig_sg_nents = cmd->sg_nents; cmd->sg_nents = i; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 8c518c6a6477..ee3e77a7a727 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -285,9 +285,13 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, goto out_end; } + if (rq_data_dir(rq)) + cmd->tf_flags |= IDE_TFLAG_WRITE; + cmd->rq = rq; + if (blk_fs_request(rq) || pc->req_xfer) { ide_init_sg_cmd(cmd, rq->nr_sectors); - ide_map_sg(drive, rq); + ide_map_sg(drive, cmd); } pc->sg = hwif->sg_table; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index f59c709052d2..47404f5526f1 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -228,11 +228,11 @@ static ide_startstop_t do_special (ide_drive_t *drive) return ide_stopped; } -void ide_map_sg(ide_drive_t *drive, struct request *rq) +void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct ide_cmd *cmd = &hwif->cmd; struct scatterlist *sg = hwif->sg_table; + struct request *rq = cmd->rq; if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { sg_init_one(sg, rq->buffer, rq->nr_sectors * SECTOR_SIZE); @@ -273,7 +273,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, if (cmd) { if (cmd->protocol == ATA_PROT_PIO) { ide_init_sg_cmd(cmd, rq->nr_sectors); - ide_map_sg(drive, 
rq); + ide_map_sg(drive, cmd); } return do_rw_taskfile(drive, cmd); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index db6d7821e45b..3b23bd11945e 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -102,8 +102,8 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) return ide_started; default: if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || - ide_build_sglist(drive, hwif->rq) == 0 || - dma_ops->dma_setup(drive)) + ide_build_sglist(drive, cmd) == 0 || + dma_ops->dma_setup(drive, cmd)) return ide_stopped; dma_ops->dma_exec_cmd(drive, tf->command); dma_ops->dma_start(drive); diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index d93c80016326..cf6d9a9c8a27 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -216,11 +216,11 @@ static int ns87415_dma_end(ide_drive_t *drive) return (dma_stat & 7) != 4; } -static int ns87415_dma_setup(ide_drive_t *drive) +static int ns87415_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { /* select DMA xfer */ ns87415_prepare_drive(drive, 1); - if (!ide_dma_setup(drive)) + if (ide_dma_setup(drive, cmd) == 0) return 0; /* DMA failed: select PIO xfer */ ns87415_prepare_drive(drive, 0); diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index f5b85f4c1b65..337d2d5b3028 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -404,7 +404,6 @@ kauai_lookup_timing(struct kauai_timing* table, int cycle_time) #define IDE_WAKEUP_DELAY (1*HZ) static int pmac_ide_init_dma(ide_hwif_t *, const struct ide_port_info *); -static int pmac_ide_build_dmatable(ide_drive_t *drive, struct request *rq); static void pmac_ide_selectproc(ide_drive_t *drive); static void pmac_ide_kauai_selectproc(ide_drive_t *drive); @@ -1422,8 +1421,7 @@ out: * pmac_ide_build_dmatable builds the DBDMA command list * for a transfer and sets the DBDMA channel to point to it. */ -static int -pmac_ide_build_dmatable(ide_drive_t *drive, struct request *rq) +static int pmac_ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = @@ -1431,8 +1429,8 @@ pmac_ide_build_dmatable(ide_drive_t *drive, struct request *rq) struct dbdma_cmd *table; volatile struct dbdma_regs __iomem *dma = pmif->dma_regs; struct scatterlist *sg; - int wr = (rq_data_dir(rq) == WRITE); - int i = hwif->cmd.sg_nents, count = 0; + int wr = !!(cmd->tf_flags & IDE_TFLAG_WRITE); + int i = cmd->sg_nents, count = 0; /* DMA table is already aligned */ table = (struct dbdma_cmd *) pmif->dma_table_cpu; @@ -1504,23 +1502,22 @@ use_pio_instead: * Prepare a DMA transfer. We build the DMA table, adjust the timings for * a read on KeyLargo ATA/66 and mark us as waiting for DMA completion */ -static int -pmac_ide_dma_setup(ide_drive_t *drive) +static int pmac_ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = (pmac_ide_hwif_t *)dev_get_drvdata(hwif->gendev.parent); - struct request *rq = hwif->rq; u8 unit = drive->dn & 1, ata4 = (pmif->kind == controller_kl_ata4); + u8 write = !!(cmd->tf_flags & IDE_TFLAG_WRITE); - if (!pmac_ide_build_dmatable(drive, rq)) { - ide_map_sg(drive, rq); + if (pmac_ide_build_dmatable(drive, cmd) == 0) { + ide_map_sg(drive, cmd); return 1; } /* Apple adds 60ns to wrDataSetup on reads */ if (ata4 && (pmif->timings[unit] & TR_66_UDMA_EN)) { - writel(pmif->timings[unit] + (!rq_data_dir(rq) ? 0x00800000UL : 0), + writel(pmif->timings[unit] + (write ? 
0 : 0x00800000UL), PMAC_IDE_REG(IDE_TIMING_CONFIG)); (void)readl(PMAC_IDE_REG(IDE_TIMING_CONFIG)); } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index ada866744622..1f2805ce9889 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -303,8 +303,9 @@ static void scc_dma_host_set(ide_drive_t *drive, int on) } /** - * scc_ide_dma_setup - begin a DMA phase + * scc_dma_setup - begin a DMA phase * @drive: target device + * @cmd: command * * Build an IDE DMA PRD (IDE speak for scatter gather table) * and then set up the DMA transfer registers. @@ -313,21 +314,15 @@ static void scc_dma_host_set(ide_drive_t *drive, int on) * is returned. */ -static int scc_dma_setup(ide_drive_t *drive) +static int scc_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct request *rq = hwif->rq; - unsigned int reading; + u32 rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 0 : ATA_DMA_WR; u8 dma_stat; - if (rq_data_dir(rq)) - reading = 0; - else - reading = 1 << 3; - /* fall back to pio! */ - if (!ide_build_dmatable(drive, rq)) { - ide_map_sg(drive, rq); + if (ide_build_dmatable(drive, cmd) == 0) { + ide_map_sg(drive, cmd); return 1; } @@ -335,7 +330,7 @@ static int scc_dma_setup(ide_drive_t *drive) out_be32((void __iomem *)(hwif->dma_base + 8), hwif->dmatable_dma); /* specify r/w */ - out_be32((void __iomem *)hwif->dma_base, reading); + out_be32((void __iomem *)hwif->dma_base, rw); /* read DMA status for INTR & ERROR flags */ dma_stat = scc_dma_sff_read_status(hwif); diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index b0769e96d32f..b12de8346c73 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -424,12 +424,11 @@ sgiioc4_configure_for_dma(int dma_direction, ide_drive_t * drive) /* | Upper 32 bits - Zero |EOL| 15 unused | 16 Bit Length| */ /* --------------------------------------------------------------------- */ /* Creates the scatter gather list, DMA Table */ -static unsigned int -sgiioc4_build_dma_table(ide_drive_t * drive, struct request *rq, int ddir) +static int sgiioc4_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; unsigned int *table = hwif->dmatable_cpu; - unsigned int count = 0, i = hwif->cmd.sg_nents; + unsigned int count = 0, i = cmd->sg_nents; struct scatterlist *sg = hwif->sg_table; while (i && sg_dma_len(sg)) { @@ -484,24 +483,18 @@ use_pio_instead: return 0; /* revert to PIO for this request */ } -static int sgiioc4_dma_setup(ide_drive_t *drive) +static int sgiioc4_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { - struct request *rq = drive->hwif->rq; - unsigned int count = 0; int ddir; + u8 write = !!(cmd->tf_flags & IDE_TFLAG_WRITE); - if (rq_data_dir(rq)) - ddir = PCI_DMA_TODEVICE; - else - ddir = PCI_DMA_FROMDEVICE; - - if (!(count = sgiioc4_build_dma_table(drive, rq, ddir))) { + if (sgiioc4_build_dmatable(drive, cmd) == 0) { /* try PIO instead of DMA */ - ide_map_sg(drive, rq); + ide_map_sg(drive, cmd); return 1; } - if (rq_data_dir(rq)) + if (write) /* Writes TO the IOC4 FROM Main Memory */ ddir = IOC4_DMA_READ; else diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index e8279f32f9a2..746858a7338d 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -181,13 +181,12 @@ static void trm290_dma_exec_cmd(ide_drive_t *drive, u8 command) ide_execute_command(drive, command, &ide_dma_intr, WAIT_CMD, NULL); } -static int trm290_dma_setup(ide_drive_t *drive) +static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = 
drive->hwif; - struct request *rq = hwif->rq; unsigned int count, rw; - if (rq_data_dir(rq)) { + if (cmd->tf_flags & IDE_TFLAG_WRITE) { #ifdef TRM290_NO_DMA_WRITES /* always use PIO for writes */ trm290_prepare_drive(drive, 0); /* select PIO xfer */ @@ -197,8 +196,9 @@ static int trm290_dma_setup(ide_drive_t *drive) } else rw = 2; - if (!(count = ide_build_dmatable(drive, rq))) { - ide_map_sg(drive, rq); + count = ide_build_dmatable(drive, cmd); + if (count == 0) { + ide_map_sg(drive, cmd); /* try PIO instead of DMA */ trm290_prepare_drive(drive, 0); /* select PIO xfer */ return 1; diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 8d155ec8cca9..39e3316ab63f 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -232,7 +232,7 @@ static u8 tx4939ide_clear_dma_status(void __iomem *base) #ifdef __BIG_ENDIAN /* custom ide_build_dmatable to handle swapped layout */ -static int tx4939ide_build_dmatable(ide_drive_t *drive, struct request *rq) +static int tx4939ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; u32 *table = (u32 *)hwif->dmatable_cpu; @@ -240,7 +240,7 @@ static int tx4939ide_build_dmatable(ide_drive_t *drive, struct request *rq) int i; struct scatterlist *sg; - for_each_sg(hwif->sg_table, sg, hwif->cmd.sg_nents, i) { + for_each_sg(hwif->sg_table, sg, cmd->sg_nents, i) { u32 cur_addr, cur_len, bcount; cur_addr = sg_dma_address(sg); @@ -287,23 +287,15 @@ use_pio_instead: #define tx4939ide_build_dmatable ide_build_dmatable #endif -static int tx4939ide_dma_setup(ide_drive_t *drive) +static int tx4939ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; void __iomem *base = TX4939IDE_BASE(hwif); - struct request *rq = hwif->rq; - u8 reading; - int nent; - - if (rq_data_dir(rq)) - reading = 0; - else - reading = ATA_DMA_WR; + u8 rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 0 : ATA_DMA_WR; /* fall back to PIO! */ - nent = tx4939ide_build_dmatable(drive, rq); - if (!nent) { - ide_map_sg(drive, rq); + if (tx4939ide_build_dmatable(drive, cmd) == 0) { + ide_map_sg(drive, cmd); return 1; } @@ -311,7 +303,7 @@ static int tx4939ide_dma_setup(ide_drive_t *drive) tx4939ide_writel(hwif->dmatable_dma, base, TX4939IDE_PRD_Ptr); /* specify r/w */ - tx4939ide_writeb(reading, base, TX4939IDE_DMA_Cmd); + tx4939ide_writeb(rw, base, TX4939IDE_DMA_Cmd); /* clear INTR & ERROR flags */ tx4939ide_clear_dma_status(base); @@ -320,7 +312,9 @@ static int tx4939ide_dma_setup(ide_drive_t *drive) tx4939ide_writew(SECTOR_SIZE / 2, base, drive->dn ? 
TX4939IDE_Xfer_Cnt_2 : TX4939IDE_Xfer_Cnt_1); - tx4939ide_writew(rq->nr_sectors, base, TX4939IDE_Sec_Cnt); + + tx4939ide_writew(cmd->rq->nr_sectors, base, TX4939IDE_Sec_Cnt); + return 0; } diff --git a/include/linux/ide.h b/include/linux/ide.h index b6142171baf0..b30e79c6ff57 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -714,7 +714,7 @@ struct ide_port_ops { struct ide_dma_ops { void (*dma_host_set)(struct ide_drive_s *, int); - int (*dma_setup)(struct ide_drive_s *); + int (*dma_setup)(struct ide_drive_s *, struct ide_cmd *); void (*dma_exec_cmd)(struct ide_drive_s *, u8); void (*dma_start)(struct ide_drive_s *); int (*dma_end)(struct ide_drive_s *); @@ -1412,7 +1412,7 @@ int ide_pci_resume(struct pci_dev *); #define ide_pci_resume NULL #endif -void ide_map_sg(ide_drive_t *, struct request *); +void ide_map_sg(ide_drive_t *, struct ide_cmd *); void ide_init_sg_cmd(struct ide_cmd *, int); #define BAD_DMA_DRIVE 0 @@ -1447,14 +1447,14 @@ ide_startstop_t ide_dma_intr(ide_drive_t *); int ide_allocate_dma_engine(ide_hwif_t *); void ide_release_dma_engine(ide_hwif_t *); -int ide_build_sglist(ide_drive_t *, struct request *); +int ide_build_sglist(ide_drive_t *, struct ide_cmd *); void ide_destroy_dmatable(ide_drive_t *); #ifdef CONFIG_BLK_DEV_IDEDMA_SFF int config_drive_for_dma(ide_drive_t *); -extern int ide_build_dmatable(ide_drive_t *, struct request *); +int ide_build_dmatable(ide_drive_t *, struct ide_cmd *); void ide_dma_host_set(ide_drive_t *, int); -extern int ide_dma_setup(ide_drive_t *); +int ide_dma_setup(ide_drive_t *, struct ide_cmd *); void ide_dma_exec_cmd(ide_drive_t *, u8); extern void ide_dma_start(ide_drive_t *); int ide_dma_end(ide_drive_t *); @@ -1482,7 +1482,7 @@ static inline void ide_check_dma_crc(ide_drive_t *drive) { ; } static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { return ide_stopped; } static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } static inline int ide_build_sglist(ide_drive_t *drive, - struct request *rq) { return 0; } + struct ide_cmd *cmd) { return 0; } #endif /* CONFIG_BLK_DEV_IDEDMA */ #ifdef CONFIG_BLK_DEV_IDEACPI -- cgit v1.2.3-71-gd317 From b788ee9c6561fd9219a503216284d61036a0dc0b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:46 +0100 Subject: ide: use do_rw_taskfile() for ATA_CMD_PACKET commands * Pass command to ide_issue_pc() and update ->do_request methods in ide-{cd,floppy,tape}.c accordingly. * Convert ide_pktcmd_tf_load() to ide_init_packet_cmd() which just initializes command structure and use do_rw_taskfile() to load ATA_CMD_PACKET commands. While at it: * Rename ide{floppy,tape}_issue_pc() to ide_{floppy,tape}_issue_pc(). There should be no functional changes caused by this patch. 
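As a minimal sketch of the new calling convention (mirroring the ide-cd and ide-tape hunks below, not an addition to the patch itself), a driver's ->do_request method now builds the command on its own stack and hands it to ide_issue_pc():

    struct ide_cmd cmd;

    memset(&cmd, 0, sizeof(cmd));

    if (rq_data_dir(rq))
        cmd.tf_flags |= IDE_TFLAG_WRITE;   /* direction comes from the request */

    cmd.rq = rq;                           /* ide_map_sg() & friends use cmd->rq */

    return ide_issue_pc(drive, &cmd);
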
Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 37 ++++++++++++------------------------- drivers/ide/ide-cd.c | 11 ++++++++++- drivers/ide/ide-floppy.c | 24 ++++++++++++++---------- drivers/ide/ide-tape.c | 19 ++++++++++++++----- drivers/ide/ide-taskfile.c | 3 ++- include/linux/ide.h | 2 +- 6 files changed, 53 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index b56af49f876b..75df05add1b9 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -487,23 +487,15 @@ next_irq: return ide_started; } -static void ide_pktcmd_tf_load(ide_drive_t *drive, u32 tf_flags, u16 bcount) +static void ide_init_packet_cmd(struct ide_cmd *cmd, u32 tf_flags, + u16 bcount, u8 dma) { - ide_hwif_t *hwif = drive->hwif; - struct ide_cmd cmd; - u8 dma = drive->dma; - - memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_OUT_LBAH | IDE_TFLAG_OUT_LBAM | - IDE_TFLAG_OUT_FEATURE | tf_flags; - cmd.tf.feature = dma; /* Use PIO/DMA */ - cmd.tf.lbam = bcount & 0xff; - cmd.tf.lbah = (bcount >> 8) & 0xff; - - ide_tf_dump(drive->name, &cmd.tf); - hwif->tp_ops->set_irq(hwif, 1); - SELECT_MASK(drive, 0); - hwif->tp_ops->tf_load(drive, &cmd); + cmd->protocol = dma ? ATAPI_PROT_DMA : ATAPI_PROT_PIO; + cmd->tf_flags |= IDE_TFLAG_OUT_LBAH | IDE_TFLAG_OUT_LBAM | + IDE_TFLAG_OUT_FEATURE | tf_flags; + cmd->tf.feature = dma; /* Use PIO/DMA */ + cmd->tf.lbam = bcount & 0xff; + cmd->tf.lbah = (bcount >> 8) & 0xff; } static u8 ide_read_ireason(ide_drive_t *drive) @@ -634,24 +626,17 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive) return ide_started; } -ide_startstop_t ide_issue_pc(ide_drive_t *drive) +ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_atapi_pc *pc; ide_hwif_t *hwif = drive->hwif; const struct ide_dma_ops *dma_ops = hwif->dma_ops; - struct ide_cmd *cmd = &hwif->cmd; ide_expiry_t *expiry = NULL; struct request *rq = hwif->rq; unsigned int timeout; u32 tf_flags; u16 bcount; - if (drive->media != ide_floppy) { - if (rq_data_dir(rq)) - cmd->tf_flags |= IDE_TFLAG_WRITE; - cmd->rq = rq; - } - if (dev_is_idecd(drive)) { tf_flags = IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL; bcount = ide_cd_get_xferlen(rq); @@ -696,7 +681,9 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive) : WAIT_TAPE_CMD; } - ide_pktcmd_tf_load(drive, tf_flags, bcount); + ide_init_packet_cmd(cmd, tf_flags, bcount, drive->dma); + + (void)do_rw_taskfile(drive, cmd); /* Issue the packet command */ if (drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT) { diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 2f698c6e913f..a6c847d31b9d 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1066,6 +1066,8 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, sector_t block) { + struct ide_cmd cmd; + ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, block: %llu", rq->cmd[0], (unsigned long long)block); @@ -1094,7 +1096,14 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, return ide_stopped; } - return ide_issue_pc(drive); + memset(&cmd, 0, sizeof(cmd)); + + if (rq_data_dir(rq)) + cmd.tf_flags |= IDE_TFLAG_WRITE; + + cmd.rq = rq; + + return ide_issue_pc(drive, &cmd); } /* diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index ee3e77a7a727..f3ed5de3141b 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -130,8 +130,9 @@ 
static void ide_floppy_report_error(struct ide_disk_obj *floppy, } -static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, - struct ide_atapi_pc *pc) +static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive, + struct ide_cmd *cmd, + struct ide_atapi_pc *pc) { struct ide_disk_obj *floppy = drive->driver_data; @@ -157,7 +158,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, pc->retries++; - return ide_issue_pc(drive); + return ide_issue_pc(drive, cmd); } void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *pc) @@ -244,7 +245,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, { struct ide_disk_obj *floppy = drive->driver_data; ide_hwif_t *hwif = drive->hwif; - struct ide_cmd *cmd = &hwif->cmd; + struct ide_cmd cmd; struct ide_atapi_pc *pc; if (drive->debug_mask & IDE_DBG_RQ) @@ -285,21 +286,24 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, goto out_end; } + memset(&cmd, 0, sizeof(cmd)); + if (rq_data_dir(rq)) - cmd->tf_flags |= IDE_TFLAG_WRITE; - cmd->rq = rq; + cmd.tf_flags |= IDE_TFLAG_WRITE; + + cmd.rq = rq; if (blk_fs_request(rq) || pc->req_xfer) { - ide_init_sg_cmd(cmd, rq->nr_sectors); - ide_map_sg(drive, cmd); + ide_init_sg_cmd(&cmd, rq->nr_sectors); + ide_map_sg(drive, &cmd); } pc->sg = hwif->sg_table; - pc->sg_cnt = cmd->sg_nents; + pc->sg_cnt = cmd.sg_nents; pc->rq = rq; - return idefloppy_issue_pc(drive, pc); + return ide_floppy_issue_pc(drive, &cmd, pc); out_end: drive->failed_pc = NULL; if (blk_fs_request(rq) == 0 && rq->errors == 0) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 853d047aa78f..64dfa7458f8d 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -580,7 +580,7 @@ static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, * * The handling will be done in three stages: * - * 1. idetape_issue_pc will send the packet command to the drive, and will set + * 1. ide_tape_issue_pc will send the packet command to the drive, and will set * the interrupt handler to ide_pc_intr. * * 2. On each interrupt, ide_pc_intr will be called. This step will be @@ -608,8 +608,9 @@ static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, * request. */ -static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, - struct ide_atapi_pc *pc) +static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, + struct ide_cmd *cmd, + struct ide_atapi_pc *pc) { idetape_tape_t *tape = drive->driver_data; @@ -654,7 +655,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, pc->retries++; - return ide_issue_pc(drive); + return ide_issue_pc(drive, cmd); } /* A mode sense command is used to "sense" tape parameters. 
*/ @@ -749,6 +750,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, idetape_tape_t *tape = drive->driver_data; struct ide_atapi_pc *pc = NULL; struct request *postponed_rq = tape->postponed_rq; + struct ide_cmd cmd; u8 stat; debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu," @@ -844,7 +846,14 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, BUG(); out: - return idetape_issue_pc(drive, pc); + memset(&cmd, 0, sizeof(cmd)); + + if (rq_data_dir(rq)) + cmd.tf_flags |= IDE_TFLAG_WRITE; + + cmd.rq = rq; + + return ide_tape_issue_pc(drive, &cmd, pc); } /* diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 3b23bd11945e..63ab233ba942 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -100,13 +100,14 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) ide_execute_command(drive, tf->command, handler, WAIT_WORSTCASE, NULL); return ide_started; - default: + case ATA_PROT_DMA: if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || ide_build_sglist(drive, cmd) == 0 || dma_ops->dma_setup(drive, cmd)) return ide_stopped; dma_ops->dma_exec_cmd(drive, tf->command); dma_ops->dma_start(drive); + default: return ide_started; } } diff --git a/include/linux/ide.h b/include/linux/ide.h index b30e79c6ff57..e339d6646552 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1226,7 +1226,7 @@ int ide_cd_expiry(ide_drive_t *); int ide_cd_get_xferlen(struct request *); -ide_startstop_t ide_issue_pc(ide_drive_t *); +ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_cmd *); ide_startstop_t do_rw_taskfile(ide_drive_t *, struct ide_cmd *); -- cgit v1.2.3-71-gd317 From 60c0cd02b254805691cdc61101ada6af7bd56fde Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:46 +0100 Subject: ide: set hwif->expiry prior to calling [__]ide_set_handler() * Set hwif->expiry prior to calling [__]ide_set_handler() and drop 'expiry' argument. * Set hwif->expiry to NULL in ide_{timer_expiry,intr}() and remove 'hwif->expiry = NULL' assignments. There should be no functional changes caused by this patch. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 6 ++++-- drivers/ide/ide-cd.c | 3 ++- drivers/ide/ide-eh.c | 9 ++++----- drivers/ide/ide-io.c | 2 ++ drivers/ide/ide-iops.c | 13 +++++++------ drivers/ide/ide-taskfile.c | 6 +++--- include/linux/ide.h | 6 ++---- 7 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 75df05add1b9..f1b1b71cb74c 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -483,7 +483,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->cmd[0], bcount); next_irq: /* And set the interrupt handler again */ - ide_set_handler(drive, ide_pc_intr, timeout, NULL); + ide_set_handler(drive, ide_pc_intr, timeout); return ide_started; } @@ -602,11 +602,13 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive) } } + hwif->expiry = expiry; + /* Set the interrupt routine */ ide_set_handler(drive, (dev_is_idecd(drive) ? 
drive->irq_handler : ide_pc_intr), - timeout, expiry); + timeout); /* Begin DMA, if necessary */ if (dev_is_idecd(drive)) { diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index a6c847d31b9d..3f630e4080d4 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -959,7 +959,8 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) expiry = ide_cd_expiry; } - ide_set_handler(drive, cdrom_newpc_intr, timeout, expiry); + hwif->expiry = expiry; + ide_set_handler(drive, cdrom_newpc_intr, timeout); return ide_started; end_request: diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index aff1a9b04559..11664976eea3 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -175,8 +175,7 @@ static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive) printk(KERN_INFO "%s: ATAPI reset complete\n", drive->name); else { if (time_before(jiffies, hwif->poll_timeout)) { - ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, - NULL); + ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20); /* continue polling */ return ide_started; } @@ -238,7 +237,7 @@ static ide_startstop_t reset_pollfunc(ide_drive_t *drive) if (!OK_STAT(tmp, 0, ATA_BUSY)) { if (time_before(jiffies, hwif->poll_timeout)) { - ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL); + ide_set_handler(drive, &reset_pollfunc, HZ/20); /* continue polling */ return ide_started; } @@ -355,7 +354,7 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) ndelay(400); hwif->poll_timeout = jiffies + WAIT_WORSTCASE; hwif->polling = 1; - __ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, NULL); + __ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20); spin_unlock_irqrestore(&hwif->lock, flags); return ide_started; } @@ -415,7 +414,7 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) udelay(10); hwif->poll_timeout = jiffies + WAIT_WORSTCASE; hwif->polling = 1; - __ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL); + __ide_set_handler(drive, &reset_pollfunc, HZ/20); /* * Some weird controller like resetting themselves to a strange diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 47404f5526f1..b4901b690c9a 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -651,6 +651,7 @@ void ide_timer_expiry (unsigned long data) } } hwif->handler = NULL; + hwif->expiry = NULL; /* * We need to simulate a real interrupt when invoking * the handler() function, which means we need to @@ -830,6 +831,7 @@ irqreturn_t ide_intr (int irq, void *dev_id) goto out; hwif->handler = NULL; + hwif->expiry = NULL; hwif->req_gen++; del_timer(&hwif->timer); spin_unlock(&hwif->lock); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index c3023de7270c..916495ba45df 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -425,26 +425,25 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) * See also ide_execute_command */ void __ide_set_handler(ide_drive_t *drive, ide_handler_t *handler, - unsigned int timeout, ide_expiry_t *expiry) + unsigned int timeout) { ide_hwif_t *hwif = drive->hwif; BUG_ON(hwif->handler); hwif->handler = handler; - hwif->expiry = expiry; hwif->timer.expires = jiffies + timeout; hwif->req_gen_timer = hwif->req_gen; add_timer(&hwif->timer); } -void ide_set_handler (ide_drive_t *drive, ide_handler_t *handler, - unsigned int timeout, ide_expiry_t *expiry) +void ide_set_handler(ide_drive_t *drive, ide_handler_t *handler, + unsigned int timeout) { ide_hwif_t *hwif = drive->hwif; unsigned long flags; 
spin_lock_irqsave(&hwif->lock, flags); - __ide_set_handler(drive, handler, timeout, expiry); + __ide_set_handler(drive, handler, timeout); spin_unlock_irqrestore(&hwif->lock, flags); } EXPORT_SYMBOL(ide_set_handler); @@ -469,8 +468,10 @@ void ide_execute_command(ide_drive_t *drive, u8 cmd, ide_handler_t *handler, ide_hwif_t *hwif = drive->hwif; unsigned long flags; + hwif->expiry = expiry; + spin_lock_irqsave(&hwif->lock, flags); - __ide_set_handler(drive, handler, timeout, expiry); + __ide_set_handler(drive, handler, timeout); hwif->tp_ops->exec_command(hwif, cmd); /* * Drive takes 400nS to respond, we must avoid the IRQ being diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 63ab233ba942..286804142e4d 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -140,7 +140,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) } else if (custom && tf->command == ATA_CMD_INIT_DEV_PARAMS) { if ((stat & (ATA_ERR | ATA_DRQ)) == 0) { ide_set_handler(drive, &task_no_data_intr, - WAIT_WORSTCASE, NULL); + WAIT_WORSTCASE); return ide_started; } } @@ -347,7 +347,7 @@ static ide_startstop_t task_pio_intr(ide_drive_t *drive) } out_wait: /* Still data left to transfer. */ - ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE, NULL); + ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE); return ide_started; out_end: if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) @@ -377,7 +377,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0) local_irq_disable(); - ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE, NULL); + ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE); ide_pio_datablock(drive, cmd, 1); diff --git a/include/linux/ide.h b/include/linux/ide.h index e339d6646552..476f59885fda 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1135,10 +1135,8 @@ unsigned int ide_rq_bytes(struct request *); int ide_end_rq(ide_drive_t *, struct request *, int, unsigned int); void ide_kill_rq(ide_drive_t *, struct request *); -void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int, - ide_expiry_t *); -void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int, - ide_expiry_t *); +void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); +void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); void ide_execute_command(ide_drive_t *, u8, ide_handler_t *, unsigned int, ide_expiry_t *); -- cgit v1.2.3-71-gd317 From 22117d6eaac50d366d9013c88318a869ea4d8739 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:47 +0100 Subject: ide: add ->dma_timer_expiry method and remove ->dma_exec_cmd one (v2) * Rename dma_timer_expiry() to ide_dma_sff_timer_expiry() and export it. * Add ->dma_timer_expiry method and use it to set hwif->expiry for ATA_PROT_DMA protocol in do_rw_taskfile(). * Initialize ->dma_timer_expiry to ide_dma_sff_timer_expiry() for SFF hosts. * Move setting hwif->expiry from ide_execute_command() to its users and drop 'expiry' argument. * Use ide_execute_command() instead of ->dma_exec_cmd in do_rw_taskfile(). * Remove ->dma_exec_cmd method and its implementations. * Unexport ide_execute_command() and ide_dma_intr(). v2: * Fix CONFIG_BLK_DEV_IDEDMA=n build (noticed by Randy Dunlap). * Fix *dma_expiry naming (suggested by Sergei Shtylyov). There should be no functional changes caused by this patch. 
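For illustration (a sketch only; "foo_dma_ops" is a made-up name, the real instances are converted in the hunks below), a typical SFF host's dma_ops after this change drops .dma_exec_cmd and gains .dma_timer_expiry:

    static const struct ide_dma_ops foo_dma_ops = {
        .dma_host_set        = ide_dma_host_set,
        .dma_setup           = ide_dma_setup,
        .dma_start           = ide_dma_start,
        .dma_end             = ide_dma_end,
        .dma_test_irq        = ide_dma_test_irq,
        .dma_lost_irq        = ide_dma_lost_irq,
        .dma_timer_expiry    = ide_dma_sff_timer_expiry,  /* replaces .dma_exec_cmd */
        .dma_timeout         = ide_dma_timeout,
        .dma_sff_read_status = ide_dma_sff_read_status,
    };
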
Cc: Randy Dunlap Cc: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/alim15x3.c | 2 +- drivers/ide/au1xxx-ide.c | 8 -------- drivers/ide/cmd64x.c | 6 +++--- drivers/ide/cs5536.c | 2 +- drivers/ide/hpt366.c | 6 +++--- drivers/ide/icside.c | 7 ------- drivers/ide/ide-atapi.c | 3 ++- drivers/ide/ide-dma-sff.c | 15 ++++----------- drivers/ide/ide-dma.c | 1 - drivers/ide/ide-iops.c | 6 +----- drivers/ide/ide-taskfile.c | 6 ++++-- drivers/ide/it821x.c | 2 +- drivers/ide/ns87415.c | 2 +- drivers/ide/pdc202xx_old.c | 4 ++-- drivers/ide/pmac.c | 8 -------- drivers/ide/sc1200.c | 2 +- drivers/ide/scc_pata.c | 2 +- drivers/ide/siimage.c | 2 +- drivers/ide/sl82c105.c | 2 +- drivers/ide/tc86c001.c | 2 +- drivers/ide/trm290.c | 6 ------ drivers/ide/tx4939ide.c | 2 +- include/linux/ide.h | 8 ++++---- 23 files changed, 33 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index e837fd9f196f..d516168464fc 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -504,11 +504,11 @@ static const struct ide_port_ops ali_port_ops = { static const struct ide_dma_ops ali_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ali15x3_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 58485d6cb026..d3a9d6c15328 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -290,13 +290,6 @@ static void auide_dma_start(ide_drive_t *drive ) } -static void auide_dma_exec_cmd(ide_drive_t *drive, u8 command) -{ - /* issue cmd to drive */ - ide_execute_command(drive, command, &ide_dma_intr, - (2*WAIT_CMD), NULL); -} - static int auide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { if (auide_build_dmatable(drive, cmd) == 0) { @@ -356,7 +349,6 @@ static void auide_init_dbdma_dev(dbdev_tab_t *dev, u32 dev_id, u32 tsize, u32 de static const struct ide_dma_ops au1xxx_dma_ops = { .dma_host_set = auide_dma_host_set, .dma_setup = auide_dma_setup, - .dma_exec_cmd = auide_dma_exec_cmd, .dma_start = auide_dma_start, .dma_end = auide_dma_end, .dma_test_irq = auide_dma_test_irq, diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index aeee036b1503..bf0e3f470824 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -379,11 +379,11 @@ static const struct ide_port_ops cmd64x_port_ops = { static const struct ide_dma_ops cmd64x_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = cmd64x_dma_end, .dma_test_irq = cmd64x_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -391,11 +391,11 @@ static const struct ide_dma_ops cmd64x_dma_ops = { static const struct ide_dma_ops cmd646_rev1_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = cmd646_1_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -403,11 +403,11 @@ 
static const struct ide_dma_ops cmd646_rev1_dma_ops = { static const struct ide_dma_ops cmd648_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = cmd648_dma_end, .dma_test_irq = cmd648_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c index 7a62db719a46..d5dcf4899607 100644 --- a/drivers/ide/cs5536.c +++ b/drivers/ide/cs5536.c @@ -231,11 +231,11 @@ static const struct ide_port_ops cs5536_port_ops = { static const struct ide_dma_ops cs5536_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = cs5536_dma_start, .dma_end = cs5536_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, }; diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index d3b3e824f445..dbaf184ed9c5 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -1418,11 +1418,11 @@ static const struct ide_port_ops hpt3xx_port_ops = { static const struct ide_dma_ops hpt37x_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = hpt374_dma_end, .dma_test_irq = hpt374_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -1430,11 +1430,11 @@ static const struct ide_dma_ops hpt37x_dma_ops = { static const struct ide_dma_ops hpt370_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = hpt370_dma_start, .dma_end = hpt370_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = hpt370_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -1442,11 +1442,11 @@ static const struct ide_dma_ops hpt370_dma_ops = { static const struct ide_dma_ops hpt36x_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = hpt366_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 3628b2147902..51ce404fe532 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -351,12 +351,6 @@ static int icside_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) return 0; } -static void icside_dma_exec_cmd(ide_drive_t *drive, u8 cmd) -{ - /* issue cmd to drive */ - ide_execute_command(drive, cmd, ide_dma_intr, 2 * WAIT_CMD, NULL); -} - static int icside_dma_test_irq(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; @@ -380,7 +374,6 @@ static int icside_dma_init(ide_hwif_t *hwif, const struct ide_port_info *d) static const struct ide_dma_ops icside_v6_dma_ops = { .dma_host_set = icside_dma_host_set, .dma_setup = icside_dma_setup, - .dma_exec_cmd = icside_dma_exec_cmd, .dma_start = icside_dma_start, .dma_end = icside_dma_end, .dma_test_irq = icside_dma_test_irq, diff --git a/drivers/ide/ide-atapi.c 
b/drivers/ide/ide-atapi.c index f1b1b71cb74c..f7fe1decb59d 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -691,8 +691,9 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) if (drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT) { if (drive->dma) drive->waiting_for_dma = 0; + hwif->expiry = expiry; ide_execute_command(drive, ATA_CMD_PACKET, ide_transfer_pc, - timeout, expiry); + timeout); return ide_started; } else { ide_execute_pkt_cmd(drive); diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index b7eb810c7b8f..75a9ea2e4c82 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -224,7 +224,7 @@ int ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) EXPORT_SYMBOL_GPL(ide_dma_setup); /** - * dma_timer_expiry - handle a DMA timeout + * ide_dma_sff_timer_expiry - handle a DMA timeout * @drive: Drive that timed out * * An IDE DMA transfer timed out. In the event of an error we ask @@ -237,7 +237,7 @@ EXPORT_SYMBOL_GPL(ide_dma_setup); * This can occur if an interrupt is lost or due to hang or bugs. */ -static int dma_timer_expiry(ide_drive_t *drive) +int ide_dma_sff_timer_expiry(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; u8 dma_stat = hwif->dma_ops->dma_sff_read_status(hwif); @@ -261,14 +261,7 @@ static int dma_timer_expiry(ide_drive_t *drive) return 0; /* Status is unknown -- reset the bus */ } - -void ide_dma_exec_cmd(ide_drive_t *drive, u8 command) -{ - /* issue cmd to drive */ - ide_execute_command(drive, command, &ide_dma_intr, 2 * WAIT_CMD, - dma_timer_expiry); -} -EXPORT_SYMBOL_GPL(ide_dma_exec_cmd); +EXPORT_SYMBOL_GPL(ide_dma_sff_timer_expiry); void ide_dma_start(ide_drive_t *drive) { @@ -342,10 +335,10 @@ EXPORT_SYMBOL_GPL(ide_dma_test_irq); const struct ide_dma_ops sff_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, .dma_sff_read_status = ide_dma_sff_read_status, diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index ad4edab9b0a9..3dbf80c15491 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -110,7 +110,6 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) } return ide_error(drive, "dma_intr", stat); } -EXPORT_SYMBOL_GPL(ide_dma_intr); int ide_dma_good_drive(ide_drive_t *drive) { diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 916495ba45df..52c1258ba9f4 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -454,7 +454,6 @@ EXPORT_SYMBOL(ide_set_handler); * @command: command byte to write * @handler: handler for next phase * @timeout: timeout for command - * @expiry: handler to run on timeout * * Helper function to issue an IDE command. 
This handles the * atomicity requirements, command timing and ensures that the @@ -463,13 +462,11 @@ EXPORT_SYMBOL(ide_set_handler); */ void ide_execute_command(ide_drive_t *drive, u8 cmd, ide_handler_t *handler, - unsigned timeout, ide_expiry_t *expiry) + unsigned timeout) { ide_hwif_t *hwif = drive->hwif; unsigned long flags; - hwif->expiry = expiry; - spin_lock_irqsave(&hwif->lock, flags); __ide_set_handler(drive, handler, timeout); hwif->tp_ops->exec_command(hwif, cmd); @@ -482,7 +479,6 @@ void ide_execute_command(ide_drive_t *drive, u8 cmd, ide_handler_t *handler, ndelay(400); spin_unlock_irqrestore(&hwif->lock, flags); } -EXPORT_SYMBOL(ide_execute_command); void ide_execute_pkt_cmd(ide_drive_t *drive) { diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 286804142e4d..f5cf04cf5712 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -98,14 +98,16 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) if (handler == NULL) handler = task_no_data_intr; ide_execute_command(drive, tf->command, handler, - WAIT_WORSTCASE, NULL); + WAIT_WORSTCASE); return ide_started; case ATA_PROT_DMA: if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || ide_build_sglist(drive, cmd) == 0 || dma_ops->dma_setup(drive, cmd)) return ide_stopped; - dma_ops->dma_exec_cmd(drive, tf->command); + hwif->expiry = dma_ops->dma_timer_expiry; + ide_execute_command(drive, tf->command, ide_dma_intr, + 2 * WAIT_CMD); dma_ops->dma_start(drive); default: return ide_started; diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 6b9fc950b4af..0d4ac65cf949 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -508,10 +508,10 @@ static void it821x_quirkproc(ide_drive_t *drive) static struct ide_dma_ops it821x_pass_through_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = it821x_dma_start, .dma_end = it821x_dma_end, .dma_test_irq = ide_dma_test_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, .dma_sff_read_status = ide_dma_sff_read_status, diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index cf6d9a9c8a27..7b65fe5bf449 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -301,11 +301,11 @@ static const struct ide_port_ops ns87415_port_ops = { static const struct ide_dma_ops ns87415_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ns87415_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = ns87415_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = superio_dma_sff_read_status, }; diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index cba66ebce4e3..f7536d1943f7 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -331,11 +331,11 @@ static const struct ide_port_ops pdc2026x_port_ops = { static const struct ide_dma_ops pdc20246_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = pdc202xx_dma_test_irq, .dma_lost_irq = pdc202xx_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = pdc202xx_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -343,11 +343,11 @@ static const struct ide_dma_ops pdc20246_dma_ops = { 
static const struct ide_dma_ops pdc2026x_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = pdc202xx_dma_start, .dma_end = pdc202xx_dma_end, .dma_test_irq = pdc202xx_dma_test_irq, .dma_lost_irq = pdc202xx_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = pdc202xx_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 337d2d5b3028..2bfcfedaa076 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1527,13 +1527,6 @@ static int pmac_ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) return 0; } -static void -pmac_ide_dma_exec_cmd(ide_drive_t *drive, u8 command) -{ - /* issue cmd to drive */ - ide_execute_command(drive, command, &ide_dma_intr, 2*WAIT_CMD, NULL); -} - /* * Kick the DMA controller into life after the DMA command has been issued * to the drive. @@ -1654,7 +1647,6 @@ pmac_ide_dma_lost_irq (ide_drive_t *drive) static const struct ide_dma_ops pmac_dma_ops = { .dma_host_set = pmac_ide_dma_host_set, .dma_setup = pmac_ide_dma_setup, - .dma_exec_cmd = pmac_ide_dma_exec_cmd, .dma_start = pmac_ide_dma_start, .dma_end = pmac_ide_dma_end, .dma_test_irq = pmac_ide_dma_test_irq, diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index dbdd2985a0d8..1c3a82914999 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -286,11 +286,11 @@ static const struct ide_port_ops sc1200_port_ops = { static const struct ide_dma_ops sc1200_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = sc1200_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 1f2805ce9889..0cc137cfe76d 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -868,12 +868,12 @@ static const struct ide_port_ops scc_port_ops = { static const struct ide_dma_ops scc_dma_ops = { .dma_host_set = scc_dma_host_set, .dma_setup = scc_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = scc_dma_start, .dma_end = scc_dma_end, .dma_test_irq = scc_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timeout = ide_dma_timeout, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = scc_dma_sff_read_status, }; diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index 1811ae9cd843..075cb1243b2a 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -711,10 +711,10 @@ static const struct ide_port_ops sil_sata_port_ops = { static const struct ide_dma_ops sil_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = siimage_dma_test_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, .dma_sff_read_status = ide_dma_sff_read_status, diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index dba213c51baa..d25137b04e7a 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -293,11 +293,11 @@ static const struct ide_port_ops sl82c105_port_ops = { static const struct ide_dma_ops sl82c105_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, 
.dma_start = sl82c105_dma_start, .dma_end = sl82c105_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = sl82c105_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = sl82c105_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c index 84109f5a1632..427d4b3c2c63 100644 --- a/drivers/ide/tc86c001.c +++ b/drivers/ide/tc86c001.c @@ -182,11 +182,11 @@ static const struct ide_port_ops tc86c001_port_ops = { static const struct ide_dma_ops tc86c001_dma_ops = { .dma_host_set = ide_dma_host_set, .dma_setup = ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = tc86c001_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index 746858a7338d..ed1496845a93 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -176,11 +176,6 @@ static void trm290_selectproc (ide_drive_t *drive) trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); } -static void trm290_dma_exec_cmd(ide_drive_t *drive, u8 command) -{ - ide_execute_command(drive, command, &ide_dma_intr, WAIT_CMD, NULL); -} - static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; @@ -315,7 +310,6 @@ static const struct ide_port_ops trm290_port_ops = { static struct ide_dma_ops trm290_dma_ops = { .dma_host_set = trm290_dma_host_set, .dma_setup = trm290_dma_setup, - .dma_exec_cmd = trm290_dma_exec_cmd, .dma_start = trm290_dma_start, .dma_end = trm290_dma_end, .dma_test_irq = trm290_dma_test_irq, diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 39e3316ab63f..e0e0a803dde3 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -627,11 +627,11 @@ static const struct ide_port_ops tx4939ide_port_ops = { static const struct ide_dma_ops tx4939ide_dma_ops = { .dma_host_set = tx4939ide_dma_host_set, .dma_setup = tx4939ide_dma_setup, - .dma_exec_cmd = ide_dma_exec_cmd, .dma_start = ide_dma_start, .dma_end = tx4939ide_dma_end, .dma_test_irq = tx4939ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_timeout = ide_dma_timeout, .dma_sff_read_status = tx4939ide_dma_sff_read_status, }; diff --git a/include/linux/ide.h b/include/linux/ide.h index 476f59885fda..9476939101be 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -715,11 +715,11 @@ struct ide_port_ops { struct ide_dma_ops { void (*dma_host_set)(struct ide_drive_s *, int); int (*dma_setup)(struct ide_drive_s *, struct ide_cmd *); - void (*dma_exec_cmd)(struct ide_drive_s *, u8); void (*dma_start)(struct ide_drive_s *); int (*dma_end)(struct ide_drive_s *); int (*dma_test_irq)(struct ide_drive_s *); void (*dma_lost_irq)(struct ide_drive_s *); + int (*dma_timer_expiry)(struct ide_drive_s *); void (*dma_timeout)(struct ide_drive_s *); /* * The following method is optional and only required to be @@ -1138,8 +1138,7 @@ void ide_kill_rq(ide_drive_t *, struct request *); void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); -void ide_execute_command(ide_drive_t *, u8, ide_handler_t *, unsigned int, - ide_expiry_t *); +void ide_execute_command(ide_drive_t *, u8, ide_handler_t *, unsigned int); void 
ide_execute_pkt_cmd(ide_drive_t *); @@ -1453,10 +1452,10 @@ int config_drive_for_dma(ide_drive_t *); int ide_build_dmatable(ide_drive_t *, struct ide_cmd *); void ide_dma_host_set(ide_drive_t *, int); int ide_dma_setup(ide_drive_t *, struct ide_cmd *); -void ide_dma_exec_cmd(ide_drive_t *, u8); extern void ide_dma_start(ide_drive_t *); int ide_dma_end(ide_drive_t *); int ide_dma_test_irq(ide_drive_t *); +int ide_dma_sff_timer_expiry(ide_drive_t *); u8 ide_dma_sff_read_status(ide_hwif_t *); extern const struct ide_dma_ops sff_dma_ops; #else @@ -1477,6 +1476,7 @@ static inline void ide_dma_on(ide_drive_t *drive) { ; } static inline void ide_dma_verbose(ide_drive_t *drive) { ; } static inline int ide_set_dma(ide_drive_t *drive) { return 1; } static inline void ide_check_dma_crc(ide_drive_t *drive) { ; } +static inline ide_startstop_t ide_dma_intr(ide_drive_t *drive) { return ide_stopped; } static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { return ide_stopped; } static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } static inline int ide_build_sglist(ide_drive_t *drive, -- cgit v1.2.3-71-gd317 From 35b5d0be3d8de9a5ac51471c12029fb115200cdc Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:47 +0100 Subject: ide: remove ide_execute_pkt_cmd() (v2) * Pass command structure to ide_execute_command() and skip __ide_set_handler() for ATAPI protocols on non-DRQ devices. * Convert ide_issue_pc() to always use ide_execute_command() and remove no longer needed ide_execute_pkt_cmd(). v2: * Fix for non-DRQ devices (based on report from Borislav). There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 15 +++++++-------- drivers/ide/ide-iops.c | 25 ++++++++----------------- drivers/ide/ide-taskfile.c | 6 ++---- include/linux/ide.h | 5 ++--- 4 files changed, 19 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index f7fe1decb59d..2fb5d28a9be5 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -493,6 +493,7 @@ static void ide_init_packet_cmd(struct ide_cmd *cmd, u32 tf_flags, cmd->protocol = dma ? ATAPI_PROT_DMA : ATAPI_PROT_PIO; cmd->tf_flags |= IDE_TFLAG_OUT_LBAH | IDE_TFLAG_OUT_LBAM | IDE_TFLAG_OUT_FEATURE | tf_flags; + cmd->tf.command = ATA_CMD_PACKET; cmd->tf.feature = dma; /* Use PIO/DMA */ cmd->tf.lbam = bcount & 0xff; cmd->tf.lbah = (bcount >> 8) & 0xff; @@ -638,6 +639,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) unsigned int timeout; u32 tf_flags; u16 bcount; + u8 drq_int = !!(drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT); if (dev_is_idecd(drive)) { tf_flags = IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL; @@ -687,17 +689,14 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) (void)do_rw_taskfile(drive, cmd); - /* Issue the packet command */ - if (drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT) { + if (drq_int) { if (drive->dma) drive->waiting_for_dma = 0; hwif->expiry = expiry; - ide_execute_command(drive, ATA_CMD_PACKET, ide_transfer_pc, - timeout); - return ide_started; - } else { - ide_execute_pkt_cmd(drive); - return ide_transfer_pc(drive); } + + ide_execute_command(drive, cmd, ide_transfer_pc, timeout); + + return drq_int ? 
ide_started : ide_transfer_pc(drive); } EXPORT_SYMBOL_GPL(ide_issue_pc); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 52c1258ba9f4..5403e4a44be4 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -451,7 +451,7 @@ EXPORT_SYMBOL(ide_set_handler); /** * ide_execute_command - execute an IDE command * @drive: IDE drive to issue the command against - * @command: command byte to write + * @cmd: command * @handler: handler for next phase * @timeout: timeout for command * @@ -461,15 +461,18 @@ EXPORT_SYMBOL(ide_set_handler); * should go via this function or do equivalent locking. */ -void ide_execute_command(ide_drive_t *drive, u8 cmd, ide_handler_t *handler, - unsigned timeout) +void ide_execute_command(ide_drive_t *drive, struct ide_cmd *cmd, + ide_handler_t *handler, unsigned timeout) { ide_hwif_t *hwif = drive->hwif; unsigned long flags; spin_lock_irqsave(&hwif->lock, flags); - __ide_set_handler(drive, handler, timeout); - hwif->tp_ops->exec_command(hwif, cmd); + if ((cmd->protocol != ATAPI_PROT_DMA && + cmd->protocol != ATAPI_PROT_PIO) || + (drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT)) + __ide_set_handler(drive, handler, timeout); + hwif->tp_ops->exec_command(hwif, cmd->tf.command); /* * Drive takes 400nS to respond, we must avoid the IRQ being * serviced before that. @@ -480,18 +483,6 @@ void ide_execute_command(ide_drive_t *drive, u8 cmd, ide_handler_t *handler, spin_unlock_irqrestore(&hwif->lock, flags); } -void ide_execute_pkt_cmd(ide_drive_t *drive) -{ - ide_hwif_t *hwif = drive->hwif; - unsigned long flags; - - spin_lock_irqsave(&hwif->lock, flags); - hwif->tp_ops->exec_command(hwif, ATA_CMD_PACKET); - ndelay(400); - spin_unlock_irqrestore(&hwif->lock, flags); -} -EXPORT_SYMBOL_GPL(ide_execute_pkt_cmd); - /* * ide_wait_not_busy() waits for the currently selected device on the hwif * to report a non-busy status, see comments in ide_probe_port(). 
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index f5cf04cf5712..329fd6f13f79 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -97,8 +97,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) case ATA_PROT_NODATA: if (handler == NULL) handler = task_no_data_intr; - ide_execute_command(drive, tf->command, handler, - WAIT_WORSTCASE); + ide_execute_command(drive, cmd, handler, WAIT_WORSTCASE); return ide_started; case ATA_PROT_DMA: if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || @@ -106,8 +105,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) dma_ops->dma_setup(drive, cmd)) return ide_stopped; hwif->expiry = dma_ops->dma_timer_expiry; - ide_execute_command(drive, tf->command, ide_dma_intr, - 2 * WAIT_CMD); + ide_execute_command(drive, cmd, ide_dma_intr, 2 * WAIT_CMD); dma_ops->dma_start(drive); default: return ide_started; diff --git a/include/linux/ide.h b/include/linux/ide.h index 9476939101be..2d0c7afd5e58 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1138,9 +1138,8 @@ void ide_kill_rq(ide_drive_t *, struct request *); void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); -void ide_execute_command(ide_drive_t *, u8, ide_handler_t *, unsigned int); - -void ide_execute_pkt_cmd(ide_drive_t *); +void ide_execute_command(ide_drive_t *, struct ide_cmd *, ide_handler_t *, + unsigned int); void ide_pad_transfer(ide_drive_t *, int, int); -- cgit v1.2.3-71-gd317 From bf717c0a2e18dbe82eeb28e57b0abede3cdf45d6 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 27 Mar 2009 12:46:47 +0100 Subject: ide: keep track of number of bytes instead of sectors in struct ide_cmd * Pass number of bytes instead of sectors to ide_init_sg_cmd(). * Pass number of bytes to process to ide_pio_sector() and rename it to ide_pio_bytes(). * Rename ->nsect field to ->nbytes in struct ide_cmd and use ->nbytes, ->nleft and ->cursg_ofs to keep track of number of bytes instead of sectors. There should be no functional changes caused by this patch. 
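A minimal, self-contained model of the accounting change (hypothetical names, not the kernel structures): the command tracks bytes directly, so single-sector PIO, multi-sector PIO and partial transfers all decrement the same byte counters without converting back and forth through SECTOR_SIZE.

        #include <stdio.h>

        #define SECTOR_SIZE 512

        struct cmd {                        /* models the relevant part of struct ide_cmd */
                unsigned int nbytes;        /* was ->nsect (sectors), now a byte count    */
                unsigned int nleft;         /* bytes still to transfer                    */
                unsigned int cursg_ofs;     /* byte offset into the current sg entry      */
        };

        static void init_sg_cmd(struct cmd *c, unsigned int nr_bytes)
        {
                c->nbytes = c->nleft = nr_bytes;   /* callers now pass nsectors << 9 */
                c->cursg_ofs = 0;
        }

        /* models ide_pio_bytes(): consumes an arbitrary byte count per call */
        static void pio_bytes(struct cmd *c, unsigned int nr_bytes)
        {
                c->nleft -= nr_bytes;
                c->cursg_ofs += nr_bytes;
        }

        int main(void)
        {
                struct cmd c;

                init_sg_cmd(&c, 8 * SECTOR_SIZE);  /* e.g. an 8-sector request       */
                pio_bytes(&c, SECTOR_SIZE);        /* single-sector PIO step         */
                pio_bytes(&c, 4 * SECTOR_SIZE);    /* multi-sector (mult_count == 4) */
                printf("%u of %u bytes left\n", c.nleft, c.nbytes);
                return 0;
        }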
Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-disk.c | 4 ++-- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 6 +++--- drivers/ide/ide-taskfile.c | 32 ++++++++++++++++---------------- include/linux/ide.h | 4 ++-- 5 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 4b32c4eb7b82..ca934c8a1289 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -152,7 +152,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, cmd.rq = rq; if (dma == 0) { - ide_init_sg_cmd(&cmd, nsectors); + ide_init_sg_cmd(&cmd, nsectors << 9); ide_map_sg(drive, &cmd); } @@ -162,7 +162,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, /* fallback to PIO */ cmd.tf_flags |= IDE_TFLAG_DMA_PIO_FALLBACK; ide_tf_set_cmd(drive, &cmd, 0); - ide_init_sg_cmd(&cmd, nsectors); + ide_init_sg_cmd(&cmd, nsectors << 9); rc = do_rw_taskfile(drive, &cmd); } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index f3ed5de3141b..7ae662334835 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -294,7 +294,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, cmd.rq = rq; if (blk_fs_request(rq) || pc->req_xfer) { - ide_init_sg_cmd(&cmd, rq->nr_sectors); + ide_init_sg_cmd(&cmd, rq->nr_sectors << 9); ide_map_sg(drive, &cmd); } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index b4901b690c9a..1adc5e2e7fb3 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -245,9 +245,9 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) } EXPORT_SYMBOL_GPL(ide_map_sg); -void ide_init_sg_cmd(struct ide_cmd *cmd, int nsect) +void ide_init_sg_cmd(struct ide_cmd *cmd, unsigned int nr_bytes) { - cmd->nsect = cmd->nleft = nsect; + cmd->nbytes = cmd->nleft = nr_bytes; cmd->cursg_ofs = 0; cmd->cursg = NULL; } @@ -272,7 +272,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, if (cmd) { if (cmd->protocol == ATA_PROT_PIO) { - ide_init_sg_cmd(cmd, rq->nr_sectors); + ide_init_sg_cmd(cmd, rq->nr_sectors << 9); ide_map_sg(drive, cmd); } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 329fd6f13f79..84532be97c00 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -188,8 +188,8 @@ static u8 wait_drive_not_busy(ide_drive_t *drive) return stat; } -static void ide_pio_sector(ide_drive_t *drive, struct ide_cmd *cmd, - unsigned int write) +static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, + unsigned int write, unsigned int nr_bytes) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; @@ -208,7 +208,7 @@ static void ide_pio_sector(ide_drive_t *drive, struct ide_cmd *cmd, } page = sg_page(cursg); - offset = cursg->offset + cmd->cursg_ofs * SECTOR_SIZE; + offset = cursg->offset + cmd->cursg_ofs; /* get the current page and offset */ page = nth_page(page, (offset >> PAGE_SHIFT)); @@ -219,19 +219,19 @@ static void ide_pio_sector(ide_drive_t *drive, struct ide_cmd *cmd, #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; - cmd->nleft--; - cmd->cursg_ofs++; + cmd->nleft -= nr_bytes; + cmd->cursg_ofs += nr_bytes; - if ((cmd->cursg_ofs * SECTOR_SIZE) == cursg->length) { + if (cmd->cursg_ofs == cursg->length) { cmd->cursg = sg_next(cmd->cursg); cmd->cursg_ofs = 0; } /* do the actual data transfer */ if (write) - hwif->tp_ops->output_data(drive, cmd, buf, SECTOR_SIZE); + 
hwif->tp_ops->output_data(drive, cmd, buf, nr_bytes); else - hwif->tp_ops->input_data(drive, cmd, buf, SECTOR_SIZE); + hwif->tp_ops->input_data(drive, cmd, buf, nr_bytes); kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM @@ -244,9 +244,9 @@ static void ide_pio_multi(ide_drive_t *drive, struct ide_cmd *cmd, { unsigned int nsect; - nsect = min_t(unsigned int, cmd->nleft, drive->mult_count); + nsect = min_t(unsigned int, cmd->nleft >> 9, drive->mult_count); while (nsect--) - ide_pio_sector(drive, cmd, write); + ide_pio_bytes(drive, cmd, write, SECTOR_SIZE); } static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, @@ -265,7 +265,7 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, if (cmd->tf_flags & IDE_TFLAG_MULTI_PIO) ide_pio_multi(drive, cmd, write); else - ide_pio_sector(drive, cmd, write); + ide_pio_bytes(drive, cmd, write, SECTOR_SIZE); drive->io_32bit = saved_io_32bit; } @@ -273,18 +273,18 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, static void ide_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) { if (cmd->tf_flags & IDE_TFLAG_FS) { - int sectors = cmd->nsect - cmd->nleft; + int nr_bytes = cmd->nbytes - cmd->nleft; if (cmd->protocol == ATA_PROT_PIO && ((cmd->tf_flags & IDE_TFLAG_WRITE) || cmd->nleft == 0)) { if (cmd->tf_flags & IDE_TFLAG_MULTI_PIO) - sectors -= drive->mult_count; + nr_bytes -= drive->mult_count << 9; else - sectors--; + nr_bytes -= SECTOR_SIZE; } - if (sectors > 0) - ide_complete_rq(drive, 0, sectors << 9); + if (nr_bytes > 0) + ide_complete_rq(drive, 0, nr_bytes); } } diff --git a/include/linux/ide.h b/include/linux/ide.h index 2d0c7afd5e58..d5d832271f44 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -350,7 +350,7 @@ struct ide_cmd { int orig_sg_nents; int sg_dma_direction; /* DMA transfer direction */ - unsigned int nsect; + unsigned int nbytes; unsigned int nleft; struct scatterlist *cursg; unsigned int cursg_ofs; @@ -1409,7 +1409,7 @@ int ide_pci_resume(struct pci_dev *); #endif void ide_map_sg(ide_drive_t *, struct ide_cmd *); -void ide_init_sg_cmd(struct ide_cmd *, int); +void ide_init_sg_cmd(struct ide_cmd *, unsigned int); #define BAD_DMA_DRIVE 0 #define GOOD_DMA_DRIVE 1 -- cgit v1.2.3-71-gd317 From 05fd8e73e1357feaea9c48938d937eae76b4aef4 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Sat, 7 Mar 2009 12:55:49 +0100 Subject: clkdev: add possibility to get a clock based on the device name This adds clk_get_sys to get a clock without the associated struct device. Signed-off-by: Sascha Hauer --- arch/arm/common/clkdev.c | 11 +++++++++-- include/linux/clk.h | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/common/clkdev.c b/arch/arm/common/clkdev.c index 1037bba18329..5589444ff437 100644 --- a/arch/arm/common/clkdev.c +++ b/arch/arm/common/clkdev.c @@ -62,9 +62,8 @@ static struct clk *clk_find(const char *dev_id, const char *con_id) return clk; } -struct clk *clk_get(struct device *dev, const char *con_id) +struct clk *clk_get_sys(const char *dev_id, const char *con_id) { - const char *dev_id = dev ? dev_name(dev) : NULL; struct clk *clk; mutex_lock(&clocks_mutex); @@ -75,6 +74,14 @@ struct clk *clk_get(struct device *dev, const char *con_id) return clk ? clk : ERR_PTR(-ENOENT); } +EXPORT_SYMBOL(clk_get_sys); + +struct clk *clk_get(struct device *dev, const char *con_id) +{ + const char *dev_id = dev ? 
dev_name(dev) : NULL; + + return clk_get_sys(dev_id, con_id); +} EXPORT_SYMBOL(clk_get); void clk_put(struct clk *clk) diff --git a/include/linux/clk.h b/include/linux/clk.h index 778777316ea4..1db9bbf444a3 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -125,4 +125,21 @@ int clk_set_parent(struct clk *clk, struct clk *parent); */ struct clk *clk_get_parent(struct clk *clk); +/** + * clk_get_sys - get a clock based upon the device name + * @dev_id: device name + * @con_id: connection ID + * + * Returns a struct clk corresponding to the clock producer, or + * valid IS_ERR() condition containing errno. The implementation + * uses @dev_id and @con_id to determine the clock consumer, and + * thereby the clock producer. In contrast to clk_get() this function + * takes the device name instead of the device itself for identification. + * + * Drivers must assume that the clock source is not enabled. + * + * clk_get_sys should not be called from within interrupt context. + */ +struct clk *clk_get_sys(const char *dev_id, const char *con_id); + #endif -- cgit v1.2.3-71-gd317 From 9b57896e62bfa752ee7435e6cfe57fb210c0db8c Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Mar 2009 21:58:14 +0900 Subject: sony-laptop: Add support for extra keyboard events The current sony-laptop code assumes that the keyboard event method is always located at slot 2 in the platform code. Remove this assumption and add support for some additional hotkeys. Signed-off-by: Matthew Garrett Signed-off-by: Mattia Dongili Signed-off-by: Len Brown --- drivers/platform/x86/sony-laptop.c | 59 +++++++++++++++++++++----------------- include/linux/sonypi.h | 1 + 2 files changed, 33 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 04deed826180..f6cdc8929fd7 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -211,6 +211,7 @@ static int sony_laptop_input_index[] = { 48, /* 61 SONYPI_EVENT_WIRELESS_OFF */ 49, /* 62 SONYPI_EVENT_ZOOM_IN_PRESSED */ 50, /* 63 SONYPI_EVENT_ZOOM_OUT_PRESSED */ + 51, /* 64 SONYPI_EVENT_CD_EJECT_PRESSED */ }; static int sony_laptop_input_keycode_map[] = { @@ -264,7 +265,8 @@ static int sony_laptop_input_keycode_map[] = { KEY_WLAN, /* 47 SONYPI_EVENT_WIRELESS_ON */ KEY_WLAN, /* 48 SONYPI_EVENT_WIRELESS_OFF */ KEY_ZOOMIN, /* 49 SONYPI_EVENT_ZOOM_IN_PRESSED */ - KEY_ZOOMOUT /* 50 SONYPI_EVENT_ZOOM_OUT_PRESSED */ + KEY_ZOOMOUT, /* 50 SONYPI_EVENT_ZOOM_OUT_PRESSED */ + KEY_EJECTCD /* 51 SONYPI_EVENT_CD_EJECT_PRESSED */ }; /* release buttons after a short delay if pressed */ @@ -834,7 +836,11 @@ struct sony_nc_event { u8 event; }; -static struct sony_nc_event sony_C_events[] = { +static struct sony_nc_event sony_nc_events[] = { + { 0x90, SONYPI_EVENT_PKEY_P1 }, + { 0x10, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x91, SONYPI_EVENT_PKEY_P1 }, + { 0x11, SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0x81, SONYPI_EVENT_FNKEY_F1 }, { 0x01, SONYPI_EVENT_FNKEY_RELEASED }, { 0x85, SONYPI_EVENT_FNKEY_F5 }, @@ -843,10 +849,14 @@ static struct sony_nc_event sony_C_events[] = { { 0x06, SONYPI_EVENT_FNKEY_RELEASED }, { 0x87, SONYPI_EVENT_FNKEY_F7 }, { 0x07, SONYPI_EVENT_FNKEY_RELEASED }, + { 0x89, SONYPI_EVENT_FNKEY_F9 }, + { 0x09, SONYPI_EVENT_FNKEY_RELEASED }, { 0x8A, SONYPI_EVENT_FNKEY_F10 }, { 0x0A, SONYPI_EVENT_FNKEY_RELEASED }, { 0x8C, SONYPI_EVENT_FNKEY_F12 }, { 0x0C, SONYPI_EVENT_FNKEY_RELEASED }, + { 0x9f, SONYPI_EVENT_CD_EJECT_PRESSED }, + { 0x1f, 
SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0, 0 }, }; @@ -855,38 +865,33 @@ static struct sony_nc_event sony_C_events[] = { */ static void sony_acpi_notify(acpi_handle handle, u32 event, void *data) { - int i; u32 ev = event; int result; - if (ev == 0x92 || ev == 0x90) { + if (ev >= 0x90) { + /* New-style event */ int origev = ev; - /* read the key pressed from EC.GECR - * A call to SN07 with 0x0202 will do it as well respecting - * the current protocol on different OSes - * - * Note: the path for GECR may be - * \_SB.PCI0.LPCB.EC (C, FE, AR, N and friends) - * \_SB.PCI0.PIB.EC0 (VGN-FR notifications are sent directly, no GECR) - * - * TODO: we may want to do the same for the older GHKE -need - * dmi list- so this snippet may become one more callback. - */ - if (sony_call_snc_handle(0x100, 0x200, &result)) - dprintk("sony_acpi_notify, unable to decode event 0x%.2x\n", ev); - else - ev = result & 0xFF; + ev -= 0x90; - for (i = 0; sony_C_events[i].data; i++) { - if (sony_C_events[i].data == ev) { - ev = sony_C_events[i].event; - break; + if (sony_find_snc_handle(0x100) == ev) { + int i; + + if (sony_call_snc_handle(0x100, 0x200, &result)) + dprintk("sony_acpi_notify, unable to decode event 0x%.2x\n", ev); + else + ev = result & 0xFF; + + for (i = 0; sony_nc_events[i].event; i++) { + if (sony_nc_events[i].data == ev) { + ev = sony_nc_events[i].event; + break; + } } - } - if (!sony_C_events[i].data) - printk(KERN_INFO DRV_PFX "Unknown event: %x %x\n", - origev, ev); + if (!sony_nc_events[i].data) + printk(KERN_INFO DRV_PFX + "Unknown event: %x %x\n", origev, ev); + } } dprintk("sony_acpi_notify, event: 0x%.2x\n", ev); diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index f41ffd7c2dd9..8458dbe95862 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -103,6 +103,7 @@ #define SONYPI_EVENT_WIRELESS_OFF 61 #define SONYPI_EVENT_ZOOM_IN_PRESSED 62 #define SONYPI_EVENT_ZOOM_OUT_PRESSED 63 +#define SONYPI_EVENT_CD_EJECT_PRESSED 64 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3-71-gd317 From 45c7942ba8f6b7d5d1147c10f84f0cbf5fa3a2b8 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Mar 2009 21:58:16 +0900 Subject: sony-laptop: Add support for extended hotkeys Recent Sony SR-series machines have an additional set of buttons accessed via the 0x127 method rather than the 0x100 method. Add support for these. 
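The pattern both sony-laptop key patches rely on, shown as a small standalone program (table contents and names are made up for illustration, not the real SONYPI codes): raw firmware codes are translated through a per-handle table of {data, event} pairs, with the table selected by which SNC handle reported the event.

        #include <stdio.h>

        struct nc_event {                       /* models struct sony_nc_event       */
                unsigned char data;             /* raw code reported by the firmware */
                unsigned char event;            /* translated event number           */
        };

        static const struct nc_event table_100[] = {   /* illustrative entries only */
                { 0x81, 10 }, { 0x01, 99 }, { 0, 0 }
        };
        static const struct nc_event table_127[] = {
                { 0x82, 20 }, { 0x02, 99 }, { 0, 0 }
        };

        static int translate(const struct nc_event *t, unsigned char raw)
        {
                for (; t->data; t++)
                        if (t->data == raw)
                                return t->event;
                return -1;                      /* unknown event */
        }

        int main(void)
        {
                int handle = 0x127;             /* pretend the 0x127 method fired */
                const struct nc_event *t = (handle == 0x127) ? table_127 : table_100;

                printf("raw 0x82 -> event %d\n", translate(t, 0x82));
                return 0;
        }

Adding a new button set then means adding one table and one handle check rather than another hard-coded decode branch.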
Signed-off-by: Matthew Garrett Signed-off-by: Mattia Dongili Signed-off-by: Len Brown --- drivers/platform/x86/sony-laptop.c | 65 +++++++++++++++++++++++++++++++------- include/linux/sonypi.h | 4 +++ 2 files changed, 57 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index f458870c30b6..e000c9f6cdf5 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -226,6 +226,10 @@ static int sony_laptop_input_index[] = { 49, /* 62 SONYPI_EVENT_ZOOM_IN_PRESSED */ 50, /* 63 SONYPI_EVENT_ZOOM_OUT_PRESSED */ 51, /* 64 SONYPI_EVENT_CD_EJECT_PRESSED */ + 52, /* 65 SONYPI_EVENT_MODEKEY_PRESSED */ + 53, /* 66 SONYPI_EVENT_PKEY_P4 */ + 54, /* 67 SONYPI_EVENT_PKEY_P5 */ + 55, /* 68 SONYPI_EVENT_SETTINGKEY_PRESSED */ }; static int sony_laptop_input_keycode_map[] = { @@ -280,7 +284,11 @@ static int sony_laptop_input_keycode_map[] = { KEY_WLAN, /* 48 SONYPI_EVENT_WIRELESS_OFF */ KEY_ZOOMIN, /* 49 SONYPI_EVENT_ZOOM_IN_PRESSED */ KEY_ZOOMOUT, /* 50 SONYPI_EVENT_ZOOM_OUT_PRESSED */ - KEY_EJECTCD /* 51 SONYPI_EVENT_CD_EJECT_PRESSED */ + KEY_EJECTCD, /* 51 SONYPI_EVENT_CD_EJECT_PRESSED */ + KEY_F13, /* 52 SONYPI_EVENT_MODEKEY_PRESSED */ + KEY_PROG4, /* 53 SONYPI_EVENT_PKEY_P4 */ + KEY_F14, /* 54 SONYPI_EVENT_PKEY_P5 */ + KEY_F15, /* 55 SONYPI_EVENT_SETTINGKEY_PRESSED */ }; /* release buttons after a short delay if pressed */ @@ -850,7 +858,7 @@ struct sony_nc_event { u8 event; }; -static struct sony_nc_event sony_nc_events[] = { +static struct sony_nc_event sony_100_events[] = { { 0x90, SONYPI_EVENT_PKEY_P1 }, { 0x10, SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0x91, SONYPI_EVENT_PKEY_P1 }, @@ -874,6 +882,25 @@ static struct sony_nc_event sony_nc_events[] = { { 0, 0 }, }; +static struct sony_nc_event sony_127_events[] = { + { 0x81, SONYPI_EVENT_MODEKEY_PRESSED }, + { 0x01, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x82, SONYPI_EVENT_PKEY_P1 }, + { 0x02, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x83, SONYPI_EVENT_PKEY_P2 }, + { 0x03, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x84, SONYPI_EVENT_PKEY_P3 }, + { 0x04, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x85, SONYPI_EVENT_PKEY_P4 }, + { 0x05, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x86, SONYPI_EVENT_PKEY_P5 }, + { 0x06, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x06, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0x87, SONYPI_EVENT_SETTINGKEY_PRESSED }, + { 0x07, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0, 0 }, +}; + /* * ACPI callbacks */ @@ -884,27 +911,41 @@ static void sony_acpi_notify(acpi_handle handle, u32 event, void *data) if (ev >= 0x90) { /* New-style event */ - int origev = ev; + int key_handle = 0; ev -= 0x90; - if (sony_find_snc_handle(0x100) == ev) { - int i; + if (sony_find_snc_handle(0x100) == ev) + key_handle = 0x100; + if (sony_find_snc_handle(0x127) == ev) + key_handle = 0x127; + + if (handle) { + struct sony_nc_event *key_event; - if (sony_call_snc_handle(0x100, 0x200, &result)) - dprintk("sony_acpi_notify, unable to decode event 0x%.2x\n", ev); + if (sony_call_snc_handle(key_handle, 0x200, &result)) + dprintk("sony_acpi_notify, unable to decode" + " event 0x%.2x 0x%.2x\n", key_handle, + ev); else ev = result & 0xFF; - for (i = 0; sony_nc_events[i].event; i++) { - if (sony_nc_events[i].data == ev) { - ev = sony_nc_events[i].event; + if (key_handle == 0x100) + key_event = sony_100_events; + else + key_event = sony_127_events; + + for (; key_event->data; key_event++) { + if (key_event->data == ev) { + ev = key_event->event; break; } } - if 
(!sony_nc_events[i].data) + if (!key_event->data) { printk(KERN_INFO DRV_PFX - "Unknown event: %x %x\n", origev, ev); + "Unknown event: 0x%x 0x%x\n", key_handle, + ev); + } } else if (sony_find_snc_handle(0x124) == ev) { sony_nc_rfkill_update(); return; diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index 8458dbe95862..bb835019ac7f 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -104,6 +104,10 @@ #define SONYPI_EVENT_ZOOM_IN_PRESSED 62 #define SONYPI_EVENT_ZOOM_OUT_PRESSED 63 #define SONYPI_EVENT_CD_EJECT_PRESSED 64 +#define SONYPI_EVENT_MODEKEY_PRESSED 65 +#define SONYPI_EVENT_PKEY_P4 66 +#define SONYPI_EVENT_PKEY_P5 67 +#define SONYPI_EVENT_SETTINGKEY_PRESSED 68 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3-71-gd317 From 1cae71032183776e833036fe828315dcd3444df1 Mon Sep 17 00:00:00 2001 From: Harald Jenny Date: Thu, 26 Mar 2009 21:58:18 +0900 Subject: sony-laptop: VGN-A317M hotkey support This laptop has 5 SPIC managed buttons above the keyboard: sound + and - as well as brightness, zoom and S1. Possibly the entire VGN-A serie behaves the same. Signed-off-by: Harald Jenny Signed-off-by: Mattia Dongili Signed-off-by: Len Brown --- drivers/platform/x86/sony-laptop.c | 22 ++++++++++++++++++++++ include/linux/sonypi.h | 3 +++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 3e45c65b8f81..4f932889569b 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -230,6 +230,9 @@ static int sony_laptop_input_index[] = { 53, /* 66 SONYPI_EVENT_PKEY_P4 */ 54, /* 67 SONYPI_EVENT_PKEY_P5 */ 55, /* 68 SONYPI_EVENT_SETTINGKEY_PRESSED */ + 56, /* 69 SONYPI_EVENT_VOLUME_INC_PRESSED */ + 57, /* 70 SONYPI_EVENT_VOLUME_DEC_PRESSED */ + -1, /* 71 SONYPI_EVENT_BRIGHTNESS_PRESSED */ }; static int sony_laptop_input_keycode_map[] = { @@ -289,6 +292,8 @@ static int sony_laptop_input_keycode_map[] = { KEY_PROG4, /* 53 SONYPI_EVENT_PKEY_P4 */ KEY_F14, /* 54 SONYPI_EVENT_PKEY_P5 */ KEY_F15, /* 55 SONYPI_EVENT_SETTINGKEY_PRESSED */ + KEY_VOLUMEUP, /* 56 SONYPI_EVENT_VOLUME_INC_PRESSED */ + KEY_VOLUMEDOWN, /* 57 SONYPI_EVENT_VOLUME_DEC_PRESSED */ }; /* release buttons after a short delay if pressed */ @@ -1555,6 +1560,7 @@ static struct sonypi_event sonypi_pkeyev[] = { { 0x01, SONYPI_EVENT_PKEY_P1 }, { 0x02, SONYPI_EVENT_PKEY_P2 }, { 0x04, SONYPI_EVENT_PKEY_P3 }, + { 0x20, SONYPI_EVENT_PKEY_P1 }, { 0, 0 } }; @@ -1598,6 +1604,7 @@ static struct sonypi_event sonypi_zoomev[] = { { 0x39, SONYPI_EVENT_ZOOM_PRESSED }, { 0x10, SONYPI_EVENT_ZOOM_IN_PRESSED }, { 0x20, SONYPI_EVENT_ZOOM_OUT_PRESSED }, + { 0x04, SONYPI_EVENT_ZOOM_PRESSED }, { 0, 0 } }; @@ -1628,6 +1635,19 @@ static struct sonypi_event sonypi_batteryev[] = { { 0, 0 } }; +/* The set of possible volume events */ +static struct sonypi_event sonypi_volumeev[] = { + { 0x01, SONYPI_EVENT_VOLUME_INC_PRESSED }, + { 0x02, SONYPI_EVENT_VOLUME_DEC_PRESSED }, + { 0, 0 } +}; + +/* The set of possible brightness events */ +static struct sonypi_event sonypi_brightnessev[] = { + { 0x80, SONYPI_EVENT_BRIGHTNESS_PRESSED }, + { 0, 0 } +}; + static struct sonypi_eventtypes type1_events[] = { { 0, 0xffffffff, sonypi_releaseev }, { 0x70, SONYPI_MEYE_MASK, sonypi_meyeev }, @@ -1668,6 +1688,8 @@ static struct sonypi_eventtypes type3_events[] = { { 0x05, SONYPI_PKEY_MASK, sonypi_pkeyev }, { 0x05, SONYPI_ZOOM_MASK, sonypi_zoomev }, { 0x05, SONYPI_CAPTURE_MASK, sonypi_captureev }, + { 
0x05, SONYPI_PKEY_MASK, sonypi_volumeev }, + { 0x05, SONYPI_PKEY_MASK, sonypi_brightnessev }, { 0 }, }; diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index bb835019ac7f..34c4475ac4a2 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -108,6 +108,9 @@ #define SONYPI_EVENT_PKEY_P4 66 #define SONYPI_EVENT_PKEY_P5 67 #define SONYPI_EVENT_SETTINGKEY_PRESSED 68 +#define SONYPI_EVENT_VOLUME_INC_PRESSED 69 +#define SONYPI_EVENT_VOLUME_DEC_PRESSED 70 +#define SONYPI_EVENT_BRIGHTNESS_PRESSED 71 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3-71-gd317 From 2b1c6bd77d4e6a727ffac8630cd154b2144b751a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Nov 2008 10:09:09 +0100 Subject: generic compat_sys_ustat Due to a different size of ino_t ustat needs a compat handler, but currently only x86 and mips provide one. Add a generic compat_sys_ustat and switch all architectures over to it. Instead of doing various user copy hacks compat_sys_ustat just reimplements sys_ustat as it's trivial. This was suggested by Arnd Bergmann. Found by Eric Sandeen when running xfstests/017 on ppc64, which causes stack smashing warnings on RHEL/Fedora due to the too large amount of data writen by the syscall. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/ia64/ia32/ia32_entry.S | 2 +- arch/mips/kernel/linux32.c | 34 ---------------------------------- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- arch/parisc/kernel/syscall_table.S | 2 +- arch/powerpc/include/asm/systbl.h | 2 +- arch/s390/kernel/compat_wrapper.S | 2 +- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/ia32/sys_ia32.c | 22 ---------------------- arch/x86/include/asm/ia32.h | 7 ------- arch/x86/include/asm/sys_ia32.h | 2 -- fs/compat.c | 28 ++++++++++++++++++++++++++++ include/linux/compat.h | 8 ++++++++ 13 files changed, 43 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/ia32/ia32_entry.S b/arch/ia64/ia32/ia32_entry.S index a46f8395e9a5..af9405cd70e5 100644 --- a/arch/ia64/ia32/ia32_entry.S +++ b/arch/ia64/ia32/ia32_entry.S @@ -240,7 +240,7 @@ ia32_syscall_table: data8 sys_ni_syscall data8 sys_umask /* 60 */ data8 sys_chroot - data8 sys_ustat + data8 compat_sys_ustat data8 sys_dup2 data8 sys_getppid data8 sys_getpgrp /* 65 */ diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 1a86f84fa947..784859cedef7 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -356,40 +356,6 @@ SYSCALL_DEFINE1(32_personality, unsigned long, personality) return ret; } -/* ustat compatibility */ -struct ustat32 { - compat_daddr_t f_tfree; - compat_ino_t f_tinode; - char f_fname[6]; - char f_fpack[6]; -}; - -extern asmlinkage long sys_ustat(dev_t dev, struct ustat __user * ubuf); - -SYSCALL_DEFINE2(32_ustat, dev_t, dev, struct ustat32 __user *, ubuf32) -{ - int err; - struct ustat tmp; - struct ustat32 tmp32; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - err = sys_ustat(dev, (struct ustat __user *)&tmp); - set_fs(old_fs); - - if (err) - goto out; - - memset(&tmp32, 0, sizeof(struct ustat32)); - tmp32.f_tfree = tmp.f_tfree; - tmp32.f_tinode = tmp.f_tinode; - - err = copy_to_user(ubuf32, &tmp32, sizeof(struct ustat32)) ? 
-EFAULT : 0; - -out: - return err; -} - SYSCALL_DEFINE4(32_sendfile, long, out_fd, long, in_fd, compat_off_t __user *, offset, s32, count) { diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 7438e92f8a01..f61d6b0e5731 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -253,7 +253,7 @@ EXPORT(sysn32_call_table) PTR compat_sys_utime /* 6130 */ PTR sys_mknod PTR sys_32_personality - PTR sys_32_ustat + PTR compat_sys_ustat PTR compat_sys_statfs PTR compat_sys_fstatfs /* 6135 */ PTR sys_sysfs diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index b0fef4ff9827..60997f1f69d4 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -265,7 +265,7 @@ sys_call_table: PTR sys_olduname PTR sys_umask /* 4060 */ PTR sys_chroot - PTR sys_32_ustat + PTR compat_sys_ustat PTR sys_dup2 PTR sys_getppid PTR sys_getpgrp /* 4065 */ diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index 303d2b647e41..03b9a01bc16c 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -130,7 +130,7 @@ ENTRY_OURS(newuname) ENTRY_SAME(umask) /* 60 */ ENTRY_SAME(chroot) - ENTRY_SAME(ustat) + ENTRY_COMP(ustat) ENTRY_SAME(dup2) ENTRY_SAME(getppid) ENTRY_SAME(getpgrp) /* 65 */ diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 72353f6070a4..fe166491e9dc 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -65,7 +65,7 @@ SYSCALL(ni_syscall) SYSX(sys_ni_syscall,sys_olduname, sys_olduname) COMPAT_SYS_SPU(umask) SYSCALL_SPU(chroot) -SYSCALL(ustat) +COMPAT_SYS(ustat) SYSCALL_SPU(dup2) SYSCALL_SPU(getppid) SYSCALL_SPU(getpgrp) diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 62c706eb0de6..87cf5a79a351 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -252,7 +252,7 @@ sys32_chroot_wrapper: sys32_ustat_wrapper: llgfr %r2,%r2 # dev_t llgtr %r3,%r3 # struct ustat * - jg sys_ustat + jg compat_sys_ustat .globl sys32_dup2_wrapper sys32_dup2_wrapper: diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 5a0d76dc56a4..8ef8876666b2 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -557,7 +557,7 @@ ia32_sys_call_table: .quad sys32_olduname .quad sys_umask /* 60 */ .quad sys_chroot - .quad sys32_ustat + .quad compat_sys_ustat .quad sys_dup2 .quad sys_getppid .quad sys_getpgrp /* 65 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 6c0d7f6231af..efac92fd1efb 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -638,28 +638,6 @@ long sys32_uname(struct old_utsname __user *name) return err ? 
-EFAULT : 0; } -long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) -{ - struct ustat u; - mm_segment_t seg; - int ret; - - seg = get_fs(); - set_fs(KERNEL_DS); - ret = sys_ustat(dev, (struct ustat __user *)&u); - set_fs(seg); - if (ret < 0) - return ret; - - if (!access_ok(VERIFY_WRITE, u32p, sizeof(struct ustat32)) || - __put_user((__u32) u.f_tfree, &u32p->f_tfree) || - __put_user((__u32) u.f_tinode, &u32p->f_tfree) || - __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || - __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) - ret = -EFAULT; - return ret; -} - asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs *regs) { diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 50ca486fd88c..1f7e62517284 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -129,13 +129,6 @@ typedef struct compat_siginfo { } _sifields; } compat_siginfo_t; -struct ustat32 { - __u32 f_tfree; - compat_ino_t f_tinode; - char f_fname[6]; - char f_fpack[6]; -}; - #define IA32_STACK_TOP IA32_PAGE_OFFSET #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index ffb08be2a530..72a6dcd1299b 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -70,8 +70,6 @@ struct old_utsname; asmlinkage long sys32_olduname(struct oldold_utsname __user *); long sys32_uname(struct old_utsname __user *); -long sys32_ustat(unsigned, struct ustat32 __user *); - asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, compat_uptr_t __user *, struct pt_regs *); asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); diff --git a/fs/compat.c b/fs/compat.c index d0145ca27572..4e0db94b5353 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -378,6 +378,34 @@ out: return error; } +/* + * This is a copy of sys_ustat, just dealing with a structure layout. + * Given how simple this syscall is that apporach is more maintainable + * than the various conversion hacks. 
+ */ +asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u) +{ + struct super_block *sb; + struct compat_ustat tmp; + struct kstatfs sbuf; + int err; + + sb = user_get_super(new_decode_dev(dev)); + if (!sb) + return -EINVAL; + err = vfs_statfs(sb->s_root, &sbuf); + drop_super(sb); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct compat_ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) + return -EFAULT; + return 0; +} + static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) { if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || diff --git a/include/linux/compat.h b/include/linux/compat.h index 3fd2194ff573..b880864672de 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -125,6 +125,13 @@ struct compat_dirent { char d_name[256]; }; +struct compat_ustat { + compat_daddr_t f_tfree; + compat_ino_t f_tinode; + char f_fname[6]; + char f_fpack[6]; +}; + typedef union compat_sigval { compat_int_t sival_int; compat_uptr_t sival_ptr; @@ -178,6 +185,7 @@ long compat_sys_semtimedop(int semid, struct sembuf __user *tsems, unsigned nsems, const struct compat_timespec __user *timeout); asmlinkage long compat_sys_keyctl(u32 option, u32 arg2, u32 arg3, u32 arg4, u32 arg5); +asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32); asmlinkage ssize_t compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen); -- cgit v1.2.3-71-gd317 From 10f303ae1e5e77a9f7cb053e6329906afb132c67 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Wed, 14 Jan 2009 17:01:33 +0800 Subject: do_pipe cleanup: drop its last user in arch/alpha/ The last user of do_pipe is in arch/alpha/, after replacing it with do_pipe_flags, the do_pipe can be totally dropped. Signed-off-by: Cheng Renquan Acked-by: Richard Henderson Signed-off-by: Al Viro --- arch/alpha/kernel/entry.S | 3 ++- arch/alpha/kernel/osf_sys.c | 2 -- fs/pipe.c | 5 ----- include/linux/fs.h | 1 - 4 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index e4a54b615894..b45d913a51c3 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -903,8 +903,9 @@ sys_alpha_pipe: stq $26, 0($sp) .prologue 0 + mov $31, $17 lda $16, 8($sp) - jsr $26, do_pipe + jsr $26, do_pipe_flags ldq $26, 0($sp) bne $0, 1f diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index ae41f097864b..42ee05981e71 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -46,8 +46,6 @@ #include #include -extern int do_pipe(int *); - /* * Brk needs to return an error. Still support Linux's brk(0) query idiom, * which OSF programs just shouldn't be doing. We're still not quite diff --git a/fs/pipe.c b/fs/pipe.c index 14f502b89cf5..df3719562fc1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1034,11 +1034,6 @@ int do_pipe_flags(int *fd, int flags) return error; } -int do_pipe(int *fd) -{ - return do_pipe_flags(fd, 0); -} - /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. 
diff --git a/include/linux/fs.h b/include/linux/fs.h index 92734c0012e6..51de83bd8a87 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1881,7 +1881,6 @@ static inline void allow_write_access(struct file *file) if (file) atomic_inc(&file->f_path.dentry->d_inode->i_writecount); } -extern int do_pipe(int *); extern int do_pipe_flags(int *, int); extern struct file *create_read_pipe(struct file *f, int flags); extern struct file *create_write_pipe(int flags); -- cgit v1.2.3-71-gd317 From c2aca5e529a2499d454c41e01f59f1d5fe4a1364 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 20 Jan 2009 10:29:45 +0000 Subject: vfs: Update fs.h to use inline functions when no file locking set This avoids various issues which might give rise to compiler warnings about missing functions and/or unused variable with the previous macros. This also fixes a bug where one of the macros was returning 0, but it should have been void. Reported-by: Randy Dunlap Signed-off-by: Steven Whitehouse Tested-by: Randy Dunlap Signed-off-by: Al Viro --- include/linux/fs.h | 165 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 139 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 51de83bd8a87..d84020b7e676 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1063,34 +1063,147 @@ extern int lease_modify(struct file_lock **, int); extern int lock_may_read(struct inode *, loff_t start, unsigned long count); extern int lock_may_write(struct inode *, loff_t start, unsigned long count); #else /* !CONFIG_FILE_LOCKING */ -#define fcntl_getlk(a, b) ({ -EINVAL; }) -#define fcntl_setlk(a, b, c, d) ({ -EACCES; }) +static inline int fcntl_getlk(struct file *file, struct flock __user *user) +{ + return -EINVAL; +} + +static inline int fcntl_setlk(unsigned int fd, struct file *file, + unsigned int cmd, struct flock __user *user) +{ + return -EACCES; +} + #if BITS_PER_LONG == 32 -#define fcntl_getlk64(a, b) ({ -EINVAL; }) -#define fcntl_setlk64(a, b, c, d) ({ -EACCES; }) +static inline int fcntl_getlk64(struct file *file, struct flock64 __user *user) +{ + return -EINVAL; +} + +static inline int fcntl_setlk64(unsigned int fd, struct file *file, + unsigned int cmd, struct flock64 __user *user) +{ + return -EACCES; +} #endif -#define fcntl_setlease(a, b, c) ({ 0; }) -#define fcntl_getlease(a) ({ 0; }) -#define locks_init_lock(a) ({ }) -#define __locks_copy_lock(a, b) ({ }) -#define locks_copy_lock(a, b) ({ }) -#define locks_remove_posix(a, b) ({ }) -#define locks_remove_flock(a) ({ }) -#define posix_test_lock(a, b) ({ 0; }) -#define posix_lock_file(a, b, c) ({ -ENOLCK; }) -#define posix_lock_file_wait(a, b) ({ -ENOLCK; }) -#define posix_unblock_lock(a, b) (-ENOENT) -#define vfs_test_lock(a, b) ({ 0; }) -#define vfs_lock_file(a, b, c, d) (-ENOLCK) -#define vfs_cancel_lock(a, b) ({ 0; }) -#define flock_lock_file_wait(a, b) ({ -ENOLCK; }) -#define __break_lease(a, b) ({ 0; }) -#define lease_get_mtime(a, b) ({ }) -#define generic_setlease(a, b, c) ({ -EINVAL; }) -#define vfs_setlease(a, b, c) ({ -EINVAL; }) -#define lease_modify(a, b) ({ -EINVAL; }) -#define lock_may_read(a, b, c) ({ 1; }) -#define lock_may_write(a, b, c) ({ 1; }) +static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +{ + return 0; +} + +static inline int fcntl_getlease(struct file *filp) +{ + return 0; +} + +static inline void locks_init_lock(struct file_lock *fl) +{ + return; +} + +static inline void __locks_copy_lock(struct file_lock *new, 
struct file_lock *fl) +{ + return; +} + +static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + return; +} + +static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) +{ + return; +} + +static inline void locks_remove_flock(struct file *filp) +{ + return; +} + +static inline void posix_test_lock(struct file *filp, struct file_lock *fl) +{ + return; +} + +static inline int posix_lock_file(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) +{ + return -ENOLCK; +} + +static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + return -ENOLCK; +} + +static inline int posix_unblock_lock(struct file *filp, + struct file_lock *waiter) +{ + return -ENOENT; +} + +static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) +{ + return 0; +} + +static inline int vfs_lock_file(struct file *filp, unsigned int cmd, + struct file_lock *fl, struct file_lock *conf) +{ + return -ENOLCK; +} + +static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) +{ + return 0; +} + +static inline int flock_lock_file_wait(struct file *filp, + struct file_lock *request) +{ + return -ENOLCK; +} + +static inline int __break_lease(struct inode *inode, unsigned int mode) +{ + return 0; +} + +static inline void lease_get_mtime(struct inode *inode, struct timespec *time) +{ + return; +} + +static inline int generic_setlease(struct file *filp, long arg, + struct file_lock **flp) +{ + return -EINVAL; +} + +static inline int vfs_setlease(struct file *filp, long arg, + struct file_lock **lease) +{ + return -EINVAL; +} + +static inline int lease_modify(struct file_lock **before, int arg) +{ + return -EINVAL; +} + +static inline int lock_may_read(struct inode *inode, loff_t start, + unsigned long len) +{ + return 1; +} + +static inline int lock_may_write(struct inode *inode, loff_t start, + unsigned long len) +{ + return 1; +} + #endif /* !CONFIG_FILE_LOCKING */ -- cgit v1.2.3-71-gd317 From af5df56688acfb75c1b15b4e000ec5e82a9cdc29 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 20 Jan 2009 10:29:46 +0000 Subject: vfs: Further changes from macro to inline function in fs.h There is a second set of macros for when CONFIG_FILE_LOCKING is not set. This patch updates those to become inline functions as well. 
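A contrived caller (not taken from the kernel tree) shows what the conversion buys: the old statement-expression macros discarded their arguments at preprocessing time, so wrongly typed parameters compiled silently and locals used only as macro arguments drew unused-variable warnings, whereas the typed static inline stubs keep prototypes checked even with CONFIG_FILE_LOCKING disabled:

#include <linux/fs.h>
#include <linux/sched.h>

static void example_drop_locks(struct file *filp)
{
	fl_owner_t owner = current->files;	/* only consumed by the stubs below */

	/* With "#define locks_remove_posix(a, b) ({ })" both arguments
	 * vanished, so 'owner' was reported as unused and a bogus type
	 * here would not have been diagnosed. */
	locks_remove_posix(filp, owner);
	locks_remove_flock(filp);
}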
Signed-off-by: Steven Whitehouse Signed-off-by: Al Viro --- include/linux/fs.h | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d84020b7e676..5f74d616cd7d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1800,13 +1800,44 @@ static inline int break_lease(struct inode *inode, unsigned int mode) return 0; } #else /* !CONFIG_FILE_LOCKING */ -#define locks_mandatory_locked(a) ({ 0; }) -#define locks_mandatory_area(a, b, c, d, e) ({ 0; }) -#define __mandatory_lock(a) ({ 0; }) -#define mandatory_lock(a) ({ 0; }) -#define locks_verify_locked(a) ({ 0; }) -#define locks_verify_truncate(a, b, c) ({ 0; }) -#define break_lease(a, b) ({ 0; }) +static inline int locks_mandatory_locked(struct inode *inode) +{ + return 0; +} + +static inline int locks_mandatory_area(int rw, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) +{ + return 0; +} + +static inline int __mandatory_lock(struct inode *inode) +{ + return 0; +} + +static inline int mandatory_lock(struct inode *inode) +{ + return 0; +} + +static inline int locks_verify_locked(struct inode *inode) +{ + return 0; +} + +static inline int locks_verify_truncate(struct inode *inode, struct file *filp, + size_t size) +{ + return 0; +} + +static inline int break_lease(struct inode *inode, unsigned int mode) +{ + return 0; +} + #endif /* CONFIG_FILE_LOCKING */ /* fs/open.c */ -- cgit v1.2.3-71-gd317 From e56980d451904b623573ef4966cbab768e433c79 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 11 Feb 2009 13:14:54 -0800 Subject: fs: make struct dentry->d_op const This change will allow for tagging many dentry_operations const in the source tree. 
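A minimal sketch of what the constification permits in an individual filesystem (the myfs_* names are hypothetical): the operations table can live in read-only data, and the usual assignment in ->lookup() still type-checks because ->d_op becomes a pointer to const.

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/namei.h>

static int myfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	return 1;				/* toy example: always valid */
}

static const struct dentry_operations myfs_dentry_ops = {
	.d_revalidate	= myfs_d_revalidate,
};

static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
				  struct nameidata *nd)
{
	dentry->d_op = &myfs_dentry_ops;	/* fine: d_op is const-qualified */
	d_add(dentry, NULL);			/* negative entry in this sketch */
	return NULL;
}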
Signed-off-by: Jan Engelhardt Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c66d22487bf8..15156364d196 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -112,7 +112,7 @@ struct dentry { struct list_head d_subdirs; /* our children */ struct list_head d_alias; /* inode alias list */ unsigned long d_time; /* used by d_revalidate */ - struct dentry_operations *d_op; + const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ void *d_fsdata; /* fs-specific data */ -- cgit v1.2.3-71-gd317 From f786aa90e026f2174bb0c26d49f338c5c46ede55 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Feb 2009 05:51:22 +0000 Subject: constify dentry_operations: NFS Signed-off-by: Al Viro --- fs/nfs/dir.c | 4 ++-- fs/nfs/nfs4_fs.h | 2 +- include/linux/nfs_fs.h | 2 +- include/linux/nfs_xdr.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 672368f865ca..78bf72fc1db3 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -899,7 +899,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) iput(inode); } -struct dentry_operations nfs_dentry_operations = { +const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, @@ -967,7 +967,7 @@ out: #ifdef CONFIG_NFS_V4 static int nfs_open_revalidate(struct dentry *, struct nameidata *); -struct dentry_operations nfs4_dentry_operations = { +const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs_open_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4e4d33204376..84345deab26f 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -179,7 +179,7 @@ struct nfs4_state_recovery_ops { int (*recover_lock)(struct nfs4_state *, struct file_lock *); }; -extern struct dentry_operations nfs4_dentry_operations; +extern const struct dentry_operations nfs4_dentry_operations; extern const struct inode_operations nfs4_dir_inode_operations; /* inode.c */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index db867b04ac3c..8cc8807f77d6 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -415,7 +415,7 @@ extern const struct inode_operations nfs_dir_inode_operations; extern const struct inode_operations nfs3_dir_inode_operations; #endif /* CONFIG_NFS_V3 */ extern const struct file_operations nfs_dir_operations; -extern struct dentry_operations nfs_dentry_operations; +extern const struct dentry_operations nfs_dentry_operations; extern void nfs_force_lookup_revalidate(struct inode *dir); extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2e5f00066afd..43a713fce11c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -785,7 +785,7 @@ struct nfs_access_entry; */ struct nfs_rpc_ops { u32 version; /* Protocol version */ - struct dentry_operations *dentry_ops; + const struct dentry_operations *dentry_ops; const struct inode_operations *dir_inode_ops; const struct inode_operations *file_inode_ops; -- cgit v1.2.3-71-gd317 From e16404ed0f3f330dc3e99b95cef69bb60bcd27f7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 
Feb 2009 05:55:13 +0000 Subject: constify dentry_operations: misc filesystems Signed-off-by: Al Viro --- fs/adfs/adfs.h | 2 +- fs/adfs/dir.c | 2 +- fs/affs/affs.h | 3 +-- fs/affs/namei.c | 4 ++-- fs/coda/dir.c | 2 +- fs/hfs/hfs_fs.h | 2 +- fs/hfs/sysdep.c | 2 +- fs/hfsplus/hfsplus_fs.h | 2 +- fs/hfsplus/inode.c | 2 +- fs/hostfs/hostfs_kern.c | 4 ++-- fs/hpfs/dentry.c | 2 +- fs/isofs/inode.c | 2 +- fs/ncpfs/dir.c | 4 ++-- fs/reiserfs/xattr.c | 2 +- fs/smbfs/dir.c | 4 ++-- fs/sysv/namei.c | 2 +- fs/sysv/sysv.h | 2 +- include/linux/ncp_fs.h | 2 +- 18 files changed, 22 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 831157502d5a..e0a85dbeeb88 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -86,7 +86,7 @@ void __adfs_error(struct super_block *sb, const char *function, /* dir_*.c */ extern const struct inode_operations adfs_dir_inode_operations; extern const struct file_operations adfs_dir_operations; -extern struct dentry_operations adfs_dentry_operations; +extern const struct dentry_operations adfs_dentry_operations; extern struct adfs_dir_ops adfs_f_dir_ops; extern struct adfs_dir_ops adfs_fplus_dir_ops; diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 85a30e929800..e867ccf37246 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -263,7 +263,7 @@ adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name) return 0; } -struct dentry_operations adfs_dentry_operations = { +const struct dentry_operations adfs_dentry_operations = { .d_hash = adfs_hash, .d_compare = adfs_compare, }; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index e9ec915f7553..1a2d5e3c7f4e 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -199,8 +199,7 @@ extern const struct address_space_operations affs_symlink_aops; extern const struct address_space_operations affs_aops; extern const struct address_space_operations affs_aops_ofs; -extern struct dentry_operations affs_dentry_operations; -extern struct dentry_operations affs_dentry_operations_intl; +extern const struct dentry_operations affs_dentry_operations; static inline void affs_set_blocksize(struct super_block *sb, int size) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index cfcf1b6cf82b..960d336ec694 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -19,12 +19,12 @@ static int affs_intl_toupper(int ch); static int affs_intl_hash_dentry(struct dentry *, struct qstr *); static int affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *); -struct dentry_operations affs_dentry_operations = { +const struct dentry_operations affs_dentry_operations = { .d_hash = affs_hash_dentry, .d_compare = affs_compare_dentry, }; -static struct dentry_operations affs_intl_dentry_operations = { +static const struct dentry_operations affs_intl_dentry_operations = { .d_hash = affs_intl_hash_dentry, .d_compare = affs_intl_compare_dentry, }; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 75b1fa90b2cb..4bb9d0a5decc 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -59,7 +59,7 @@ static int coda_return_EIO(void) } #define CODA_EIO_ERROR ((void *) (coda_return_EIO)) -static struct dentry_operations coda_dentry_operations = +static const struct dentry_operations coda_dentry_operations = { .d_revalidate = coda_dentry_revalidate, .d_delete = coda_dentry_delete, diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 9955232fdf8c..052387e11671 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -213,7 +213,7 @@ extern void hfs_mdb_put(struct super_block *); extern int hfs_part_find(struct super_block 
*, sector_t *, sector_t *); /* string.c */ -extern struct dentry_operations hfs_dentry_operations; +extern const struct dentry_operations hfs_dentry_operations; extern int hfs_hash_dentry(struct dentry *, struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 5bf89ec01cd4..7478f5c219aa 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -31,7 +31,7 @@ static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) return 1; } -struct dentry_operations hfs_dentry_operations = +const struct dentry_operations hfs_dentry_operations = { .d_revalidate = hfs_revalidate_dentry, .d_hash = hfs_hash_dentry, diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index f027a905225f..5c10d803d9df 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -327,7 +327,7 @@ void hfsplus_file_truncate(struct inode *); /* inode.c */ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; -extern struct dentry_operations hfsplus_dentry_operations; +extern const struct dentry_operations hfsplus_dentry_operations; void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index f105ee9e1cc4..1bcf597c0562 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -137,7 +137,7 @@ const struct address_space_operations hfsplus_aops = { .writepages = hfsplus_writepages, }; -struct dentry_operations hfsplus_dentry_operations = { +const struct dentry_operations hfsplus_dentry_operations = { .d_hash = hfsplus_hash_dentry, .d_compare = hfsplus_compare_dentry, }; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5c538e0ec14b..fe02ad4740e7 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -31,12 +31,12 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) #define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) -int hostfs_d_delete(struct dentry *dentry) +static int hostfs_d_delete(struct dentry *dentry) { return 1; } -struct dentry_operations hostfs_dentry_ops = { +static const struct dentry_operations hostfs_dentry_ops = { .d_delete = hostfs_d_delete, }; diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 08319126b2af..940d6d150bee 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -49,7 +49,7 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst return 0; } -static struct dentry_operations hpfs_dentry_operations = { +static const struct dentry_operations hpfs_dentry_operations = { .d_hash = hpfs_hash_dentry, .d_compare = hpfs_compare_dentry, }; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 6147ec3643a0..13d2eddd0692 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -114,7 +114,7 @@ static const struct super_operations isofs_sops = { }; -static struct dentry_operations isofs_dentry_ops[] = { +static const struct dentry_operations isofs_dentry_ops[] = { { .d_hash = isofs_hash, .d_compare = isofs_dentry_cmp, diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 07e9715b8658..9c590722d87e 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -79,7 +79,7 @@ static int ncp_hash_dentry(struct dentry *, struct qstr *); static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); static int ncp_delete_dentry(struct dentry *); -static struct dentry_operations ncp_dentry_operations = 
+static const struct dentry_operations ncp_dentry_operations = { .d_revalidate = ncp_lookup_validate, .d_hash = ncp_hash_dentry, @@ -87,7 +87,7 @@ static struct dentry_operations ncp_dentry_operations = .d_delete = ncp_delete_dentry, }; -struct dentry_operations ncp_root_dentry_operations = +const struct dentry_operations ncp_root_dentry_operations = { .d_hash = ncp_hash_dentry, .d_compare = ncp_compare_dentry, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index ad92461cbfc3..ae881ccd2f03 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1136,7 +1136,7 @@ xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) return 1; } -static struct dentry_operations xattr_lookup_poison_ops = { +static const struct dentry_operations xattr_lookup_poison_ops = { .d_compare = xattr_lookup_poison, }; diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index e7ddd0328ddc..3e4803b4427e 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -277,7 +277,7 @@ static int smb_hash_dentry(struct dentry *, struct qstr *); static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *); static int smb_delete_dentry(struct dentry *); -static struct dentry_operations smbfs_dentry_operations = +static const struct dentry_operations smbfs_dentry_operations = { .d_revalidate = smb_lookup_validate, .d_hash = smb_hash_dentry, @@ -285,7 +285,7 @@ static struct dentry_operations smbfs_dentry_operations = .d_delete = smb_delete_dentry, }; -static struct dentry_operations smbfs_dentry_operations_case = +static const struct dentry_operations smbfs_dentry_operations_case = { .d_revalidate = smb_lookup_validate, .d_delete = smb_delete_dentry, diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index a1f1ef33e81c..33e047b59b8d 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -38,7 +38,7 @@ static int sysv_hash(struct dentry *dentry, struct qstr *qstr) return 0; } -struct dentry_operations sysv_dentry_operations = { +const struct dentry_operations sysv_dentry_operations = { .d_hash = sysv_hash, }; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 38ebe3f85b3d..5784a318c883 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -170,7 +170,7 @@ extern const struct file_operations sysv_file_operations; extern const struct file_operations sysv_dir_operations; extern const struct address_space_operations sysv_aops; extern const struct super_operations sysv_sops; -extern struct dentry_operations sysv_dentry_operations; +extern const struct dentry_operations sysv_dentry_operations; enum { diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h index f69e66d151cc..30b06c893944 100644 --- a/include/linux/ncp_fs.h +++ b/include/linux/ncp_fs.h @@ -204,7 +204,7 @@ void ncp_update_inode2(struct inode *, struct ncp_entry_info *); /* linux/fs/ncpfs/dir.c */ extern const struct inode_operations ncp_dir_inode_operations; extern const struct file_operations ncp_dir_operations; -extern struct dentry_operations ncp_root_dentry_operations; +extern const struct dentry_operations ncp_root_dentry_operations; int ncp_conn_logged_in(struct super_block *); int ncp_date_dos2unix(__le16 time, __le16 date); void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); -- cgit v1.2.3-71-gd317 From 585d3bc06f4ca57f975a5a1f698f65a45ea66225 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 25 Feb 2009 10:44:19 +0100 Subject: fs: move bdev code out of buffer.c Move some block device related code out from buffer.c and put it in block_dev.c. 
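For orientation, a hedged sketch of how a kernel-side caller (for instance a snapshot driver; the function name is hypothetical) uses the freeze/thaw helpers being moved, with error handling kept minimal:

#include <linux/err.h>
#include <linux/fs.h>

static int example_snapshot(struct block_device *bdev)
{
	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* sync and block new writers */
	if (IS_ERR(sb))
		return PTR_ERR(sb);

	/* ... capture the snapshot while the filesystem is quiescent ... */

	return thaw_bdev(bdev, sb);	/* resume normal operation */
}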
I'm trying to move non-buffer_head code out of buffer.c Signed-off-by: Al Viro --- fs/block_dev.c | 146 ++++++++++++++++++++++++++++++++++++++++++++ fs/buffer.c | 145 ------------------------------------------- include/linux/buffer_head.h | 7 --- include/linux/fs.h | 7 +++ 4 files changed, 153 insertions(+), 152 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index b3c1efff5e1d..8c3c6899ccf3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -174,6 +175,151 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, iov, offset, nr_segs, blkdev_get_blocks, NULL); } +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + int ret = 0; + + if (bdev) + ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); + return ret; +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + if (sb) { + int res = fsync_super(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +} + +/** + * freeze_bdev -- lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * This takes the block device bd_mount_sem to make sure no new mounts + * happen on bdev until thaw_bdev() is called. + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. + */ +struct super_block *freeze_bdev(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + bdev->bd_fsfreeze_count++; + sb = get_super(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + bdev->bd_fsfreeze_count++; + + down(&bdev->bd_mount_sem); + sb = get_super(bdev); + if (sb && !(sb->s_flags & MS_RDONLY)) { + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + __fsync_super(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + drop_super(sb); + up(&bdev->bd_mount_sem); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + } + } + + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + + return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ +} +EXPORT_SYMBOL(freeze_bdev); + +/** + * thaw_bdev -- unlock filesystem + * @bdev: blockdevice to unlock + * @sb: associated superblock + * + * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 
+ */ +int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EINVAL; + } + + bdev->bd_fsfreeze_count--; + if (bdev->bd_fsfreeze_count > 0) { + if (sb) + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + + if (sb) { + BUG_ON(sb->s_bdev != bdev); + if (!(sb->s_flags & MS_RDONLY)) { + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } + } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + } + drop_super(sb); + } + + up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; +} +EXPORT_SYMBOL(thaw_bdev); + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); diff --git a/fs/buffer.c b/fs/buffer.c index 891e1c78e4f1..a2fd743d97cb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -165,151 +165,6 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } -/* - * Write out and wait upon all the dirty data associated with a block - * device via its mapping. Does not take the superblock lock. - */ -int sync_blockdev(struct block_device *bdev) -{ - int ret = 0; - - if (bdev) - ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); - return ret; -} -EXPORT_SYMBOL(sync_blockdev); - -/* - * Write out and wait upon all dirty data associated with this - * device. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_bdev(struct block_device *bdev) -{ - struct super_block *sb = get_super(bdev); - if (sb) { - int res = fsync_super(sb); - drop_super(sb); - return res; - } - return sync_blockdev(bdev); -} - -/** - * freeze_bdev -- lock a filesystem and force it into a consistent state - * @bdev: blockdevice to lock - * - * This takes the block device bd_mount_sem to make sure no new mounts - * happen on bdev until thaw_bdev() is called. - * If a superblock is found on this device, we take the s_umount semaphore - * on it to make sure nobody unmounts until the snapshot creation is done. - * The reference counter (bd_fsfreeze_count) guarantees that only the last - * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze - * actually. 
- */ -struct super_block *freeze_bdev(struct block_device *bdev) -{ - struct super_block *sb; - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - bdev->bd_fsfreeze_count++; - sb = get_super(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } - bdev->bd_fsfreeze_count++; - - down(&bdev->bd_mount_sem); - sb = get_super(bdev); - if (sb && !(sb->s_flags & MS_RDONLY)) { - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - __fsync_super(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - drop_super(sb); - up(&bdev->bd_mount_sem); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } - } - } - - sync_blockdev(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - - return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ -} -EXPORT_SYMBOL(freeze_bdev); - -/** - * thaw_bdev -- unlock filesystem - * @bdev: blockdevice to unlock - * @sb: associated superblock - * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). - */ -int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return -EINVAL; - } - - bdev->bd_fsfreeze_count--; - if (bdev->bd_fsfreeze_count > 0) { - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; - } - - if (sb) { - BUG_ON(sb->s_bdev != bdev); - if (!(sb->s_flags & MS_RDONLY)) { - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); - } - drop_super(sb); - } - - up(&bdev->bd_mount_sem); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; -} -EXPORT_SYMBOL(thaw_bdev); - /* * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. 
To get around this, diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index bd7ac793be19..f19fd9045ea0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -165,15 +165,8 @@ int sync_mapping_buffers(struct address_space *mapping); void unmap_underlying_metadata(struct block_device *bdev, sector_t block); void mark_buffer_async_write(struct buffer_head *bh); -void invalidate_bdev(struct block_device *); -int sync_blockdev(struct block_device *bdev); void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); -int fsync_bdev(struct block_device *); -struct super_block *freeze_bdev(struct block_device *); -int thaw_bdev(struct block_device *, struct super_block *); -int fsync_super(struct super_block *); -int fsync_no_super(struct block_device *); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); struct buffer_head *__getblk(struct block_device *bdev, sector_t block, diff --git a/include/linux/fs.h b/include/linux/fs.h index 5f74d616cd7d..c2c4454a268a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1874,6 +1874,13 @@ extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern struct block_device *open_by_devnum(dev_t, fmode_t); +extern void invalidate_bdev(struct block_device *); +extern int sync_blockdev(struct block_device *bdev); +extern struct super_block *freeze_bdev(struct block_device *); +extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); +extern int fsync_bdev(struct block_device *); +extern int fsync_super(struct super_block *); +extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} #endif -- cgit v1.2.3-71-gd317 From a3ec947c85ec339884b30ef6a08133e9311fdae1 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 4 Mar 2009 12:06:34 -0800 Subject: vfs: simple_set_mnt() should return void simple_set_mnt() is defined as returning 'int' but always returns 0. Callers assume simple_set_mnt() never fails and don't properly clean up if it were to _ever_ fail. For instance, get_sb_single() and get_sb_nodev() should: up_write(sb->s_umount); deactivate_super(sb); if simple_set_mnt() fails. Since simple_set_mnt() never fails, it would be cleaner if it did not return anything.
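The resulting call-site pattern, as a hedged sketch (myfs_get_sb is a hypothetical ->get_sb() implementation with most setup elided):

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/mount.h>

static int myfs_get_sb(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data, struct vfsmount *mnt)
{
	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);

	if (IS_ERR(s))
		return PTR_ERR(s);
	/* ... fill the superblock, including s->s_root ... */
	s->s_flags |= MS_ACTIVE;
	simple_set_mnt(mnt, s);		/* was: return simple_set_mnt(mnt, s); */
	return 0;
}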
[akpm@linux-foundation.org: fix build] Signed-off-by: Sukadev Bhattiprolu Acked-by: Serge Hallyn Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- drivers/mtd/mtdsuper.c | 7 +++++-- fs/9p/vfs_super.c | 5 +++-- fs/cifs/cifsfs.c | 3 ++- fs/devpts/inode.c | 3 ++- fs/libfs.c | 3 ++- fs/namespace.c | 3 +-- fs/proc/root.c | 3 ++- fs/super.c | 9 ++++++--- fs/ubifs/super.c | 3 ++- include/linux/fs.h | 2 +- kernel/cgroup.c | 3 ++- 11 files changed, 28 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index 00d46e137b2a..92285d0089c2 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -81,13 +81,16 @@ static int get_sb_mtd_aux(struct file_system_type *fs_type, int flags, /* go */ sb->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + + return 0; /* new mountpoint for an already mounted superblock */ already_mounted: DEBUG(1, "MTDSB: Device %d (\"%s\") is already mounted\n", mtd->index, mtd->name); - ret = simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + ret = 0; goto out_put; out_error: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 93212e40221a..5f8ab8adb5f5 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -168,8 +168,9 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, p9stat_free(st); kfree(st); -P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n"); - return simple_set_mnt(mnt, sb); +P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + simple_set_mnt(mnt, sb); + return 0; release_sb: if (sb) { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 13ea53251dcf..38491fd3871d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -606,7 +606,8 @@ cifs_get_sb(struct file_system_type *fs_type, return rc; } sb->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + return 0; } static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 140b43144cd8..b0a76340a4cd 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -454,7 +454,8 @@ static int get_init_pts_sb(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; } do_remount_sb(s, flags, data, 0); - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; } /* diff --git a/fs/libfs.c b/fs/libfs.c index ec600bd33e75..4910a36f516e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -242,7 +242,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; Enomem: up_write(&s->s_umount); diff --git a/fs/namespace.c b/fs/namespace.c index 06f8e63f6cb1..2432ca6bb223 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -397,11 +397,10 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt) spin_unlock(&vfsmount_lock); } -int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) +void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) { mnt->mnt_sb = sb; mnt->mnt_root = dget(sb->s_root); - return 0; } EXPORT_SYMBOL(simple_set_mnt); diff --git a/fs/proc/root.c b/fs/proc/root.c index f6299a25594e..1e15a2b176e8 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -83,7 +83,8 @@ static int proc_get_sb(struct file_system_type *fs_type, ns->proc_mnt = mnt; } - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + return 0; } static void 
proc_kill_sb(struct super_block *sb) diff --git a/fs/super.c b/fs/super.c index 6ce501447ada..e512fab64c93 100644 --- a/fs/super.c +++ b/fs/super.c @@ -831,7 +831,8 @@ int get_sb_bdev(struct file_system_type *fs_type, bdev->bd_super = s; } - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; error_s: error = PTR_ERR(s); @@ -877,7 +878,8 @@ int get_sb_nodev(struct file_system_type *fs_type, return error; } s->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; } EXPORT_SYMBOL(get_sb_nodev); @@ -909,7 +911,8 @@ int get_sb_single(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; } do_remount_sb(s, flags, data, 0); - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; } EXPORT_SYMBOL(get_sb_single); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1182b66a5491..c5c98355459a 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2034,7 +2034,8 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + return 0; out_deact: up_write(&sb->s_umount); diff --git a/include/linux/fs.h b/include/linux/fs.h index c2c4454a268a..a7d73914a9f7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1719,7 +1719,7 @@ struct super_block *sget(struct file_system_type *type, extern int get_sb_pseudo(struct file_system_type *, char *, const struct super_operations *ops, unsigned long, struct vfsmount *mnt); -extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); +extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); int __put_super_and_need_restart(struct super_block *sb); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b01100ebd074..c500ca7239b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1071,7 +1071,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_unlock(&cgroup_mutex); } - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + return 0; free_cg_links: free_cg_links(&tmp_cg_links); -- cgit v1.2.3-71-gd317 From 03a971a2899886006f19f3495973bbd646d8bdae Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 3 Dec 2008 18:00:38 +0000 Subject: thermal: support forcing support for passive cooling Due to poor thermal design or Linux driving hardware outside its thermal envelope, some systems will reach critical temperature and shut down under high load. This patch adds support for forcing a polling-based passive trip point if the firmware doesn't provide one. The assumption is made that the processor is the most practical means to reduce the dynamic heat generation, so hitting the passive thermal limit will cause the CPU to be throttled until the temperature stabilises around the defined value. UI is provided via a "passive" sysfs entry in the thermal zone directory. It accepts a decimal value in millidegrees Celsius, or "0" to disable the functionality. Default behaviour is for this functionality to be disabled.
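From userspace the new attribute is driven with a plain write, for example (a sketch; the zone index and the 90 degree value are arbitrary, and the path assumes the usual /sys/class/thermal layout):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/class/thermal/thermal_zone0/passive", "w");

	if (!f)
		return 1;
	fprintf(f, "%d\n", 90000);	/* force a passive trip at 90 C; write 0 to disable */
	return fclose(f) ? 1 : 0;
}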
Signed-off-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/thermal/thermal_sys.c | 77 +++++++++++++++++++++++++++++++++++++++++++ include/linux/thermal.h | 1 + 2 files changed, 78 insertions(+) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 6378741882f3..d0b093b66adc 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -214,9 +214,69 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%ld\n", temperature); } +static ssize_t +passive_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct thermal_zone_device *tz = to_thermal_zone(dev); + struct thermal_cooling_device *cdev = NULL; + int state; + + if (!sscanf(buf, "%d\n", &state)) + return -EINVAL; + + if (state && !tz->forced_passive) { + mutex_lock(&thermal_list_lock); + list_for_each_entry(cdev, &thermal_cdev_list, node) { + if (!strncmp("Processor", cdev->type, + sizeof("Processor"))) + thermal_zone_bind_cooling_device(tz, + THERMAL_TRIPS_NONE, + cdev); + } + mutex_unlock(&thermal_list_lock); + } else if (!state && tz->forced_passive) { + mutex_lock(&thermal_list_lock); + list_for_each_entry(cdev, &thermal_cdev_list, node) { + if (!strncmp("Processor", cdev->type, + sizeof("Processor"))) + thermal_zone_unbind_cooling_device(tz, + THERMAL_TRIPS_NONE, + cdev); + } + mutex_unlock(&thermal_list_lock); + } + + tz->tc1 = 1; + tz->tc2 = 1; + + if (!tz->passive_delay) + tz->passive_delay = 1000; + + if (!tz->polling_delay) + tz->polling_delay = 10000; + + tz->forced_passive = state; + + thermal_zone_device_update(tz); + + return count; +} + +static ssize_t +passive_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct thermal_zone_device *tz = to_thermal_zone(dev); + + return sprintf(buf, "%d\n", tz->forced_passive); +} + static DEVICE_ATTR(type, 0444, type_show, NULL); static DEVICE_ATTR(temp, 0444, temp_show, NULL); static DEVICE_ATTR(mode, 0644, mode_show, mode_store); +static DEVICE_ATTR(passive, S_IRUGO | S_IWUSR, passive_show, \ + passive_store); static struct device_attribute trip_point_attrs[] = { __ATTR(trip_point_0_type, 0444, trip_point_type_show, NULL), @@ -939,6 +999,11 @@ void thermal_zone_device_update(struct thermal_zone_device *tz) break; } } + + if (tz->forced_passive) + thermal_zone_device_passive(tz, temp, tz->forced_passive, + THERMAL_TRIPS_NONE); + tz->last_temperature = temp; if (tz->passive) thermal_zone_device_set_polling(tz, tz->passive_delay); @@ -977,8 +1042,10 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, { struct thermal_zone_device *tz; struct thermal_cooling_device *pos; + enum thermal_trip_type trip_type; int result; int count; + int passive = 0; if (strlen(type) >= THERMAL_NAME_LENGTH) return ERR_PTR(-EINVAL); @@ -1041,8 +1108,18 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, TRIP_POINT_ATTR_ADD(&tz->device, count, result); if (result) goto unregister; + tz->ops->get_trip_type(tz, count, &trip_type); + if (trip_type == THERMAL_TRIP_PASSIVE) + passive = 1; } + if (!passive) + result = device_create_file(&tz->device, + &dev_attr_passive); + + if (result) + goto unregister; + result = thermal_add_hwmon_sysfs(tz); if (result) goto unregister; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index a81c61521ba4..1de8b9eb841b 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -113,6 +113,7 @@ struct thermal_zone_device { 
int polling_delay; int last_temperature; bool passive; + unsigned int forced_passive; struct thermal_zone_device_ops *ops; struct list_head cooling_devices; struct idr idr; -- cgit v1.2.3-71-gd317 From cc0be3227df9146968311308a9d19db1469ce1db Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Fri, 27 Mar 2009 15:55:36 -0700 Subject: net: Add missing include into include/linux/netdevice.h The inline function skb_gro_mac_header defined in include/linux/netdevice.h makes use of page_address(). Depending on configuration options, the latter is either defined as a macro or is declared as a function in another header file, namely include/linux/mm.h. However, include/linux/netdevice.h does not include include/linux/mm.h. On MIPS, this has produced the following build error: CC kernel/sysctl_check.o In file included from include/linux/icmpv6.h:173, from include/linux/ipv6.h:208, from include/net/ip_vs.h:26, from kernel/sysctl_check.c:6: include/linux/netdevice.h: In function 'skb_gro_mac_header': include/linux/netdevice.h:1132: error: implicit declaration of function 'page_address' include/linux/netdevice.h:1133: warning: pointer/integer type mismatch in conditional expression make[1]: *** [kernel/sysctl_check.o] Error 1 make: *** [kernel] Error 2 The patch adds the missing include and fixes the build error. Signed-off-by: Dmitri Vorobiev Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be3ebd7e8ce5..1b55952a17f6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -32,6 +32,7 @@ #ifdef __KERNEL__ #include #include +#include #include #include #include -- cgit v1.2.3-71-gd317 From 8fdc621dc743b87879ccf0177969864b09388d9a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 14 Mar 2009 09:34:01 +0100 Subject: nl80211: export supported commands This makes nl80211 export the supported commands (command groups) per wiphy so userspace has an idea what it can do -- this will be required reading for userspace when we introduce auth/assoc /or/ connect for older hardware that cannot separate auth and assoc. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 6 ++++++ net/wireless/nl80211.c | 27 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index f33aa08dd9b3..3700d927e245 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -349,6 +349,10 @@ enum nl80211_commands { * @NL80211_ATTR_REG_TYPE: indicates the type of the regulatory domain currently * set. This can be one of the nl80211_reg_type (%NL80211_REGDOM_TYPE_*) * + * @NL80211_ATTR_SUPPORTED_COMMANDS: wiphy attribute that specifies + * an array of command numbers (i.e. a mapping index to command number) + * that the driver for the given wiphy supports. 
+ * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -426,6 +430,8 @@ enum nl80211_attrs { NL80211_ATTR_REG_INITIATOR, NL80211_ATTR_REG_TYPE, + NL80211_ATTR_SUPPORTED_COMMANDS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ab9d8f14e151..58ee1b1aff89 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -131,6 +131,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, struct nlattr *nl_freqs, *nl_freq; struct nlattr *nl_rates, *nl_rate; struct nlattr *nl_modes; + struct nlattr *nl_cmds; enum ieee80211_band band; struct ieee80211_channel *chan; struct ieee80211_rate *rate; @@ -242,6 +243,32 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, } nla_nest_end(msg, nl_bands); + nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS); + if (!nl_cmds) + goto nla_put_failure; + + i = 0; +#define CMD(op, n) \ + do { \ + if (dev->ops->op) { \ + i++; \ + NLA_PUT_U32(msg, i, NL80211_CMD_ ## n); \ + } \ + } while (0) + + CMD(add_virtual_intf, NEW_INTERFACE); + CMD(change_virtual_intf, SET_INTERFACE); + CMD(add_key, NEW_KEY); + CMD(add_beacon, NEW_BEACON); + CMD(add_station, NEW_STATION); + CMD(add_mpath, NEW_MPATH); + CMD(set_mesh_params, SET_MESH_PARAMS); + CMD(change_bss, SET_BSS); + CMD(set_mgmt_extra_ie, SET_MGMT_EXTRA_IE); + +#undef CMD + nla_nest_end(msg, nl_cmds); + return genlmsg_end(msg, hdr); nla_put_failure: -- cgit v1.2.3-71-gd317 From 3f46b29cd8caa35fcbc46e254a5abeee4e0e9e2f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 14 Mar 2009 19:10:51 +0100 Subject: ieee80211: document DS bit usage I keep needing this because I'm too stupid to remember it. Everybody else can probably remember, but who knows :) Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b1bb817d1427..382387e75b89 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -18,6 +18,22 @@ #include #include +/* + * DS bit usage + * + * TA = transmitter address + * RA = receiver address + * DA = destination address + * SA = source address + * + * ToDS FromDS A1(RA) A2(TA) A3 A4 Use + * ----------------------------------------------------------------- + * 0 0 DA SA BSSID - IBSS/DLS + * 0 1 DA BSSID SA - AP -> STA + * 1 0 BSSID SA DA - AP <- STA + * 1 1 RA TA DA SA unspecified (WDS) + */ + #define FCS_LEN 4 #define IEEE80211_FCTL_VERS 0x0003 -- cgit v1.2.3-71-gd317 From 6039f6d23fe792d615da5449e9fa1c6b43caacf6 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 19 Mar 2009 13:39:21 +0200 Subject: nl80211: Event notifications for MLME events Add new nl80211 event notifications (and a new multicast group, "mlme") for informing user space about received and processed Authentication, (Re)Association Response, Deauthentication, and Disassociation frames in station and IBSS modes (i.e., MLME SAP interface primitives MLME-AUTHENTICATE.confirm, MLME-ASSOCIATE.confirm, MLME-REASSOCIATE.confirm, MLME-DEAUTHENTICATE.indicate, and MLME-DISASSOCIATE.indication). The event data is encapsulated as the 802.11 management frame since we already have the frame in that format and it includes all the needed information. 
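On the reporting side, a hedged sketch (mydrv_rx_auth is a hypothetical fullmac driver handler, not part of this patch): once the driver's MLME has accepted an Authentication frame, it passes the whole frame, header plus body with the FCS stripped, to cfg80211, which then emits NL80211_CMD_AUTHENTICATE to the "mlme" multicast group:

#include <linux/ieee80211.h>
#include <linux/netdevice.h>
#include <net/cfg80211.h>

static void mydrv_rx_auth(struct net_device *dev,
			  const struct ieee80211_mgmt *mgmt, size_t len)
{
	/* 'len' covers the 802.11 header and body, FCS already removed */
	cfg80211_send_rx_auth(dev, (const u8 *)mgmt, len);
}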
This is the initial step in providing MLME SAP interface for authentication and association with nl80211. In other words, kernel code will act as the MLME and a user space application can control it as the SME. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/nl80211.h | 36 ++++++++++++++++++++++++- include/net/cfg80211.h | 46 +++++++++++++++++++++++++++++++ net/mac80211/mlme.c | 9 +++++-- net/wireless/Makefile | 2 +- net/wireless/mlme.c | 46 +++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.h | 12 +++++++++ 7 files changed, 219 insertions(+), 4 deletions(-) create mode 100644 net/wireless/mlme.c (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 3700d927e245..5ce68ae8314e 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -161,6 +161,25 @@ * %NL80211_REG_TYPE_COUNTRY the alpha2 to which we have moved on * to (%NL80211_ATTR_REG_ALPHA2). * + * @NL80211_CMD_AUTHENTICATE: authentication notification (on the "mlme" + * multicast group). This event reports reception of an Authentication + * frame in station and IBSS modes when the local MLME processed the + * frame, i.e., it was for the local STA and was received in correct + * state. This is similar to MLME-AUTHENTICATE.confirm primitive in the + * MLME SAP interface (kernel providing MLME, userspace SME). The + * included NL80211_ATTR_FRAME attribute contains the management frame + * (including both the header and frame body, but not FCS). + * @NL80211_CMD_ASSOCIATE: association notification; like + * NL80211_CMD_AUTHENTICATE but for Association Response and Reassociation + * Response frames (similar to MLME-ASSOCIATE.confirm or + * MLME-REASSOCIATE.confirm primitives). + * @NL80211_CMD_DEAUTHENTICATE: deauthentication notification; like + * NL80211_CMD_AUTHENTICATE but for Deauthentication frames (similar to + * MLME-DEAUTHENTICATE.indication primitive). + * @NL80211_CMD_DISASSOCIATE: disassociation notification; like + * NL80211_CMD_AUTHENTICATE but for Disassociation frames (similar to + * MLME-DISASSOCIATE.indication primitive). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -217,6 +236,11 @@ enum nl80211_commands { NL80211_CMD_REG_CHANGE, + NL80211_CMD_AUTHENTICATE, + NL80211_CMD_ASSOCIATE, + NL80211_CMD_DEAUTHENTICATE, + NL80211_CMD_DISASSOCIATE, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -230,8 +254,11 @@ enum nl80211_commands { */ #define NL80211_CMD_SET_BSS NL80211_CMD_SET_BSS #define NL80211_CMD_SET_MGMT_EXTRA_IE NL80211_CMD_SET_MGMT_EXTRA_IE - #define NL80211_CMD_REG_CHANGE NL80211_CMD_REG_CHANGE +#define NL80211_CMD_AUTHENTICATE NL80211_CMD_AUTHENTICATE +#define NL80211_CMD_ASSOCIATE NL80211_CMD_ASSOCIATE +#define NL80211_CMD_DEAUTHENTICATE NL80211_CMD_DEAUTHENTICATE +#define NL80211_CMD_DISASSOCIATE NL80211_CMD_DISASSOCIATE /** * enum nl80211_attrs - nl80211 netlink attributes @@ -353,6 +380,10 @@ enum nl80211_commands { * an array of command numbers (i.e. a mapping index to command number) * that the driver for the given wiphy supports. 
* + * @NL80211_ATTR_FRAME: frame data (binary attribute), including frame header + * and body, but not FCS; used, e.g., with NL80211_CMD_AUTHENTICATE and + * NL80211_CMD_ASSOCIATE events + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -432,6 +463,8 @@ enum nl80211_attrs { NL80211_ATTR_SUPPORTED_COMMANDS, + NL80211_ATTR_FRAME, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -451,6 +484,7 @@ enum nl80211_attrs { #define NL80211_ATTR_IE NL80211_ATTR_IE #define NL80211_ATTR_REG_INITIATOR NL80211_ATTR_REG_INITIATOR #define NL80211_ATTR_REG_TYPE NL80211_ATTR_REG_TYPE +#define NL80211_ATTR_FRAME NL80211_ATTR_FRAME #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_REG_RULES 32 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 50f3fd9ff524..ad44016021b1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -807,4 +807,50 @@ void cfg80211_put_bss(struct cfg80211_bss *bss); */ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss); +/** + * cfg80211_send_rx_auth - notification of processed authentication + * @dev: network device + * @buf: authentication frame (header + body) + * @len: length of the frame data + * + * This function is called whenever an authentication has been processed in + * station mode. + */ +void cfg80211_send_rx_auth(struct net_device *dev, const u8 *buf, size_t len); + +/** + * cfg80211_send_rx_assoc - notification of processed association + * @dev: network device + * @buf: (re)association response frame (header + body) + * @len: length of the frame data + * + * This function is called whenever a (re)association response has been + * processed in station mode. + */ +void cfg80211_send_rx_assoc(struct net_device *dev, const u8 *buf, size_t len); + +/** + * cfg80211_send_rx_deauth - notification of processed deauthentication + * @dev: network device + * @buf: deauthentication frame (header + body) + * @len: length of the frame data + * + * This function is called whenever deauthentication has been processed in + * station mode. + */ +void cfg80211_send_rx_deauth(struct net_device *dev, const u8 *buf, + size_t len); + +/** + * cfg80211_send_rx_disassoc - notification of processed disassociation + * @dev: network device + * @buf: disassociation response frame (header + body) + * @len: length of the frame data + * + * This function is called whenever disassociation has been processed in + * station mode. 
+ */ +void cfg80211_send_rx_disassoc(struct net_device *dev, const u8 *buf, + size_t len); + #endif /* __NET_CFG80211_H */ diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1f49b63d8dd2..6dc7a61bc18b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1085,11 +1085,13 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, case WLAN_AUTH_OPEN: case WLAN_AUTH_LEAP: ieee80211_auth_completed(sdata); + cfg80211_send_rx_auth(sdata->dev, (u8 *) mgmt, len); break; case WLAN_AUTH_SHARED_KEY: - if (ifmgd->auth_transaction == 4) + if (ifmgd->auth_transaction == 4) { ieee80211_auth_completed(sdata); - else + cfg80211_send_rx_auth(sdata->dev, (u8 *) mgmt, len); + } else ieee80211_auth_challenge(sdata, mgmt, len); break; } @@ -1125,6 +1127,7 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_set_disassoc(sdata, true, false, 0); ifmgd->flags &= ~IEEE80211_STA_AUTHENTICATED; + cfg80211_send_rx_deauth(sdata->dev, (u8 *) mgmt, len); } @@ -1154,6 +1157,7 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, } ieee80211_set_disassoc(sdata, false, false, reason_code); + cfg80211_send_rx_disassoc(sdata->dev, (u8 *) mgmt, len); } @@ -1370,6 +1374,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, ieee80211_set_associated(sdata, changed); ieee80211_associated(sdata); + cfg80211_send_rx_assoc(sdata->dev, (u8 *) mgmt, len); } diff --git a/net/wireless/Makefile b/net/wireless/Makefile index c157b4d8014b..6d1e7b27b752 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_LIB80211_CRYPT_WEP) += lib80211_crypt_wep.o obj-$(CONFIG_LIB80211_CRYPT_CCMP) += lib80211_crypt_ccmp.o obj-$(CONFIG_LIB80211_CRYPT_TKIP) += lib80211_crypt_tkip.o -cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o +cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o mlme.o cfg80211-$(CONFIG_WIRELESS_EXT) += wext-compat.o ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c new file mode 100644 index 000000000000..bec5721b6f99 --- /dev/null +++ b/net/wireless/mlme.c @@ -0,0 +1,46 @@ +/* + * cfg80211 MLME SAP interface + * + * Copyright (c) 2009, Jouni Malinen + */ + +#include +#include +#include +#include +#include +#include "core.h" +#include "nl80211.h" + +void cfg80211_send_rx_auth(struct net_device *dev, const u8 *buf, size_t len) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + nl80211_send_rx_auth(rdev, dev, buf, len); +} +EXPORT_SYMBOL(cfg80211_send_rx_auth); + +void cfg80211_send_rx_assoc(struct net_device *dev, const u8 *buf, size_t len) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + nl80211_send_rx_assoc(rdev, dev, buf, len); +} +EXPORT_SYMBOL(cfg80211_send_rx_assoc); + +void cfg80211_send_rx_deauth(struct net_device *dev, const u8 *buf, size_t len) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + nl80211_send_rx_deauth(rdev, dev, buf, len); +} +EXPORT_SYMBOL(cfg80211_send_rx_deauth); + +void cfg80211_send_rx_disassoc(struct net_device *dev, const u8 *buf, + size_t len) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + nl80211_send_rx_disassoc(rdev, dev, buf, len); +} +EXPORT_SYMBOL(cfg80211_send_rx_disassoc); diff 
--git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a3ecf8d73898..c034c2418cb3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2830,6 +2830,9 @@ static struct genl_ops nl80211_ops[] = { .dumpit = nl80211_dump_scan, }, }; +static struct genl_multicast_group nl80211_mlme_mcgrp = { + .name = "mlme", +}; /* multicast groups */ static struct genl_multicast_group nl80211_config_mcgrp = { @@ -2975,6 +2978,71 @@ nla_put_failure: nlmsg_free(msg); } +static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len, + enum nl80211_commands cmd) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT(msg, NL80211_ATTR_FRAME, len, buf); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_mlme_mcgrp.id, GFP_KERNEL); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, size_t len) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, + NL80211_CMD_AUTHENTICATE); +} + +void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, + size_t len) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, NL80211_CMD_ASSOCIATE); +} + +void nl80211_send_rx_deauth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, + size_t len) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, + NL80211_CMD_DEAUTHENTICATE); +} + +void nl80211_send_rx_disassoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, + size_t len) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, + NL80211_CMD_DISASSOCIATE); +} + /* initialisation/exit functions */ int nl80211_init(void) @@ -3003,6 +3071,10 @@ int nl80211_init(void) if (err) goto err_out; + err = genl_register_mc_group(&nl80211_fam, &nl80211_mlme_mcgrp); + if (err) + goto err_out; + return 0; err_out: genl_unregister_family(&nl80211_fam); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 5b5fe1339de0..b77af4ab80be 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -11,5 +11,17 @@ extern void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, extern void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, struct net_device *netdev); extern void nl80211_send_reg_change_event(struct regulatory_request *request); +extern void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len); +extern void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len); +extern void nl80211_send_rx_deauth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len); +extern void nl80211_send_rx_disassoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len); #endif /* __NET_WIRELESS_NL80211_H */ -- cgit v1.2.3-71-gd317 From 636a5d3625993c5ca59abc81794b9ded93cdb740 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 19 Mar 2009 13:39:22 +0200 Subject: nl80211: 
Add MLME primitives to support external SME This patch adds new nl80211 commands to allow user space to request authentication and association (and also deauthentication and disassociation). The commands are structured to allow separate authentication and association steps, i.e., the interface between kernel and user space is similar to the MLME SAP interface in IEEE 802.11 standard and an user space application takes the role of the SME. The patch introduces MLME-AUTHENTICATE.request, MLME-{,RE}ASSOCIATE.request, MLME-DEAUTHENTICATE.request, and MLME-DISASSOCIATE.request primitives. The authentication and association commands request the actual operations in two steps (assuming the driver supports this; if not, separate authentication step is skipped; this could end up being a separate "connect" command). The initial implementation for mac80211 uses the current net/mac80211/mlme.c for actual sending and processing of management frames and the new nl80211 commands will just stop the current state machine from moving automatically from authentication to association. Future cleanup may move more of the MLME operations into cfg80211. The goal of this design is to provide more control of authentication and association process to user space without having to move the full MLME implementation. This should be enough to allow IEEE 802.11r FT protocol and 802.11s SAE authentication to be implemented. Obviously, this will also bring the extra benefit of not having to use WEXT for association requests with mac80211. An example implementation of a user space SME using the new nl80211 commands is available for wpa_supplicant. This patch is enough to get IEEE 802.11r FT protocol working with over-the-air mechanism (over-the-DS will need additional MLME primitives for handling the FT Action frames). Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 1 + include/linux/nl80211.h | 58 +++++++++-- include/net/cfg80211.h | 113 ++++++++++++++++++++ net/mac80211/cfg.c | 140 +++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 7 +- net/mac80211/mlme.c | 45 ++++++-- net/mac80211/wext.c | 3 + net/wireless/nl80211.c | 255 +++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 601 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 382387e75b89..4b501b48ce86 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -867,6 +867,7 @@ struct ieee80211_ht_info { /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 +#define WLAN_AUTH_FT 2 #define WLAN_AUTH_LEAP 128 #define WLAN_AUTH_CHALLENGE_LEN 128 diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 5ce68ae8314e..9685eaab40a9 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -161,24 +161,37 @@ * %NL80211_REG_TYPE_COUNTRY the alpha2 to which we have moved on * to (%NL80211_ATTR_REG_ALPHA2). * - * @NL80211_CMD_AUTHENTICATE: authentication notification (on the "mlme" - * multicast group). This event reports reception of an Authentication + * @NL80211_CMD_AUTHENTICATE: authentication request and notification. + * This command is used both as a command (request to authenticate) and + * as an event on the "mlme" multicast group indicating completion of the + * authentication process. + * When used as a command, %NL80211_ATTR_IFINDEX is used to identify the + * interface. 
%NL80211_ATTR_MAC is used to specify PeerSTAAddress (and + * BSSID in case of station mode). %NL80211_ATTR_SSID is used to specify + * the SSID (mainly for association, but is included in authentication + * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ is used + * to specify the frequence of the channel in MHz. %NL80211_ATTR_AUTH_TYPE + * is used to specify the authentication type. %NL80211_ATTR_IE is used to + * define IEs (VendorSpecificInfo, but also including RSN IE and FT IEs) + * to be added to the frame. + * When used as an event, this reports reception of an Authentication * frame in station and IBSS modes when the local MLME processed the * frame, i.e., it was for the local STA and was received in correct * state. This is similar to MLME-AUTHENTICATE.confirm primitive in the * MLME SAP interface (kernel providing MLME, userspace SME). The * included NL80211_ATTR_FRAME attribute contains the management frame * (including both the header and frame body, but not FCS). - * @NL80211_CMD_ASSOCIATE: association notification; like - * NL80211_CMD_AUTHENTICATE but for Association Response and Reassociation - * Response frames (similar to MLME-ASSOCIATE.confirm or - * MLME-REASSOCIATE.confirm primitives). - * @NL80211_CMD_DEAUTHENTICATE: deauthentication notification; like + * @NL80211_CMD_ASSOCIATE: association request and notification; like + * NL80211_CMD_AUTHENTICATE but for Association and Reassociation + * (similar to MLME-ASSOCIATE.request, MLME-REASSOCIATE.request, + * MLME-ASSOCIATE.confirm or MLME-REASSOCIATE.confirm primitives). + * @NL80211_CMD_DEAUTHENTICATE: deauthentication request and notification; like * NL80211_CMD_AUTHENTICATE but for Deauthentication frames (similar to - * MLME-DEAUTHENTICATE.indication primitive). - * @NL80211_CMD_DISASSOCIATE: disassociation notification; like + * MLME-DEAUTHENTICATION.request and MLME-DEAUTHENTICATE.indication + * primitives). + * @NL80211_CMD_DISASSOCIATE: disassociation request and notification; like * NL80211_CMD_AUTHENTICATE but for Disassociation frames (similar to - * MLME-DISASSOCIATE.indication primitive). + * MLME-DISASSOCIATE.request and MLME-DISASSOCIATE.indication primitives). 
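/*
 * Illustrative sketch, not part of this patch: how a user-space SME
 * might issue the authenticate request described above over generic
 * netlink.  libnl-3 style calls are assumed for brevity (the reference
 * implementation mentioned in the commit message is wpa_supplicant's
 * nl80211 driver); the interface index, BSSID, SSID and frequency are
 * caller-supplied examples, not values taken from this patch.
 */
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/nl80211.h>

static int sme_authenticate(int ifindex, const unsigned char bssid[6],
                            const unsigned char *ssid, size_t ssid_len,
                            int freq_mhz)
{
        struct nl_sock *sk;
        struct nl_msg *msg;
        int family, err = -1;

        sk = nl_socket_alloc();
        if (!sk)
                return -1;
        if (genl_connect(sk))
                goto out_sock;
        family = genl_ctrl_resolve(sk, "nl80211");
        if (family < 0)
                goto out_sock;
        msg = nlmsg_alloc();
        if (!msg)
                goto out_sock;

        genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
                    NL80211_CMD_AUTHENTICATE, 0);
        nla_put_u32(msg, NL80211_ATTR_IFINDEX, ifindex);
        nla_put(msg, NL80211_ATTR_MAC, 6, bssid);       /* PeerSTAAddress/BSSID */
        nla_put(msg, NL80211_ATTR_SSID, ssid_len, ssid);
        nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq_mhz);
        nla_put_u32(msg, NL80211_ATTR_AUTH_TYPE, NL80211_AUTHTYPE_OPEN_SYSTEM);

        err = nl_send_auto(sk, msg) < 0 ? -1 : 0;
        /* completion is reported as an event on the "mlme" multicast group */
        nlmsg_free(msg);
out_sock:
        nl_socket_free(sk);
        return err;
}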
* * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -383,6 +396,11 @@ enum nl80211_commands { * @NL80211_ATTR_FRAME: frame data (binary attribute), including frame header * and body, but not FCS; used, e.g., with NL80211_CMD_AUTHENTICATE and * NL80211_CMD_ASSOCIATE events + * @NL80211_ATTR_SSID: SSID (binary attribute, 0..32 octets) + * @NL80211_ATTR_AUTH_TYPE: AuthenticationType, see &enum nl80211_auth_type, + * represented as a u32 + * @NL80211_ATTR_REASON_CODE: ReasonCode for %NL80211_CMD_DEAUTHENTICATE and + * %NL80211_CMD_DISASSOCIATE, u16 * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -464,6 +482,9 @@ enum nl80211_attrs { NL80211_ATTR_SUPPORTED_COMMANDS, NL80211_ATTR_FRAME, + NL80211_ATTR_SSID, + NL80211_ATTR_AUTH_TYPE, + NL80211_ATTR_REASON_CODE, /* add attributes here, update the policy in nl80211.c */ @@ -485,6 +506,9 @@ enum nl80211_attrs { #define NL80211_ATTR_REG_INITIATOR NL80211_ATTR_REG_INITIATOR #define NL80211_ATTR_REG_TYPE NL80211_ATTR_REG_TYPE #define NL80211_ATTR_FRAME NL80211_ATTR_FRAME +#define NL80211_ATTR_SSID NL80211_ATTR_SSID +#define NL80211_ATTR_AUTH_TYPE NL80211_ATTR_AUTH_TYPE +#define NL80211_ATTR_REASON_CODE NL80211_ATTR_REASON_CODE #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_REG_RULES 32 @@ -1018,4 +1042,18 @@ enum nl80211_bss { NL80211_BSS_MAX = __NL80211_BSS_AFTER_LAST - 1 }; +/** + * enum nl80211_auth_type - AuthenticationType + * + * @NL80211_AUTHTYPE_OPEN_SYSTEM: Open System authentication + * @NL80211_AUTHTYPE_SHARED_KEY: Shared Key authentication (WEP only) + * @NL80211_AUTHTYPE_FT: Fast BSS Transition (IEEE 802.11r) + * @NL80211_AUTHTYPE_NETWORK_EAP: Network EAP (some Cisco APs and mainly LEAP) + */ +enum nl80211_auth_type { + NL80211_AUTHTYPE_OPEN_SYSTEM, + NL80211_AUTHTYPE_SHARED_KEY, + NL80211_AUTHTYPE_FT, + NL80211_AUTHTYPE_NETWORK_EAP, +}; #endif /* __LINUX_NL80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ad44016021b1..0da9a55881a1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -578,6 +578,105 @@ struct cfg80211_bss { u8 priv[0] __attribute__((__aligned__(sizeof(void *)))); }; +/** + * struct cfg80211_auth_request - Authentication request data + * + * This structure provides information needed to complete IEEE 802.11 + * authentication. + * NOTE: This structure will likely change when more code from mac80211 is + * moved into cfg80211 so that non-mac80211 drivers can benefit from it, too. + * Before using this in a driver that does not use mac80211, it would be better + * to check the status of that work and better yet, volunteer to work on it. 
+ * + * @chan: The channel to use or %NULL if not specified (auto-select based on + * scan results) + * @peer_addr: The address of the peer STA (AP BSSID in infrastructure case); + * this field is required to be present; if the driver wants to help with + * BSS selection, it should use (yet to be added) MLME event to allow user + * space SME to be notified of roaming candidate, so that the SME can then + * use the authentication request with the recommended BSSID and whatever + * other data may be needed for authentication/association + * @ssid: SSID or %NULL if not yet available + * @ssid_len: Length of ssid in octets + * @auth_type: Authentication type (algorithm) + * @ie: Extra IEs to add to Authentication frame or %NULL + * @ie_len: Length of ie buffer in octets + */ +struct cfg80211_auth_request { + struct ieee80211_channel *chan; + u8 *peer_addr; + const u8 *ssid; + size_t ssid_len; + enum nl80211_auth_type auth_type; + const u8 *ie; + size_t ie_len; +}; + +/** + * struct cfg80211_assoc_request - (Re)Association request data + * + * This structure provides information needed to complete IEEE 802.11 + * (re)association. + * NOTE: This structure will likely change when more code from mac80211 is + * moved into cfg80211 so that non-mac80211 drivers can benefit from it, too. + * Before using this in a driver that does not use mac80211, it would be better + * to check the status of that work and better yet, volunteer to work on it. + * + * @chan: The channel to use or %NULL if not specified (auto-select based on + * scan results) + * @peer_addr: The address of the peer STA (AP BSSID); this field is required + * to be present and the STA must be in State 2 (authenticated) with the + * peer STA + * @ssid: SSID + * @ssid_len: Length of ssid in octets + * @ie: Extra IEs to add to (Re)Association Request frame or %NULL + * @ie_len: Length of ie buffer in octets + */ +struct cfg80211_assoc_request { + struct ieee80211_channel *chan; + u8 *peer_addr; + const u8 *ssid; + size_t ssid_len; + const u8 *ie; + size_t ie_len; +}; + +/** + * struct cfg80211_deauth_request - Deauthentication request data + * + * This structure provides information needed to complete IEEE 802.11 + * deauthentication. + * + * @peer_addr: The address of the peer STA (AP BSSID); this field is required + * to be present and the STA must be authenticated with the peer STA + * @ie: Extra IEs to add to Deauthentication frame or %NULL + * @ie_len: Length of ie buffer in octets + */ +struct cfg80211_deauth_request { + u8 *peer_addr; + u16 reason_code; + const u8 *ie; + size_t ie_len; +}; + +/** + * struct cfg80211_disassoc_request - Disassociation request data + * + * This structure provides information needed to complete IEEE 802.11 + * disassocation. + * + * @peer_addr: The address of the peer STA (AP BSSID); this field is required + * to be present and the STA must be associated with the peer STA + * @ie: Extra IEs to add to Disassociation frame or %NULL + * @ie_len: Length of ie buffer in octets + */ +struct cfg80211_disassoc_request { + u8 *peer_addr; + u16 reason_code; + const u8 *ie; + size_t ie_len; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -650,6 +749,11 @@ struct cfg80211_bss { * the driver, and will be valid until passed to cfg80211_scan_done(). * For scan results, call cfg80211_inform_bss(); you can call this outside * the scan/scan_done bracket too. 
+ * + * @auth: Request to authenticate with the specified peer + * @assoc: Request to (re)associate with the specified peer + * @deauth: Request to deauthenticate from the specified peer + * @disassoc: Request to disassociate from the specified peer */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy); @@ -730,6 +834,15 @@ struct cfg80211_ops { int (*scan)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_scan_request *request); + + int (*auth)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_auth_request *req); + int (*assoc)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_assoc_request *req); + int (*deauth)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_deauth_request *req); + int (*disassoc)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_disassoc_request *req); }; /* temporary wext handlers */ diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 58693e52d458..223e536e8426 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1300,6 +1300,142 @@ static int ieee80211_scan(struct wiphy *wiphy, return ieee80211_request_scan(sdata, req); } +static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_auth_request *req) +{ + struct ieee80211_sub_if_data *sdata; + + if (!netif_running(dev)) + return -ENETDOWN; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + switch (req->auth_type) { + case NL80211_AUTHTYPE_OPEN_SYSTEM: + sdata->u.mgd.auth_algs = IEEE80211_AUTH_ALG_OPEN; + break; + case NL80211_AUTHTYPE_SHARED_KEY: + sdata->u.mgd.auth_algs = IEEE80211_AUTH_ALG_SHARED_KEY; + break; + case NL80211_AUTHTYPE_FT: + sdata->u.mgd.auth_algs = IEEE80211_AUTH_ALG_FT; + break; + case NL80211_AUTHTYPE_NETWORK_EAP: + sdata->u.mgd.auth_algs = IEEE80211_AUTH_ALG_LEAP; + break; + default: + return -EOPNOTSUPP; + } + + memcpy(sdata->u.mgd.bssid, req->peer_addr, ETH_ALEN); + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; + sdata->u.mgd.flags |= IEEE80211_STA_BSSID_SET; + + /* TODO: req->chan */ + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_CHANNEL_SEL; + + if (req->ssid) { + sdata->u.mgd.flags |= IEEE80211_STA_SSID_SET; + memcpy(sdata->u.mgd.ssid, req->ssid, req->ssid_len); + sdata->u.mgd.ssid_len = req->ssid_len; + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_SSID_SEL; + } + + kfree(sdata->u.mgd.sme_auth_ie); + sdata->u.mgd.sme_auth_ie = NULL; + sdata->u.mgd.sme_auth_ie_len = 0; + if (req->ie) { + sdata->u.mgd.sme_auth_ie = kmalloc(req->ie_len, GFP_KERNEL); + if (sdata->u.mgd.sme_auth_ie == NULL) + return -ENOMEM; + memcpy(sdata->u.mgd.sme_auth_ie, req->ie, req->ie_len); + sdata->u.mgd.sme_auth_ie_len = req->ie_len; + } + + sdata->u.mgd.flags |= IEEE80211_STA_EXT_SME; + sdata->u.mgd.state = IEEE80211_STA_MLME_DIRECT_PROBE; + ieee80211_sta_req_auth(sdata); + return 0; +} + +static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_assoc_request *req) +{ + struct ieee80211_sub_if_data *sdata; + int ret; + + if (!netif_running(dev)) + return -ENETDOWN; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + if (memcmp(sdata->u.mgd.bssid, req->peer_addr, ETH_ALEN) != 0 || + !(sdata->u.mgd.flags & IEEE80211_STA_AUTHENTICATED)) + return -ENOLINK; /* not authenticated */ + + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; + sdata->u.mgd.flags |= IEEE80211_STA_BSSID_SET; + + /* TODO: req->chan */ + sdata->u.mgd.flags |= 
IEEE80211_STA_AUTO_CHANNEL_SEL; + + if (req->ssid) { + sdata->u.mgd.flags |= IEEE80211_STA_SSID_SET; + memcpy(sdata->u.mgd.ssid, req->ssid, req->ssid_len); + sdata->u.mgd.ssid_len = req->ssid_len; + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_SSID_SEL; + } else + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_SSID_SEL; + + ret = ieee80211_sta_set_extra_ie(sdata, req->ie, req->ie_len); + if (ret) + return ret; + + sdata->u.mgd.flags |= IEEE80211_STA_EXT_SME; + sdata->u.mgd.state = IEEE80211_STA_MLME_ASSOCIATE; + ieee80211_sta_req_auth(sdata); + return 0; +} + +static int ieee80211_deauth(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_deauth_request *req) +{ + struct ieee80211_sub_if_data *sdata; + + if (!netif_running(dev)) + return -ENETDOWN; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + /* TODO: req->ie */ + return ieee80211_sta_deauthenticate(sdata, req->reason_code); +} + +static int ieee80211_disassoc(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_disassoc_request *req) +{ + struct ieee80211_sub_if_data *sdata; + + if (!netif_running(dev)) + return -ENETDOWN; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + /* TODO: req->ie */ + return ieee80211_sta_disassociate(sdata, req->reason_code); +} + struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -1333,4 +1469,8 @@ struct cfg80211_ops mac80211_config_ops = { .suspend = ieee80211_suspend, .resume = ieee80211_resume, .scan = ieee80211_scan, + .auth = ieee80211_auth, + .assoc = ieee80211_assoc, + .deauth = ieee80211_deauth, + .disassoc = ieee80211_disassoc, }; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ad12c2a03a95..7b96d95f48b1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -256,6 +256,7 @@ struct mesh_preq_queue { #define IEEE80211_STA_TKIP_WEP_USED BIT(14) #define IEEE80211_STA_CSA_RECEIVED BIT(15) #define IEEE80211_STA_MFP_ENABLED BIT(16) +#define IEEE80211_STA_EXT_SME BIT(17) /* flags for MLME request */ #define IEEE80211_STA_REQ_SCAN 0 #define IEEE80211_STA_REQ_DIRECT_PROBE 1 @@ -266,6 +267,7 @@ struct mesh_preq_queue { #define IEEE80211_AUTH_ALG_OPEN BIT(0) #define IEEE80211_AUTH_ALG_SHARED_KEY BIT(1) #define IEEE80211_AUTH_ALG_LEAP BIT(2) +#define IEEE80211_AUTH_ALG_FT BIT(3) struct ieee80211_if_managed { struct timer_list timer; @@ -335,6 +337,9 @@ struct ieee80211_if_managed { size_t ie_deauth_len; u8 *ie_disassoc; size_t ie_disassoc_len; + + u8 *sme_auth_ie; + size_t sme_auth_ie_len; }; enum ieee80211_ibss_flags { @@ -970,7 +975,7 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_rx_status *rx_status); int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, - char *ie, size_t len); + const char *ie, size_t len); void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local); void ieee80211_scan_failed(struct ieee80211_local *local); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6dc7a61bc18b..d1bcc8438772 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -730,6 +730,8 @@ static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; + u8 *ies; + size_t ies_len; ifmgd->auth_tries++; if (ifmgd->auth_tries > IEEE80211_AUTH_MAX_TRIES) { 
@@ -755,7 +757,14 @@ static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata) printk(KERN_DEBUG "%s: authenticate with AP %pM\n", sdata->dev->name, ifmgd->bssid); - ieee80211_send_auth(sdata, 1, ifmgd->auth_alg, NULL, 0, + if (ifmgd->flags & IEEE80211_STA_EXT_SME) { + ies = ifmgd->sme_auth_ie; + ies_len = ifmgd->sme_auth_ie_len; + } else { + ies = NULL; + ies_len = 0; + } + ieee80211_send_auth(sdata, 1, ifmgd->auth_alg, ies, ies_len, ifmgd->bssid, 0); ifmgd->auth_transaction = 2; @@ -870,7 +879,8 @@ static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata) int wep_privacy; int privacy_invoked; - if (!ifmgd || (ifmgd->flags & IEEE80211_STA_MIXED_CELL)) + if (!ifmgd || (ifmgd->flags & (IEEE80211_STA_MIXED_CELL | + IEEE80211_STA_EXT_SME))) return 0; bss = ieee80211_rx_bss_get(local, ifmgd->bssid, @@ -998,7 +1008,11 @@ static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata) printk(KERN_DEBUG "%s: authenticated\n", sdata->dev->name); ifmgd->flags |= IEEE80211_STA_AUTHENTICATED; - ieee80211_associate(sdata); + if (ifmgd->flags & IEEE80211_STA_EXT_SME) { + /* Wait for SME to request association */ + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + } else + ieee80211_associate(sdata); } @@ -1084,6 +1098,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, switch (ifmgd->auth_alg) { case WLAN_AUTH_OPEN: case WLAN_AUTH_LEAP: + case WLAN_AUTH_FT: ieee80211_auth_completed(sdata); cfg80211_send_rx_auth(sdata->dev, (u8 *) mgmt, len); break; @@ -1117,9 +1132,10 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, printk(KERN_DEBUG "%s: deauthenticated (Reason: %u)\n", sdata->dev->name, reason_code); - if (ifmgd->state == IEEE80211_STA_MLME_AUTHENTICATE || - ifmgd->state == IEEE80211_STA_MLME_ASSOCIATE || - ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) { + if (!(ifmgd->flags & IEEE80211_STA_EXT_SME) && + (ifmgd->state == IEEE80211_STA_MLME_AUTHENTICATE || + ifmgd->state == IEEE80211_STA_MLME_ASSOCIATE || + ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED)) { ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE; mod_timer(&ifmgd->timer, jiffies + IEEE80211_RETRY_AUTH_INTERVAL); @@ -1150,7 +1166,8 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, printk(KERN_DEBUG "%s: disassociated (Reason: %u)\n", sdata->dev->name, reason_code); - if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) { + if (!(ifmgd->flags & IEEE80211_STA_EXT_SME) && + ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) { ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE; mod_timer(&ifmgd->timer, jiffies + IEEE80211_RETRY_AUTH_INTERVAL); @@ -1664,6 +1681,8 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata) ifmgd->auth_alg = WLAN_AUTH_SHARED_KEY; else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP) ifmgd->auth_alg = WLAN_AUTH_LEAP; + else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_FT) + ifmgd->auth_alg = WLAN_AUTH_FT; else ifmgd->auth_alg = WLAN_AUTH_OPEN; ifmgd->auth_transaction = -1; @@ -1687,7 +1706,8 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata) u16 capa_val = WLAN_CAPABILITY_ESS; struct ieee80211_channel *chan = local->oper_channel; - if (ifmgd->flags & (IEEE80211_STA_AUTO_SSID_SEL | + if (!(ifmgd->flags & IEEE80211_STA_EXT_SME) && + ifmgd->flags & (IEEE80211_STA_AUTO_SSID_SEL | IEEE80211_STA_AUTO_BSSID_SEL | IEEE80211_STA_AUTO_CHANNEL_SEL)) { capa_mask |= WLAN_CAPABILITY_PRIVACY; @@ -1884,7 +1904,11 @@ void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata) 
ieee80211_set_disassoc(sdata, true, true, WLAN_REASON_DEAUTH_LEAVING); - set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request); + if (!(ifmgd->flags & IEEE80211_STA_EXT_SME) || + ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE) + set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request); + else if (ifmgd->flags & IEEE80211_STA_EXT_SME) + set_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request); queue_work(local->hw.workqueue, &ifmgd->work); } } @@ -1953,7 +1977,8 @@ int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid) return ieee80211_sta_commit(sdata); } -int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len) +int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, + const char *ie, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index e55d2834764c..ce21d66b1023 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -137,6 +137,7 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev, if (ret) return ret; sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; + sdata->u.mgd.flags &= ~IEEE80211_STA_EXT_SME; ieee80211_sta_req_auth(sdata); return 0; } @@ -224,6 +225,7 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev, if (ret) return ret; + sdata->u.mgd.flags &= ~IEEE80211_STA_EXT_SME; ieee80211_sta_req_auth(sdata); return 0; } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) @@ -287,6 +289,7 @@ static int ieee80211_ioctl_siwap(struct net_device *dev, ret = ieee80211_sta_set_bssid(sdata, (u8 *) &ap_addr->sa_data); if (ret) return ret; + sdata->u.mgd.flags &= ~IEEE80211_STA_EXT_SME; ieee80211_sta_req_auth(sdata); return 0; } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c034c2418cb3..9e1318d1d4bb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -111,6 +111,11 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { .len = IEEE80211_MAX_DATA_LEN }, [NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED }, [NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED }, + + [NL80211_ATTR_SSID] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_SSID_LEN }, + [NL80211_ATTR_AUTH_TYPE] = { .type = NLA_U32 }, + [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 }, }; /* message building helper */ @@ -265,6 +270,10 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, CMD(set_mesh_params, SET_MESH_PARAMS); CMD(change_bss, SET_BSS); CMD(set_mgmt_extra_ie, SET_MGMT_EXTRA_IE); + CMD(auth, AUTHENTICATE); + CMD(assoc, ASSOCIATE); + CMD(deauth, DEAUTHENTICATE); + CMD(disassoc, DISASSOCIATE); #undef CMD nla_nest_end(msg, nl_cmds); @@ -2646,6 +2655,228 @@ static int nl80211_dump_scan(struct sk_buff *skb, return err; } +static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + struct net_device *dev; + struct cfg80211_auth_request req; + struct wiphy *wiphy; + int err; + + rtnl_lock(); + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + goto unlock_rtnl; + + if (!drv->ops->auth) { + err = -EOPNOTSUPP; + goto out; + } + + if (!info->attrs[NL80211_ATTR_MAC]) { + err = -EINVAL; + goto out; + } + + wiphy = &drv->wiphy; + memset(&req, 0, sizeof(req)); + + req.peer_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { + req.chan = ieee80211_get_channel( + wiphy, + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if 
(!req.chan) { + err = -EINVAL; + goto out; + } + } + + if (info->attrs[NL80211_ATTR_SSID]) { + req.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + req.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + } + + if (info->attrs[NL80211_ATTR_IE]) { + req.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + if (info->attrs[NL80211_ATTR_AUTH_TYPE]) { + req.auth_type = + nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]); + } + + err = drv->ops->auth(&drv->wiphy, dev, &req); + +out: + cfg80211_put_dev(drv); + dev_put(dev); +unlock_rtnl: + rtnl_unlock(); + return err; +} + +static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + struct net_device *dev; + struct cfg80211_assoc_request req; + struct wiphy *wiphy; + int err; + + rtnl_lock(); + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + goto unlock_rtnl; + + if (!drv->ops->assoc) { + err = -EOPNOTSUPP; + goto out; + } + + if (!info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_SSID]) { + err = -EINVAL; + goto out; + } + + wiphy = &drv->wiphy; + memset(&req, 0, sizeof(req)); + + req.peer_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { + req.chan = ieee80211_get_channel( + wiphy, + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (!req.chan) { + err = -EINVAL; + goto out; + } + } + + if (nla_len(info->attrs[NL80211_ATTR_SSID]) > IEEE80211_MAX_SSID_LEN) { + err = -EINVAL; + goto out; + } + req.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + req.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + + if (info->attrs[NL80211_ATTR_IE]) { + req.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + err = drv->ops->assoc(&drv->wiphy, dev, &req); + +out: + cfg80211_put_dev(drv); + dev_put(dev); +unlock_rtnl: + rtnl_unlock(); + return err; +} + +static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + struct net_device *dev; + struct cfg80211_deauth_request req; + struct wiphy *wiphy; + int err; + + rtnl_lock(); + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + goto unlock_rtnl; + + if (!drv->ops->deauth) { + err = -EOPNOTSUPP; + goto out; + } + + if (!info->attrs[NL80211_ATTR_MAC]) { + err = -EINVAL; + goto out; + } + + wiphy = &drv->wiphy; + memset(&req, 0, sizeof(req)); + + req.peer_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_REASON_CODE]) + req.reason_code = + nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); + + if (info->attrs[NL80211_ATTR_IE]) { + req.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + err = drv->ops->deauth(&drv->wiphy, dev, &req); + +out: + cfg80211_put_dev(drv); + dev_put(dev); +unlock_rtnl: + rtnl_unlock(); + return err; +} + +static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + struct net_device *dev; + struct cfg80211_disassoc_request req; + struct wiphy *wiphy; + int err; + + rtnl_lock(); + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + goto unlock_rtnl; + + if (!drv->ops->disassoc) { + err = -EOPNOTSUPP; + goto out; + } + + if (!info->attrs[NL80211_ATTR_MAC]) { + err = -EINVAL; + goto out; + } + + wiphy = &drv->wiphy; + memset(&req, 0, sizeof(req)); + + req.peer_addr = 
nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_REASON_CODE]) + req.reason_code = + nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); + + if (info->attrs[NL80211_ATTR_IE]) { + req.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + err = drv->ops->disassoc(&drv->wiphy, dev, &req); + +out: + cfg80211_put_dev(drv); + dev_put(dev); +unlock_rtnl: + rtnl_unlock(); + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -2829,6 +3060,30 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .dumpit = nl80211_dump_scan, }, + { + .cmd = NL80211_CMD_AUTHENTICATE, + .doit = nl80211_authenticate, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_ASSOCIATE, + .doit = nl80211_associate, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_DEAUTHENTICATE, + .doit = nl80211_deauthenticate, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_DISASSOCIATE, + .doit = nl80211_disassociate, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_multicast_group nl80211_mlme_mcgrp = { .name = "mlme", -- cgit v1.2.3-71-gd317 From 65fc73ac4a310945dfeceac961726c2765ad2ec0 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Fri, 20 Mar 2009 21:21:16 +0200 Subject: nl80211: Remove NL80211_CMD_SET_MGMT_EXTRA_IE The functionality that NL80211_CMD_SET_MGMT_EXTRA_IE provided can now be achieved with cleaner design by adding IE(s) into NL80211_CMD_TRIGGER_SCAN, NL80211_CMD_AUTHENTICATE, NL80211_CMD_ASSOCIATE, NL80211_CMD_DEAUTHENTICATE, and NL80211_CMD_DISASSOCIATE. Since this is a very recently added command and there are no known (or known planned) applications using NL80211_CMD_SET_MGMT_EXTRA_IE and taken into account how much extra complexity it adds to the IE processing we have now (and need to add in the future to fix IE order in couple of frames), it looks like the best option is to just remove the implementation of this command for now. The enum values themselves are left to avoid changing the nl80211 command or attribute numbers. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- include/linux/nl80211.h | 8 ++++- include/net/cfg80211.h | 26 -------------- net/mac80211/cfg.c | 86 ---------------------------------------------- net/mac80211/ieee80211_i.h | 15 -------- net/mac80211/iface.c | 7 ---- net/mac80211/mlme.c | 36 ++----------------- net/mac80211/util.c | 29 +++------------- net/wireless/nl80211.c | 47 ------------------------- 8 files changed, 14 insertions(+), 240 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 9685eaab40a9..cbe8ce3bf486 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -142,6 +142,12 @@ * %NL80211_ATTR_IE. If the command succeeds, the requested data will be * added to all specified management frames generated by * kernel/firmware/driver. + * Note: This command has been removed and it is only reserved at this + * point to avoid re-using existing command number. The functionality this + * command was planned for has been provided with cleaner design with the + * option to specify additional IEs in NL80211_CMD_TRIGGER_SCAN, + * NL80211_CMD_AUTHENTICATE, NL80211_CMD_ASSOCIATE, + * NL80211_CMD_DEAUTHENTICATE, and NL80211_CMD_DISASSOCIATE. 
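/*
 * Illustrative sketch, not part of this patch: with the separate
 * SET_MGMT_EXTRA_IE command gone, user space attaches extra IEs (RSN,
 * FT, vendor-specific) directly to the MLME command that will carry
 * them, using NL80211_ATTR_IE.  "msg" is assumed to be a generic
 * netlink message started with genlmsg_put(..., NL80211_CMD_ASSOCIATE, 0)
 * as in the earlier authenticate example.
 */
#include <netlink/msg.h>
#include <netlink/attr.h>
#include <linux/nl80211.h>

static int attach_assoc_ies(struct nl_msg *msg, const unsigned char *ies,
                            size_t ies_len)
{
        /* the kernel copies these into the (Re)Association Request body */
        return nla_put(msg, NL80211_ATTR_IE, ies_len, ies);
}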
* * @NL80211_CMD_GET_SCAN: get scan results * @NL80211_CMD_TRIGGER_SCAN: trigger a new scan with the given parameters @@ -238,7 +244,7 @@ enum nl80211_commands { NL80211_CMD_GET_MESH_PARAMS, NL80211_CMD_SET_MESH_PARAMS, - NL80211_CMD_SET_MGMT_EXTRA_IE, + NL80211_CMD_SET_MGMT_EXTRA_IE /* reserved; not used */, NL80211_CMD_GET_REG, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0da9a55881a1..dca4a6b0461b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -471,26 +471,6 @@ struct ieee80211_txq_params { u8 aifs; }; -/** - * struct mgmt_extra_ie_params - Extra management frame IE parameters - * - * Used to add extra IE(s) into management frames. If the driver cannot add the - * requested data into all management frames of the specified subtype that are - * generated in kernel or firmware/hardware, it must reject the configuration - * call. The IE data buffer is added to the end of the specified management - * frame body after all other IEs. This addition is not applied to frames that - * are injected through a monitor interface. - * - * @subtype: Management frame subtype - * @ies: IE data buffer or %NULL to remove previous data - * @ies_len: Length of @ies in octets - */ -struct mgmt_extra_ie_params { - u8 subtype; - u8 *ies; - int ies_len; -}; - /* from net/wireless.h */ struct wiphy; @@ -743,8 +723,6 @@ struct cfg80211_disassoc_request { * * @set_channel: Set channel * - * @set_mgmt_extra_ie: Set extra IE data for management frames - * * @scan: Request to do a scan. If returning zero, the scan request is given * the driver, and will be valid until passed to cfg80211_scan_done(). * For scan results, call cfg80211_inform_bss(); you can call this outside @@ -828,10 +806,6 @@ struct cfg80211_ops { struct ieee80211_channel *chan, enum nl80211_channel_type channel_type); - int (*set_mgmt_extra_ie)(struct wiphy *wiphy, - struct net_device *dev, - struct mgmt_extra_ie_params *params); - int (*scan)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_scan_request *request); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 223e536e8426..f5c15c9a00ce 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1181,91 +1181,6 @@ static int ieee80211_set_channel(struct wiphy *wiphy, return ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } -static int set_mgmt_extra_ie_sta(struct ieee80211_sub_if_data *sdata, - u8 subtype, u8 *ies, size_t ies_len) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - - switch (subtype) { - case IEEE80211_STYPE_PROBE_REQ >> 4: - if (local->ops->hw_scan) - break; - kfree(ifmgd->ie_probereq); - ifmgd->ie_probereq = ies; - ifmgd->ie_probereq_len = ies_len; - return 0; - case IEEE80211_STYPE_PROBE_RESP >> 4: - kfree(ifmgd->ie_proberesp); - ifmgd->ie_proberesp = ies; - ifmgd->ie_proberesp_len = ies_len; - return 0; - case IEEE80211_STYPE_AUTH >> 4: - kfree(ifmgd->ie_auth); - ifmgd->ie_auth = ies; - ifmgd->ie_auth_len = ies_len; - return 0; - case IEEE80211_STYPE_ASSOC_REQ >> 4: - kfree(ifmgd->ie_assocreq); - ifmgd->ie_assocreq = ies; - ifmgd->ie_assocreq_len = ies_len; - return 0; - case IEEE80211_STYPE_REASSOC_REQ >> 4: - kfree(ifmgd->ie_reassocreq); - ifmgd->ie_reassocreq = ies; - ifmgd->ie_reassocreq_len = ies_len; - return 0; - case IEEE80211_STYPE_DEAUTH >> 4: - kfree(ifmgd->ie_deauth); - ifmgd->ie_deauth = ies; - ifmgd->ie_deauth_len = ies_len; - return 0; - case IEEE80211_STYPE_DISASSOC >> 4: - kfree(ifmgd->ie_disassoc); - ifmgd->ie_disassoc = ies; - 
ifmgd->ie_disassoc_len = ies_len; - return 0; - } - - return -EOPNOTSUPP; -} - -static int ieee80211_set_mgmt_extra_ie(struct wiphy *wiphy, - struct net_device *dev, - struct mgmt_extra_ie_params *params) -{ - struct ieee80211_sub_if_data *sdata; - u8 *ies; - size_t ies_len; - int ret = -EOPNOTSUPP; - - if (params->ies) { - ies = kmemdup(params->ies, params->ies_len, GFP_KERNEL); - if (ies == NULL) - return -ENOMEM; - ies_len = params->ies_len; - } else { - ies = NULL; - ies_len = 0; - } - - sdata = IEEE80211_DEV_TO_SUB_IF(dev); - - switch (sdata->vif.type) { - case NL80211_IFTYPE_STATION: - ret = set_mgmt_extra_ie_sta(sdata, params->subtype, - ies, ies_len); - break; - default: - ret = -EOPNOTSUPP; - break; - } - - if (ret) - kfree(ies); - return ret; -} - #ifdef CONFIG_PM static int ieee80211_suspend(struct wiphy *wiphy) { @@ -1465,7 +1380,6 @@ struct cfg80211_ops mac80211_config_ops = { .change_bss = ieee80211_change_bss, .set_txq_params = ieee80211_set_txq_params, .set_channel = ieee80211_set_channel, - .set_mgmt_extra_ie = ieee80211_set_mgmt_extra_ie, .suspend = ieee80211_suspend, .resume = ieee80211_resume, .scan = ieee80211_scan, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 547cfac218ee..f69e84ab9617 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -323,21 +323,6 @@ struct ieee80211_if_managed { int wmm_last_param_set; /* Extra IE data for management frames */ - u8 *ie_probereq; - size_t ie_probereq_len; - u8 *ie_proberesp; - size_t ie_proberesp_len; - u8 *ie_auth; - size_t ie_auth_len; - u8 *ie_assocreq; - size_t ie_assocreq_len; - u8 *ie_reassocreq; - size_t ie_reassocreq_len; - u8 *ie_deauth; - size_t ie_deauth_len; - u8 *ie_disassoc; - size_t ie_disassoc_len; - u8 *sme_auth_ie; size_t sme_auth_ie_len; }; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 6b56dc2208e7..34f4798a98f7 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -653,13 +653,6 @@ static void ieee80211_teardown_sdata(struct net_device *dev) kfree(sdata->u.mgd.extra_ie); kfree(sdata->u.mgd.assocreq_ies); kfree(sdata->u.mgd.assocresp_ies); - kfree(sdata->u.mgd.ie_probereq); - kfree(sdata->u.mgd.ie_proberesp); - kfree(sdata->u.mgd.ie_auth); - kfree(sdata->u.mgd.ie_assocreq); - kfree(sdata->u.mgd.ie_reassocreq); - kfree(sdata->u.mgd.ie_deauth); - kfree(sdata->u.mgd.ie_disassoc); kfree(sdata->u.mgd.sme_auth_ie); break; case NL80211_IFTYPE_WDS: diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d1bcc8438772..b0808efcedf6 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -82,38 +82,23 @@ static int ieee80211_compatible_rates(struct ieee80211_bss *bss, /* frame sending functions */ -static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len) -{ - if (ies) - memcpy(skb_put(skb, ies_len), ies, ies_len); -} - static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - u8 *pos, *ies, *ht_ie, *e_ies; + u8 *pos, *ies, *ht_ie; int i, len, count, rates_len, supp_rates_len; u16 capab; struct ieee80211_bss *bss; int wmm = 0; struct ieee80211_supported_band *sband; u32 rates = 0; - size_t e_ies_len; - - if (ifmgd->flags & IEEE80211_STA_PREV_BSSID_SET) { - e_ies = sdata->u.mgd.ie_reassocreq; - e_ies_len = sdata->u.mgd.ie_reassocreq_len; - } else { - e_ies = sdata->u.mgd.ie_assocreq; - e_ies_len = sdata->u.mgd.ie_assocreq_len; - } skb = 
dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 + ifmgd->extra_ie_len + - ifmgd->ssid_len + e_ies_len); + ifmgd->ssid_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for assoc " "frame\n", sdata->dev->name); @@ -304,8 +289,6 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs)); } - add_extra_ies(skb, e_ies, e_ies_len); - kfree(ifmgd->assocreq_ies); ifmgd->assocreq_ies_len = (skb->data + skb->len) - ies; ifmgd->assocreq_ies = kmalloc(ifmgd->assocreq_ies_len, GFP_KERNEL); @@ -323,19 +306,8 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - u8 *ies; - size_t ies_len; - if (stype == IEEE80211_STYPE_DEAUTH) { - ies = sdata->u.mgd.ie_deauth; - ies_len = sdata->u.mgd.ie_deauth_len; - } else { - ies = sdata->u.mgd.ie_disassoc; - ies_len = sdata->u.mgd.ie_disassoc_len; - } - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + - ies_len); + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt)); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for " "deauth/disassoc frame\n", sdata->dev->name); @@ -353,8 +325,6 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, /* u.deauth.reason_code == u.disassoc.reason_code */ mgmt->u.deauth.reason_code = cpu_to_le16(reason); - add_extra_ies(skb, ies, ies_len); - ieee80211_tx_skb(sdata, skb, ifmgd->flags & IEEE80211_STA_MFP_ENABLED); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index e0431a1d218b..444bb14c95e1 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -846,16 +846,9 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - const u8 *ie_auth = NULL; - int ie_auth_len = 0; - - if (sdata->vif.type == NL80211_IFTYPE_STATION) { - ie_auth_len = sdata->u.mgd.ie_auth_len; - ie_auth = sdata->u.mgd.ie_auth; - } skb = dev_alloc_skb(local->hw.extra_tx_headroom + - sizeof(*mgmt) + 6 + extra_len + ie_auth_len); + sizeof(*mgmt) + 6 + extra_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for auth " "frame\n", sdata->dev->name); @@ -877,8 +870,6 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, mgmt->u.auth.status_code = cpu_to_le16(0); if (extra) memcpy(skb_put(skb, extra_len), extra, extra_len); - if (ie_auth) - memcpy(skb_put(skb, ie_auth_len), ie_auth, ie_auth_len); ieee80211_tx_skb(sdata, skb, encrypt); } @@ -891,20 +882,11 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, struct ieee80211_supported_band *sband; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - u8 *pos, *supp_rates, *esupp_rates = NULL, *extra_preq_ie = NULL; - int i, extra_preq_ie_len = 0; - - switch (sdata->vif.type) { - case NL80211_IFTYPE_STATION: - extra_preq_ie_len = sdata->u.mgd.ie_probereq_len; - extra_preq_ie = sdata->u.mgd.ie_probereq; - break; - default: - break; - } + u8 *pos, *supp_rates, *esupp_rates = NULL; + int i; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 + - ie_len + extra_preq_ie_len); + ie_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for probe " "request\n", sdata->dev->name); @@ -953,9 +935,6 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, if (ie) memcpy(skb_put(skb, ie_len), ie, ie_len); - if 
(extra_preq_ie) - memcpy(skb_put(skb, extra_preq_ie_len), extra_preq_ie, - extra_preq_ie_len); ieee80211_tx_skb(sdata, skb, 0); } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9e1318d1d4bb..44c79972be57 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -269,7 +269,6 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, CMD(add_mpath, NEW_MPATH); CMD(set_mesh_params, SET_MESH_PARAMS); CMD(change_bss, SET_BSS); - CMD(set_mgmt_extra_ie, SET_MGMT_EXTRA_IE); CMD(auth, AUTHENTICATE); CMD(assoc, ASSOCIATE); CMD(deauth, DEAUTHENTICATE); @@ -2355,46 +2354,6 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } -static int nl80211_set_mgmt_extra_ie(struct sk_buff *skb, - struct genl_info *info) -{ - struct cfg80211_registered_device *drv; - int err; - struct net_device *dev; - struct mgmt_extra_ie_params params; - - memset(¶ms, 0, sizeof(params)); - - if (!info->attrs[NL80211_ATTR_MGMT_SUBTYPE]) - return -EINVAL; - params.subtype = nla_get_u8(info->attrs[NL80211_ATTR_MGMT_SUBTYPE]); - if (params.subtype > 15) - return -EINVAL; /* FC Subtype field is 4 bits (0..15) */ - - if (info->attrs[NL80211_ATTR_IE]) { - params.ies = nla_data(info->attrs[NL80211_ATTR_IE]); - params.ies_len = nla_len(info->attrs[NL80211_ATTR_IE]); - } - - rtnl_lock(); - - err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); - if (err) - goto out_rtnl; - - if (drv->ops->set_mgmt_extra_ie) - err = drv->ops->set_mgmt_extra_ie(&drv->wiphy, dev, ¶ms); - else - err = -EOPNOTSUPP; - - cfg80211_put_dev(drv); - dev_put(dev); - out_rtnl: - rtnl_unlock(); - - return err; -} - static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *drv; @@ -3043,12 +3002,6 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, - { - .cmd = NL80211_CMD_SET_MGMT_EXTRA_IE, - .doit = nl80211_set_mgmt_extra_ie, - .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, - }, { .cmd = NL80211_CMD_TRIGGER_SCAN, .doit = nl80211_trigger_scan, -- cgit v1.2.3-71-gd317 From af83debf5bb44257082d4489ac86123a0cadf6d3 Mon Sep 17 00:00:00 2001 From: Tulio Magno Quites Machado Filho Date: Sun, 22 Mar 2009 01:41:13 +0100 Subject: ath5k: Support LED's on Acer Extensa 5620z Add vendor ID for Quanta Microsystems and update the led table with the reported device. Reported-by: Scott Barnes Signed-off-by: Tulio Magno Quites Machado Filho Signed-off-by: John W. 
Linville --- drivers/net/wireless/ath5k/led.c | 2 ++ include/linux/pci_ids.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath5k/led.c b/drivers/net/wireless/ath5k/led.c index 0686e12738b3..19555fb79c9b 100644 --- a/drivers/net/wireless/ath5k/led.c +++ b/drivers/net/wireless/ath5k/led.c @@ -65,6 +65,8 @@ static const struct pci_device_id ath5k_led_devices[] = { { ATH_SDEVICE(PCI_VENDOR_ID_AMBIT, 0x0422), ATH_LED(1, 1) }, /* E-machines E510 (tuliom@gmail.com) */ { ATH_SDEVICE(PCI_VENDOR_ID_AMBIT, 0x0428), ATH_LED(3, 0) }, + /* Acer Extensa 5620z (nekoreeve@gmail.com) */ + { ATH_SDEVICE(PCI_VENDOR_ID_QMI, 0x0105), ATH_LED(3, 0) }, { } }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 097f410edefa..05dfa7c4fb64 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2271,6 +2271,8 @@ #define PCI_DEVICE_ID_KORENIX_JETCARDF0 0x1600 #define PCI_DEVICE_ID_KORENIX_JETCARDF1 0x16ff +#define PCI_VENDOR_ID_QMI 0x1a32 + #define PCI_VENDOR_ID_TEKRAM 0x1de1 #define PCI_DEVICE_ID_TEKRAM_DC290 0xdc29 -- cgit v1.2.3-71-gd317 From 512a004382f2c60d5c4f855476ba965adc00250c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 27 Mar 2009 22:14:27 -0400 Subject: ext3: Use WRITE_SYNC for commits which are caused by fsync() If a commit is triggered by fsync(), set a flag indicating the journal blocks associated with the transaction should be flushed out using WRITE_SYNC. Signed-off-by: "Theodore Ts'o" Acked-by: Jan Kara --- fs/jbd/commit.c | 23 +++++++++++++++-------- fs/jbd/transaction.c | 2 ++ include/linux/jbd.h | 5 +++++ 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 3fbffb1ea714..f8077b9c8981 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. 
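/*
 * Illustrative sketch, not part of this patch: the user-visible trigger
 * for the WRITE_SYNC path added below.  An fsync() on an ext3 file
 * forces a commit with handle->h_sync set, which (see the journal_stop()
 * hunk further down) marks t_synchronous_commit, so the journal I/O for
 * that commit is submitted with WRITE_SYNC instead of plain WRITE.  The
 * file path is an arbitrary example.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        static const char buf[] = "data that must reach the journal now\n";
        int fd = open("/mnt/ext3/important", O_WRONLY | O_CREAT | O_TRUNC, 0644);

        if (fd < 0)
                return 1;
        if (write(fd, buf, strlen(buf)) != (ssize_t)strlen(buf))
                return 1;
        if (fsync(fd))          /* this commit is now tagged synchronous */
                return 1;
        return close(fd) ? 1 : 0;
}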
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal, return (ret == -EIO); } -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) +static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, + int write_op) { int i; for (i = 0; i < bufs; i++) { wbuf[i]->b_end_io = end_buffer_write_sync; /* We use-up our safety reference in submit_bh() */ - submit_bh(WRITE, wbuf[i]); + submit_bh(write_op, wbuf[i]); } } @@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) * Submit all the data buffers to disk */ static int journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) + transaction_t *commit_transaction, + int write_op) { struct journal_head *jh; struct buffer_head *bh; @@ -225,7 +228,7 @@ write_out_data: BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); + journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; lock_buffer(bh); spin_lock(&journal->j_list_lock); @@ -256,7 +259,7 @@ write_out_data: jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; } @@ -286,7 +289,7 @@ write_out_data: } } spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + journal_do_submit_data(wbuf, bufs, write_op); return err; } @@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal) int first_tag = 0; int tag_flag; int i; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -347,6 +351,8 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + if (commit_transaction->t_synchronous_commit) + write_op = WRITE_SYNC; spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -431,7 +437,8 @@ void journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_data_buffers(journal, commit_transaction, + write_op); /* * Wait for all previously submitted IO to complete. @@ -660,7 +667,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE, bh); + submit_bh(write_op, bh); } cond_resched(); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index e6a117431277..ed886e6db399 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle) } } + if (handle->h_sync) + transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 64246dce5663..2c6943152c21 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -552,6 +552,11 @@ struct transaction_s */ int t_handle_count; + /* + * This transaction is being forced and some process is + * waiting for it to finish. 
+ */ + int t_synchronous_commit:1; }; /** -- cgit v1.2.3-71-gd317 From 7faa144a518c456e2057918f030f50100144ccc6 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Fri, 27 Mar 2009 22:23:52 -0400 Subject: ACPI: battery: add power_{now,avg} properties to power_class ACPI has smart batteries, which work in units of energy and measure rate of (dis)charge as power, thus it is not appropriate to export it as a current_now. Current_now will still be exported to allow for userland applications to match. Signed-off-by: Alexey Starikovskiy Signed-off-by: Len Brown --- drivers/acpi/battery.c | 12 +++++++----- drivers/acpi/sbs.c | 27 ++++++++++++++++----------- drivers/power/power_supply_sysfs.c | 2 ++ include/linux/power_supply.h | 2 ++ 4 files changed, 27 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 69cbc57c2d1c..09a2240d5605 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -92,7 +92,7 @@ struct acpi_battery { #endif struct acpi_device *device; unsigned long update_time; - int current_now; + int rate_now; int capacity_now; int voltage_now; int design_capacity; @@ -196,7 +196,8 @@ static int acpi_battery_get_property(struct power_supply *psy, val->intval = battery->voltage_now * 1000; break; case POWER_SUPPLY_PROP_CURRENT_NOW: - val->intval = battery->current_now * 1000; + case POWER_SUPPLY_PROP_POWER_NOW: + val->intval = battery->rate_now * 1000; break; case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN: @@ -247,6 +248,7 @@ static enum power_supply_property energy_battery_props[] = { POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN, POWER_SUPPLY_PROP_VOLTAGE_NOW, POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_POWER_NOW, POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN, POWER_SUPPLY_PROP_ENERGY_FULL, POWER_SUPPLY_PROP_ENERGY_NOW, @@ -273,7 +275,7 @@ struct acpi_offsets { static struct acpi_offsets state_offsets[] = { {offsetof(struct acpi_battery, state), 0}, - {offsetof(struct acpi_battery, current_now), 0}, + {offsetof(struct acpi_battery, rate_now), 0}, {offsetof(struct acpi_battery, capacity_now), 0}, {offsetof(struct acpi_battery, voltage_now), 0}, }; @@ -605,11 +607,11 @@ static int acpi_battery_print_state(struct seq_file *seq, int result) else seq_printf(seq, "charging state: charged\n"); - if (battery->current_now == ACPI_BATTERY_VALUE_UNKNOWN) + if (battery->rate_now == ACPI_BATTERY_VALUE_UNKNOWN) seq_printf(seq, "present rate: unknown\n"); else seq_printf(seq, "present rate: %d %s\n", - battery->current_now, acpi_battery_units(battery)); + battery->rate_now, acpi_battery_units(battery)); if (battery->capacity_now == ACPI_BATTERY_VALUE_UNKNOWN) seq_printf(seq, "remaining capacity: unknown\n"); diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 6050ce481873..3963cb6e0f19 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -102,8 +102,8 @@ struct acpi_battery { u16 cycle_count; u16 temp_now; u16 voltage_now; - s16 current_now; - s16 current_avg; + s16 rate_now; + s16 rate_avg; u16 capacity_now; u16 state_of_charge; u16 state; @@ -202,9 +202,9 @@ static int acpi_sbs_battery_get_property(struct power_supply *psy, return -ENODEV; switch (psp) { case POWER_SUPPLY_PROP_STATUS: - if (battery->current_now < 0) + if (battery->rate_now < 0) val->intval = POWER_SUPPLY_STATUS_DISCHARGING; - else if (battery->current_now > 0) + else if (battery->rate_now > 0) val->intval = POWER_SUPPLY_STATUS_CHARGING; else val->intval = POWER_SUPPLY_STATUS_FULL; @@ -224,11 +224,13 @@ 
static int acpi_sbs_battery_get_property(struct power_supply *psy, acpi_battery_vscale(battery) * 1000; break; case POWER_SUPPLY_PROP_CURRENT_NOW: - val->intval = abs(battery->current_now) * + case POWER_SUPPLY_PROP_POWER_NOW: + val->intval = abs(battery->rate_now) * acpi_battery_ipscale(battery) * 1000; break; case POWER_SUPPLY_PROP_CURRENT_AVG: - val->intval = abs(battery->current_avg) * + case POWER_SUPPLY_PROP_POWER_AVG: + val->intval = abs(battery->rate_avg) * acpi_battery_ipscale(battery) * 1000; break; case POWER_SUPPLY_PROP_CAPACITY: @@ -293,6 +295,8 @@ static enum power_supply_property sbs_energy_battery_props[] = { POWER_SUPPLY_PROP_VOLTAGE_NOW, POWER_SUPPLY_PROP_CURRENT_NOW, POWER_SUPPLY_PROP_CURRENT_AVG, + POWER_SUPPLY_PROP_POWER_NOW, + POWER_SUPPLY_PROP_POWER_AVG, POWER_SUPPLY_PROP_CAPACITY, POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN, POWER_SUPPLY_PROP_ENERGY_FULL, @@ -301,6 +305,7 @@ static enum power_supply_property sbs_energy_battery_props[] = { POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, }; + #endif /* -------------------------------------------------------------------------- @@ -330,8 +335,8 @@ static struct acpi_battery_reader info_readers[] = { static struct acpi_battery_reader state_readers[] = { {0x08, SMBUS_READ_WORD, offsetof(struct acpi_battery, temp_now)}, {0x09, SMBUS_READ_WORD, offsetof(struct acpi_battery, voltage_now)}, - {0x0a, SMBUS_READ_WORD, offsetof(struct acpi_battery, current_now)}, - {0x0b, SMBUS_READ_WORD, offsetof(struct acpi_battery, current_avg)}, + {0x0a, SMBUS_READ_WORD, offsetof(struct acpi_battery, rate_now)}, + {0x0b, SMBUS_READ_WORD, offsetof(struct acpi_battery, rate_avg)}, {0x0f, SMBUS_READ_WORD, offsetof(struct acpi_battery, capacity_now)}, {0x0e, SMBUS_READ_WORD, offsetof(struct acpi_battery, state_of_charge)}, {0x16, SMBUS_READ_WORD, offsetof(struct acpi_battery, state)}, @@ -589,9 +594,9 @@ static int acpi_battery_read_state(struct seq_file *seq, void *offset) seq_printf(seq, "capacity state: %s\n", (battery->state & 0x0010) ? "critical" : "ok"); seq_printf(seq, "charging state: %s\n", - (battery->current_now < 0) ? "discharging" : - ((battery->current_now > 0) ? "charging" : "charged")); - rate = abs(battery->current_now) * acpi_battery_ipscale(battery); + (battery->rate_now < 0) ? "discharging" : + ((battery->rate_now > 0) ? 
"charging" : "charged")); + rate = abs(battery->rate_now) * acpi_battery_ipscale(battery); rate *= (acpi_battery_mode(battery))?(battery->voltage_now * acpi_battery_vscale(battery)/1000):1; seq_printf(seq, "present rate: %d%s\n", rate, diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c index ac01e06817fb..da73591017f9 100644 --- a/drivers/power/power_supply_sysfs.c +++ b/drivers/power/power_supply_sysfs.c @@ -93,6 +93,8 @@ static struct device_attribute power_supply_attrs[] = { POWER_SUPPLY_ATTR(voltage_avg), POWER_SUPPLY_ATTR(current_now), POWER_SUPPLY_ATTR(current_avg), + POWER_SUPPLY_ATTR(power_now), + POWER_SUPPLY_ATTR(power_avg), POWER_SUPPLY_ATTR(charge_full_design), POWER_SUPPLY_ATTR(charge_empty_design), POWER_SUPPLY_ATTR(charge_full), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 8ff25e0e7f7a..594c494ac3f0 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -73,6 +73,8 @@ enum power_supply_property { POWER_SUPPLY_PROP_VOLTAGE_AVG, POWER_SUPPLY_PROP_CURRENT_NOW, POWER_SUPPLY_PROP_CURRENT_AVG, + POWER_SUPPLY_PROP_POWER_NOW, + POWER_SUPPLY_PROP_POWER_AVG, POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN, POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN, POWER_SUPPLY_PROP_CHARGE_FULL, -- cgit v1.2.3-71-gd317 From 8651d5c0b1f874c5b8307ae2b858bc40f9f02482 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 27 Mar 2009 17:10:48 -0400 Subject: lsm: Remove the socket_post_accept() hook The socket_post_accept() hook is not currently used by any in-tree modules and its existence continues to cause problems by confusing people about what can be safely accomplished using this hook. If a legitimate need for this hook arises in the future it can always be reintroduced. Signed-off-by: Paul Moore Signed-off-by: James Morris --- include/linux/security.h | 13 ------------- net/socket.c | 2 -- security/capability.c | 5 ----- security/security.c | 5 ----- 4 files changed, 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 1f2ab6353c00..54ed15799a83 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -880,11 +880,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @sock contains the listening socket structure. * @newsock contains the newly created server socket for connection. * Return 0 if permission is granted. - * @socket_post_accept: - * This hook allows a security module to copy security - * information into the newly created socket's inode. - * @sock contains the listening socket structure. - * @newsock contains the newly created server socket for connection. * @socket_sendmsg: * Check permission before transmitting a message to another socket. * @sock contains the socket structure. 
@@ -1554,8 +1549,6 @@ struct security_operations { struct sockaddr *address, int addrlen); int (*socket_listen) (struct socket *sock, int backlog); int (*socket_accept) (struct socket *sock, struct socket *newsock); - void (*socket_post_accept) (struct socket *sock, - struct socket *newsock); int (*socket_sendmsg) (struct socket *sock, struct msghdr *msg, int size); int (*socket_recvmsg) (struct socket *sock, @@ -2537,7 +2530,6 @@ int security_socket_bind(struct socket *sock, struct sockaddr *address, int addr int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen); int security_socket_listen(struct socket *sock, int backlog); int security_socket_accept(struct socket *sock, struct socket *newsock); -void security_socket_post_accept(struct socket *sock, struct socket *newsock); int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size); int security_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags); @@ -2616,11 +2608,6 @@ static inline int security_socket_accept(struct socket *sock, return 0; } -static inline void security_socket_post_accept(struct socket *sock, - struct socket *newsock) -{ -} - static inline int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { diff --git a/net/socket.c b/net/socket.c index 0b14b79c03af..91d0c0254ffe 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1536,8 +1536,6 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, fd_install(newfd, newfile); err = newfd; - security_socket_post_accept(sock, newsock); - out_put: fput_light(sock->file, fput_needed); out: diff --git a/security/capability.c b/security/capability.c index c545bd1300b5..21b6cead6a8e 100644 --- a/security/capability.c +++ b/security/capability.c @@ -620,10 +620,6 @@ static int cap_socket_accept(struct socket *sock, struct socket *newsock) return 0; } -static void cap_socket_post_accept(struct socket *sock, struct socket *newsock) -{ -} - static int cap_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return 0; @@ -1014,7 +1010,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, socket_connect); set_to_cap_if_null(ops, socket_listen); set_to_cap_if_null(ops, socket_accept); - set_to_cap_if_null(ops, socket_post_accept); set_to_cap_if_null(ops, socket_sendmsg); set_to_cap_if_null(ops, socket_recvmsg); set_to_cap_if_null(ops, socket_getsockname); diff --git a/security/security.c b/security/security.c index c3586c0d97e2..206e53844d2f 100644 --- a/security/security.c +++ b/security/security.c @@ -1007,11 +1007,6 @@ int security_socket_accept(struct socket *sock, struct socket *newsock) return security_ops->socket_accept(sock, newsock); } -void security_socket_post_accept(struct socket *sock, struct socket *newsock) -{ - security_ops->socket_post_accept(sock, newsock); -} - int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return security_ops->socket_sendmsg(sock, msg, size); -- cgit v1.2.3-71-gd317 From efb3288b423d7e3533a68dccecaa05a56a281a4e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:43 -0400 Subject: SUNRPC: Clean up static inline functions in svc_xprt.h Clean up: Enable the use of const arguments in higher level svc_ APIs by adding const to the arguments of the helper functions in svc_xprt.h Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svc_xprt.h | 46 +++++++++++++++++++++++------------------ 1 file changed, 26 
insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0127daca4354..959b931b6053 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -88,29 +88,32 @@ static inline void svc_xprt_get(struct svc_xprt *xprt) kref_get(&xprt->xpt_ref); } static inline void svc_xprt_set_local(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_local, sa, salen); xprt->xpt_locallen = salen; } static inline void svc_xprt_set_remote(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_remote, sa, salen); xprt->xpt_remotelen = salen; } -static inline unsigned short svc_addr_port(struct sockaddr *sa) +static inline unsigned short svc_addr_port(const struct sockaddr *sa) { - unsigned short ret = 0; + const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; + switch (sa->sa_family) { case AF_INET: - ret = ntohs(((struct sockaddr_in *)sa)->sin_port); - break; + return ntohs(sin->sin_port); case AF_INET6: - ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); - break; + return ntohs(sin6->sin6_port); } - return ret; + + return 0; } static inline size_t svc_addr_len(struct sockaddr *sa) @@ -124,36 +127,39 @@ static inline size_t svc_addr_len(struct sockaddr *sa) return -EAFNOSUPPORT; } -static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_local); + return svc_addr_port((const struct sockaddr *)&xprt->xpt_local); } -static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); + return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote); } -static inline char *__svc_print_addr(struct sockaddr *addr, - char *buf, size_t len) +static inline char *__svc_print_addr(const struct sockaddr *addr, + char *buf, const size_t len) { + const struct sockaddr_in *sin = (const struct sockaddr_in *)addr; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr; + switch (addr->sa_family) { case AF_INET: - snprintf(buf, len, "%pI4, port=%u", - &((struct sockaddr_in *)addr)->sin_addr, - ntohs(((struct sockaddr_in *) addr)->sin_port)); + snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr, + ntohs(sin->sin_port)); break; case AF_INET6: snprintf(buf, len, "%pI6, port=%u", - &((struct sockaddr_in6 *)addr)->sin6_addr, - ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); + &sin6->sin6_addr, + ntohs(sin6->sin6_port)); break; default: snprintf(buf, len, "unknown address type: %d", addr->sa_family); break; } + return buf; } #endif /* SUNRPC_SVC_XPRT_H */ -- cgit v1.2.3-71-gd317 From 156e62094a74cf43f02f56ef96b6cda567501357 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:58 -0400 Subject: SUNRPC: Clean up svc_find_xprt() calling sequence Clean up: add documentating comment and use appropriate data types for svc_find_xprt()'s arguments. This also eliminates a mixed sign comparison: @port was an int, while the return value of svc_xprt_local_port() is an unsigned short. 
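To show how the tightened prototype reads at a call site, here is a minimal hypothetical caller (the helper name serv_has_tcp_listener is invented for illustration and is not part of this patch; it assumes only svc_find_xprt() and svc_xprt_put() as declared in svc_xprt.h):

#include <linux/socket.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>

/* Hypothetical caller, not part of this patch: with the cleaned-up types,
 * a wild-card lookup for a TCP listener needs no casts. */
static bool serv_has_tcp_listener(struct svc_serv *serv)
{
	struct svc_xprt *xprt;

	/* AF_UNSPEC and port 0 act as wild-cards, per the new comment. */
	xprt = svc_find_xprt(serv, "tcp", AF_UNSPEC, 0);
	if (xprt == NULL)
		return false;

	svc_xprt_put(xprt);	/* svc_find_xprt() takes a reference */
	return true;
}

Because svc_find_xprt() grabs a reference on the transport it returns, the caller drops it with svc_xprt_put(), as the lockd call site in a later patch in this series does.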
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc_xprt.c | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 959b931b6053..55b68582c5d9 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -80,7 +80,8 @@ void svc_close_xprt(struct svc_xprt *xprt); void svc_delete_xprt(struct svc_xprt *xprt); int svc_port_is_privileged(struct sockaddr *sin); int svc_print_xprts(char *buf, int maxlen); -struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int); +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + const sa_family_t af, const unsigned short port); int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); static inline void svc_xprt_get(struct svc_xprt *xprt) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e588df5d6b34..c947c93dbc24 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1033,7 +1033,13 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) return dr; } -/* +/** + * svc_find_xprt - find an RPC transport instance + * @serv: pointer to svc_serv to search + * @xcl_name: C string containing transport's class name + * @af: Address family of transport's local address + * @port: transport's IP port number + * * Return the transport instance pointer for the endpoint accepting * connections/peer traffic from the specified transport class, * address family and port. @@ -1042,14 +1048,14 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) * wild-card, and will result in matching the first transport in the * service's list that has a matching class name. */ -struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, - int af, int port) +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + const sa_family_t af, const unsigned short port) { struct svc_xprt *xprt; struct svc_xprt *found = NULL; /* Sanity check the args */ - if (!serv || !xcl_name) + if (serv == NULL || xcl_name == NULL) return found; spin_lock_bh(&serv->sv_lock); @@ -1058,7 +1064,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, continue; if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) continue; - if (port && port != svc_xprt_local_port(xprt)) + if (port != 0 && port != svc_xprt_local_port(xprt)) continue; found = xprt; svc_xprt_get(xprt); -- cgit v1.2.3-71-gd317 From 4b62e58cccff9c5e7ffc7023f7ec24c75fbd549b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:06 -0400 Subject: SUNRPC: Pass a family argument to svc_register() The sv_family field is going away. Instead of using sv_family, have the svc_register() function take a protocol family argument. Since this argument represents a protocol family, and not an address family, this argument takes an int, as this is what is passed to sock_create_kern(). Also make sure svc_register's helpers are checking for PF_FOO instead of AF_FOO. The value of [AP]F_FOO are equivalent; this is simply a symbolic change to reflect the semantics of the value stored in that variable. sock_create_kern() should return EPFNOSUPPORT if the passed-in protocol family isn't supported, but it uses EAFNOSUPPORT for this case. We will stick with that tradition here, as svc_register() is called by the RPC server in the same path as sock_create_kern(). 
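A short sketch of what the symbolic PF_ change and the new argument order look like from a caller's point of view (example_register and its port argument are hypothetical; only the svc_register() prototype from this patch is assumed):

#include <linux/kernel.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/sunrpc/svc.h>

/* Illustrative sketch, not from this patch: the PF_ and AF_ constants share
 * values, so using PF_ here is symbolic; the caller now names the protocol
 * family explicitly when registering with the local portmapper. */
static int example_register(const struct svc_serv *serv, unsigned short port)
{
	/* The kernel defines PF_INET as AF_INET, so this never fires. */
	BUILD_BUG_ON(PF_INET != AF_INET);

	/* New synopsis: protocol family first, then protocol and port. */
	return svc_register(serv, PF_INET, IPPROTO_TCP, port);
}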
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/svc.h | 4 ++-- net/sunrpc/svc.c | 21 +++++++++++---------- net/sunrpc/svcsock.c | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3435d24bfe55..1f18fc728cba 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -396,8 +396,8 @@ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); -int svc_register(const struct svc_serv *, const unsigned short, - const unsigned short); +int svc_register(const struct svc_serv *, const int, + const unsigned short, const unsigned short); void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c51fed4d1af1..41bc36ea2224 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -800,17 +800,17 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, * if any error occurs. */ static int __svc_register(const u32 program, const u32 version, - const sa_family_t family, + const int family, const unsigned short protocol, const unsigned short port) { int error; switch (family) { - case AF_INET: + case PF_INET: return __svc_rpcb_register4(program, version, protocol, port); - case AF_INET6: + case PF_INET6: error = __svc_rpcb_register6(program, version, protocol, port); if (error < 0) @@ -840,11 +840,11 @@ static int __svc_register(const u32 program, const u32 version, * if any error occurs. */ static int __svc_register(const u32 program, const u32 version, - sa_family_t family, + const int family, const unsigned short protocol, const unsigned short port) { - if (family != AF_INET) + if (family != PF_INET) return -EAFNOSUPPORT; return rpcb_register(program, version, protocol, port); @@ -855,13 +855,14 @@ static int __svc_register(const u32 program, const u32 version, /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register + * @family: protocol family of service's listener socket * @proto: transport protocol number to advertise * @port: port to advertise * - * Service is registered for any address in serv's address family + * Service is registered for any address in the passed-in protocol family */ -int svc_register(const struct svc_serv *serv, const unsigned short proto, - const unsigned short port) +int svc_register(const struct svc_serv *serv, const int family, + const unsigned short proto, const unsigned short port) { struct svc_program *progp; unsigned int i; @@ -879,7 +880,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, i, proto == IPPROTO_UDP? "udp" : "tcp", port, - serv->sv_family, + family, progp->pg_vers[i]->vs_hidden? 
" (but not telling portmap)" : ""); @@ -887,7 +888,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, continue; error = __svc_register(progp->pg_prog, i, - serv->sv_family, proto, port); + family, proto, port); if (error < 0) break; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5763e6460fea..d00583c1cd04 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1122,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Register socket with portmapper */ if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, inet->sk_protocol, + *errp = svc_register(serv, serv->sv_family, inet->sk_protocol, ntohs(inet_sk(inet)->sport)); if (*errp < 0) { -- cgit v1.2.3-71-gd317 From 9652ada3fb5914a67d8422114e8a76388330fa79 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:21 -0400 Subject: SUNRPC: Change svc_create_xprt() to take a @family argument The sv_family field is going away. Pass a protocol family argument to svc_create_xprt() instead of extracting the family from the passed-in svc_serv struct. Again, as this is a listener socket and not an address, we make this new argument an "int" protocol family, instead of an "sa_family_t." Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 3 ++- fs/nfs/callback.c | 4 ++-- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 4 ++-- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc_xprt.c | 15 +++++++++------ 6 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 64f1c31b5853..390c5593655c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -211,7 +211,8 @@ static int create_lockd_listener(struct svc_serv *serv, char *name, xprt = svc_find_xprt(serv, name, 0, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); + return svc_create_xprt(serv, name, nlmsvc_family, + port, SVC_SOCK_DEFAULTS); svc_xprt_put(xprt); return 0; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 3e634f2a1083..fb35cab63c8a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -122,8 +122,8 @@ int nfs_callback_up(void) if (!serv) goto out_err; - ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, - SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", nfs_callback_family, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; nfs_callback_tcpport = ret; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5a936c14f6ff..a4ed8644d69c 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -943,7 +943,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) err = nfsd_create_serv(); if (!err) { err = svc_create_xprt(nfsd_serv, - transport, port, + transport, PF_INET, port, SVC_SOCK_ANONYMOUS); if (err == -ENOENT) /* Give a reasonable perror msg for diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 07e4f5d7baa8..ab7f249055b5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -244,7 +244,7 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@ -253,7 +253,7 @@ static int nfsd_init_socks(int port) if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; diff --git 
a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 55b68582c5d9..0d9cb6ef28b0 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -71,7 +71,8 @@ int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, struct svc_serv *); -int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); +int svc_create_xprt(struct svc_serv *, const char *, const int, + const unsigned short, int); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_received(struct svc_xprt *); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c947c93dbc24..2819ee093f36 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -161,7 +161,9 @@ EXPORT_SYMBOL_GPL(svc_xprt_init); static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct svc_serv *serv, - unsigned short port, int flags) + const int family, + const unsigned short port, + int flags) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -176,12 +178,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct sockaddr *sap; size_t len; - switch (serv->sv_family) { - case AF_INET: + switch (family) { + case PF_INET: sap = (struct sockaddr *)&sin; len = sizeof(sin); break; - case AF_INET6: + case PF_INET6: sap = (struct sockaddr *)&sin6; len = sizeof(sin6); break; @@ -192,7 +194,8 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, sap, len, flags); } -int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, +int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, + const int family, const unsigned short port, int flags) { struct svc_xprt_class *xcl; @@ -209,7 +212,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, port, flags); + newxprt = __svc_xpo_create(xcl, serv, family, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); -- cgit v1.2.3-71-gd317 From 49a9072f29a1039f142ec98b44a72d7173651c02 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:29 -0400 Subject: SUNRPC: Remove @family argument from svc_create() and svc_create_pooled() Since an RPC service listener's protocol family is specified now via svc_create_xprt(), it no longer needs to be passed to svc_create() or svc_create_pooled(). Remove that argument from the synopsis of those functions, and remove the sv_family field from the svc_serv struct. 
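The calling sequence this series converges on can be sketched as follows (example_service_start is a hypothetical service, and the 4096 bufsize is arbitrary; the prototypes of svc_create(), svc_create_xprt(), and svc_destroy() are as changed by these patches):

#include <linux/socket.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/sunrpc/svcsock.h>

/* Hypothetical sketch: the svc_serv is now family-agnostic, and each
 * listener names its own protocol family. */
static struct svc_serv *example_service_start(struct svc_program *prog,
					      unsigned short port)
{
	struct svc_serv *serv;
	int err;

	serv = svc_create(prog, 4096, NULL);	/* no family argument any more */
	if (serv == NULL)
		return NULL;

	/* Protocol family is chosen per listener instead. */
	err = svc_create_xprt(serv, "tcp", PF_INET, port, SVC_SOCK_DEFAULTS);
	if (err < 0) {
		svc_destroy(serv);
		return NULL;
	}
	return serv;
}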
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 2 +- fs/nfs/callback.c | 3 +-- fs/nfsd/nfssvc.c | 1 - include/linux/sunrpc/svc.h | 5 ++--- net/sunrpc/svc.c | 11 +++++------ 5 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 390c5593655c..d30920038cb6 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -275,7 +275,7 @@ int lockd_up(void) "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index fb35cab63c8a..ddf4b4ae6967 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -116,8 +116,7 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, - nfs_callback_family, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); ret = -ENOMEM; if (!serv) goto out_err; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ab7f249055b5..bc3567bab8c4 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -229,7 +229,6 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 1f18fc728cba..d3a4c0231933 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -69,7 +69,6 @@ struct svc_serv { struct list_head sv_tempsocks; /* all temporary sockets */ int sv_tmpcnt; /* count of temporary sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ - sa_family_t sv_family; /* listener's address family */ char * sv_name; /* service name */ @@ -385,13 +384,13 @@ struct svc_procedure { /* * Function prototypes. 
*/ -struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t, +struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - sa_family_t, void (*shutdown)(struct svc_serv *), + void (*shutdown)(struct svc_serv *), svc_thread_fn, struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 41bc36ea2224..d72ff44826d8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -359,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@ -368,7 +368,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; - serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -427,21 +426,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, shutdown); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv), + void (*shutdown)(struct svc_serv *serv), svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, family, shutdown); + serv = __svc_create(prog, bufsize, npools, shutdown); if (serv != NULL) { serv->sv_function = func; -- cgit v1.2.3-71-gd317 From d2dd14ac1847082d4bb955619e86ed315c0ecd20 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Sat, 28 Mar 2009 21:34:41 +0100 Subject: i2c-nforce2: Add support for MCP67, MCP73, MCP78S and MCP79 The MCP78S and MCP79 appear to be compatible with the previous nForce chips as far as the SMBus controller is concerned. The MCP67 and MCP73 were not tested yet but I'd be very surprised if they weren't compatible too. 
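The reason a compatible SMBus block needs nothing more than new ID-table entries is the PCI match table: the core binds every listed device to the same probe routine. A generic illustration (example_smbus_ids is a made-up table name, not the driver's; the device IDs are the ones added by this patch):

#include <linux/module.h>
#include <linux/pci.h>

/* Generic illustration only: any device listed here is handed to the same
 * probe routine, so compatible chips need no new code. */
static const struct pci_device_id example_smbus_ids[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, 0x0542) },	/* MCP67 SMBus */
	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, 0x0AA2) },	/* MCP79 SMBus */
	{ 0 }
};
MODULE_DEVICE_TABLE(pci, example_smbus_ids);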
Signed-off-by: Jean Delvare Cc: Oleg Ryjkov Cc: Malcolm Lalkaka Cc: Zbigniew Luszpinski --- Documentation/i2c/busses/i2c-nforce2 | 12 ++++++++---- drivers/i2c/busses/i2c-nforce2.c | 12 ++++++++++-- include/linux/pci_ids.h | 4 ++++ 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/i2c/busses/i2c-nforce2 b/Documentation/i2c/busses/i2c-nforce2 index fae3495bcbaf..9698c396b830 100644 --- a/Documentation/i2c/busses/i2c-nforce2 +++ b/Documentation/i2c/busses/i2c-nforce2 @@ -7,10 +7,14 @@ Supported adapters: * nForce3 250Gb MCP 10de:00E4 * nForce4 MCP 10de:0052 * nForce4 MCP-04 10de:0034 - * nForce4 MCP51 10de:0264 - * nForce4 MCP55 10de:0368 - * nForce4 MCP61 10de:03EB - * nForce4 MCP65 10de:0446 + * nForce MCP51 10de:0264 + * nForce MCP55 10de:0368 + * nForce MCP61 10de:03EB + * nForce MCP65 10de:0446 + * nForce MCP67 10de:0542 + * nForce MCP73 10de:07D8 + * nForce MCP78S 10de:0752 + * nForce MCP79 10de:0AA2 Datasheet: not publicly available, but seems to be similar to the AMD-8111 SMBus 2.0 adapter. diff --git a/drivers/i2c/busses/i2c-nforce2.c b/drivers/i2c/busses/i2c-nforce2.c index 05af6cd7f270..2ff4683703a8 100644 --- a/drivers/i2c/busses/i2c-nforce2.c +++ b/drivers/i2c/busses/i2c-nforce2.c @@ -31,10 +31,14 @@ nForce3 250Gb MCP 00E4 nForce4 MCP 0052 nForce4 MCP-04 0034 - nForce4 MCP51 0264 - nForce4 MCP55 0368 + nForce MCP51 0264 + nForce MCP55 0368 nForce MCP61 03EB nForce MCP65 0446 + nForce MCP67 0542 + nForce MCP73 07D8 + nForce MCP78S 0752 + nForce MCP79 0AA2 This driver supports the 2 SMBuses that are included in the MCP of the nForce2/3/4/5xx chipsets. @@ -315,6 +319,10 @@ static struct pci_device_id nforce2_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP78S_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP79_SMBUS) }, { 0 } }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 05dfa7c4fb64..5109fecde284 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1237,6 +1237,7 @@ #define PCI_DEVICE_ID_NVIDIA_NVENET_21 0x0451 #define PCI_DEVICE_ID_NVIDIA_NVENET_22 0x0452 #define PCI_DEVICE_ID_NVIDIA_NVENET_23 0x0453 +#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_SMBUS 0x0542 #define PCI_DEVICE_ID_NVIDIA_NVENET_24 0x054C #define PCI_DEVICE_ID_NVIDIA_NVENET_25 0x054D #define PCI_DEVICE_ID_NVIDIA_NVENET_26 0x054E @@ -1247,11 +1248,14 @@ #define PCI_DEVICE_ID_NVIDIA_NVENET_31 0x07DF #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE 0x0560 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE 0x056C +#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP78S_SMBUS 0x0752 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE 0x0759 #define PCI_DEVICE_ID_NVIDIA_NVENET_32 0x0760 #define PCI_DEVICE_ID_NVIDIA_NVENET_33 0x0761 #define PCI_DEVICE_ID_NVIDIA_NVENET_34 0x0762 #define PCI_DEVICE_ID_NVIDIA_NVENET_35 0x0763 +#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_SMBUS 0x07D8 +#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP79_SMBUS 0x0AA2 #define PCI_DEVICE_ID_NVIDIA_NVENET_36 0x0AB0 #define PCI_DEVICE_ID_NVIDIA_NVENET_37 0x0AB1 #define PCI_DEVICE_ID_NVIDIA_NVENET_38 0x0AB2 -- cgit v1.2.3-71-gd317 From 
eff9ec95efaaf6b12d230f0ea7d3c295d3bc9d57 Mon Sep 17 00:00:00 2001 From: Marco Aurelio da Costa Date: Sat, 28 Mar 2009 21:34:44 +0100 Subject: i2c-algo-pca: Add PCA9665 support Add support for the PCA9665 I2C controller. Signed-off-by: Wolfram Sang Signed-off-by: Jean Delvare --- drivers/i2c/algos/i2c-algo-pca.c | 180 +++++++++++++++++++++++++++++++--- drivers/i2c/busses/Kconfig | 8 +- drivers/i2c/busses/i2c-pca-isa.c | 14 ++- drivers/i2c/busses/i2c-pca-platform.c | 9 +- include/linux/i2c-algo-pca.h | 33 ++++++- 5 files changed, 216 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/algos/i2c-algo-pca.c b/drivers/i2c/algos/i2c-algo-pca.c index 943d70ee5d59..a8e51bd1a4f5 100644 --- a/drivers/i2c/algos/i2c-algo-pca.c +++ b/drivers/i2c/algos/i2c-algo-pca.c @@ -46,6 +46,14 @@ static int i2c_debug; #define pca_wait(adap) adap->wait_for_completion(adap->data) #define pca_reset(adap) adap->reset_chip(adap->data) +static void pca9665_reset(void *pd) +{ + struct i2c_algo_pca_data *adap = pd; + pca_outw(adap, I2C_PCA_INDPTR, I2C_PCA_IPRESET); + pca_outw(adap, I2C_PCA_IND, 0xA5); + pca_outw(adap, I2C_PCA_IND, 0x5A); +} + /* * Generate a start condition on the i2c bus. * @@ -333,27 +341,171 @@ static const struct i2c_algorithm pca_algo = { .functionality = pca_func, }; -static int pca_init(struct i2c_adapter *adap) +static unsigned int pca_probe_chip(struct i2c_adapter *adap) { - static int freqs[] = {330,288,217,146,88,59,44,36}; - int clock; struct i2c_algo_pca_data *pca_data = adap->algo_data; - - if (pca_data->i2c_clock > 7) { - printk(KERN_WARNING "%s: Invalid I2C clock speed selected. Trying default.\n", - adap->name); - pca_data->i2c_clock = I2C_PCA_CON_59kHz; + /* The trick here is to check if there is an indirect register + * available. If there is one, we will read the value we first + * wrote on I2C_PCA_IADR. Otherwise, we will read the last value + * we wrote on I2C_PCA_ADR + */ + pca_outw(pca_data, I2C_PCA_INDPTR, I2C_PCA_IADR); + pca_outw(pca_data, I2C_PCA_IND, 0xAA); + pca_outw(pca_data, I2C_PCA_INDPTR, I2C_PCA_ITO); + pca_outw(pca_data, I2C_PCA_IND, 0x00); + pca_outw(pca_data, I2C_PCA_INDPTR, I2C_PCA_IADR); + if (pca_inw(pca_data, I2C_PCA_IND) == 0xAA) { + printk(KERN_INFO "%s: PCA9665 detected.\n", adap->name); + return I2C_PCA_CHIP_9665; + } else { + printk(KERN_INFO "%s: PCA9564 detected.\n", adap->name); + return I2C_PCA_CHIP_9564; } +} + +static int pca_init(struct i2c_adapter *adap) +{ + struct i2c_algo_pca_data *pca_data = adap->algo_data; adap->algo = &pca_algo; - pca_reset(pca_data); + if (pca_probe_chip(adap) == I2C_PCA_CHIP_9564) { + static int freqs[] = {330, 288, 217, 146, 88, 59, 44, 36}; + int clock; + + if (pca_data->i2c_clock > 7) { + switch (pca_data->i2c_clock) { + case 330000: + pca_data->i2c_clock = I2C_PCA_CON_330kHz; + break; + case 288000: + pca_data->i2c_clock = I2C_PCA_CON_288kHz; + break; + case 217000: + pca_data->i2c_clock = I2C_PCA_CON_217kHz; + break; + case 146000: + pca_data->i2c_clock = I2C_PCA_CON_146kHz; + break; + case 88000: + pca_data->i2c_clock = I2C_PCA_CON_88kHz; + break; + case 59000: + pca_data->i2c_clock = I2C_PCA_CON_59kHz; + break; + case 44000: + pca_data->i2c_clock = I2C_PCA_CON_44kHz; + break; + case 36000: + pca_data->i2c_clock = I2C_PCA_CON_36kHz; + break; + default: + printk(KERN_WARNING + "%s: Invalid I2C clock speed selected." 
+ " Using default 59kHz.\n", adap->name); + pca_data->i2c_clock = I2C_PCA_CON_59kHz; + } + } else { + printk(KERN_WARNING "%s: " + "Choosing the clock frequency based on " + "index is deprecated." + " Use the nominal frequency.\n", adap->name); + } + + pca_reset(pca_data); + + clock = pca_clock(pca_data); + printk(KERN_INFO "%s: Clock frequency is %dkHz\n", + adap->name, freqs[clock]); + + pca_set_con(pca_data, I2C_PCA_CON_ENSIO | clock); + } else { + int clock; + int mode; + int tlow, thi; + /* Values can be found on PCA9665 datasheet section 7.3.2.6 */ + int min_tlow, min_thi; + /* These values are the maximum raise and fall values allowed + * by the I2C operation mode (Standard, Fast or Fast+) + * They are used (added) below to calculate the clock dividers + * of PCA9665. Note that they are slightly different of the + * real maximum, to allow the change on mode exactly on the + * maximum clock rate for each mode + */ + int raise_fall_time; + + struct i2c_algo_pca_data *pca_data = adap->algo_data; + + /* Ignore the reset function from the module, + * we can use the parallel bus reset + */ + pca_data->reset_chip = pca9665_reset; + + if (pca_data->i2c_clock > 1265800) { + printk(KERN_WARNING "%s: I2C clock speed too high." + " Using 1265.8kHz.\n", adap->name); + pca_data->i2c_clock = 1265800; + } + + if (pca_data->i2c_clock < 60300) { + printk(KERN_WARNING "%s: I2C clock speed too low." + " Using 60.3kHz.\n", adap->name); + pca_data->i2c_clock = 60300; + } + + /* To avoid integer overflow, use clock/100 for calculations */ + clock = pca_clock(pca_data) / 100; + + if (pca_data->i2c_clock > 10000) { + mode = I2C_PCA_MODE_TURBO; + min_tlow = 14; + min_thi = 5; + raise_fall_time = 22; /* Raise 11e-8s, Fall 11e-8s */ + } else if (pca_data->i2c_clock > 4000) { + mode = I2C_PCA_MODE_FASTP; + min_tlow = 17; + min_thi = 9; + raise_fall_time = 22; /* Raise 11e-8s, Fall 11e-8s */ + } else if (pca_data->i2c_clock > 1000) { + mode = I2C_PCA_MODE_FAST; + min_tlow = 44; + min_thi = 20; + raise_fall_time = 58; /* Raise 29e-8s, Fall 29e-8s */ + } else { + mode = I2C_PCA_MODE_STD; + min_tlow = 157; + min_thi = 134; + raise_fall_time = 127; /* Raise 29e-8s, Fall 98e-8s */ + } + + /* The minimum clock that respects the thi/tlow = 134/157 is + * 64800 Hz. Below that, we have to fix the tlow to 255 and + * calculate the thi factor. 
+ */ + if (clock < 648) { + tlow = 255; + thi = 1000000 - clock * raise_fall_time; + thi /= (I2C_PCA_OSC_PER * clock) - tlow; + } else { + tlow = (1000000 - clock * raise_fall_time) * min_tlow; + tlow /= I2C_PCA_OSC_PER * clock * (min_thi + min_tlow); + thi = tlow * min_thi / min_tlow; + } + + pca_reset(pca_data); - clock = pca_clock(pca_data); - printk(KERN_INFO "%s: Clock frequency is %dkHz\n", adap->name, - freqs[clock]); + printk(KERN_INFO + "%s: Clock frequency is %dHz\n", adap->name, clock * 100); - pca_set_con(pca_data, I2C_PCA_CON_ENSIO | clock); + pca_outw(pca_data, I2C_PCA_INDPTR, I2C_PCA_IMODE); + pca_outw(pca_data, I2C_PCA_IND, mode); + pca_outw(pca_data, I2C_PCA_INDPTR, I2C_PCA_ISCLL); + pca_outw(pca_data, I2C_PCA_IND, tlow); + pca_outw(pca_data, I2C_PCA_INDPTR, I2C_PCA_ISCLH); + pca_outw(pca_data, I2C_PCA_IND, thi); + + pca_set_con(pca_data, I2C_PCA_CON_ENSIO); + } udelay(500); /* 500 us for oscilator to stabilise */ return 0; @@ -388,7 +540,7 @@ EXPORT_SYMBOL(i2c_pca_add_numbered_bus); MODULE_AUTHOR("Ian Campbell , " "Wolfram Sang "); -MODULE_DESCRIPTION("I2C-Bus PCA9564 algorithm"); +MODULE_DESCRIPTION("I2C-Bus PCA9564/PCA9665 algorithm"); MODULE_LICENSE("GPL"); module_param(i2c_debug, int, 0); diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 7f95905bbb9d..68650643d116 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -617,12 +617,12 @@ config I2C_ELEKTOR will be called i2c-elektor. config I2C_PCA_ISA - tristate "PCA9564 on an ISA bus" + tristate "PCA9564/PCA9665 on an ISA bus" depends on ISA select I2C_ALGOPCA default n help - This driver supports ISA boards using the Philips PCA9564 + This driver supports ISA boards using the Philips PCA9564/PCA9665 parallel bus to I2C bus controller. This driver can also be built as a module. If so, the module @@ -634,11 +634,11 @@ config I2C_PCA_ISA time). If unsure, say N. config I2C_PCA_PLATFORM - tristate "PCA9564 as platform device" + tristate "PCA9564/PCA9665 as platform device" select I2C_ALGOPCA default n help - This driver supports a memory mapped Philips PCA9564 + This driver supports a memory mapped Philips PCA9564/PCA9665 parallel bus to I2C bus controller. This driver can also be built as a module. 
If so, the module diff --git a/drivers/i2c/busses/i2c-pca-isa.c b/drivers/i2c/busses/i2c-pca-isa.c index c420a7c0f3e4..0cc8017b3f64 100644 --- a/drivers/i2c/busses/i2c-pca-isa.c +++ b/drivers/i2c/busses/i2c-pca-isa.c @@ -41,7 +41,7 @@ static int irq = -1; /* Data sheet recommends 59kHz for 100kHz operation due to variation * in the actual clock rate */ -static int clock = I2C_PCA_CON_59kHz; +static int clock = 59000; static wait_queue_head_t pca_wait; @@ -103,7 +103,7 @@ static struct i2c_algo_pca_data pca_isa_data = { static struct i2c_adapter pca_isa_ops = { .owner = THIS_MODULE, .algo_data = &pca_isa_data, - .name = "PCA9564 ISA Adapter", + .name = "PCA9564/PCA9665 ISA Adapter", .timeout = 100, }; @@ -196,7 +196,7 @@ static void __exit pca_isa_exit(void) } MODULE_AUTHOR("Ian Campbell "); -MODULE_DESCRIPTION("ISA base PCA9564 driver"); +MODULE_DESCRIPTION("ISA base PCA9564/PCA9665 driver"); MODULE_LICENSE("GPL"); module_param(base, ulong, 0); @@ -205,7 +205,13 @@ MODULE_PARM_DESC(base, "I/O base address"); module_param(irq, int, 0); MODULE_PARM_DESC(irq, "IRQ"); module_param(clock, int, 0); -MODULE_PARM_DESC(clock, "Clock rate as described in table 1 of PCA9564 datasheet"); +MODULE_PARM_DESC(clock, "Clock rate in hertz.\n\t\t" + "For PCA9564: 330000,288000,217000,146000," + "88000,59000,44000,36000\n" + "\t\tFor PCA9665:\tStandard: 60300 - 100099\n" + "\t\t\t\tFast: 100100 - 400099\n" + "\t\t\t\tFast+: 400100 - 10000099\n" + "\t\t\t\tTurbo: Up to 1265800"); module_init(pca_isa_init); module_exit(pca_isa_exit); diff --git a/drivers/i2c/busses/i2c-pca-platform.c b/drivers/i2c/busses/i2c-pca-platform.c index 6bb15ad0a6b6..51d179bbddf9 100644 --- a/drivers/i2c/busses/i2c-pca-platform.c +++ b/drivers/i2c/busses/i2c-pca-platform.c @@ -172,8 +172,9 @@ static int __devinit i2c_pca_pf_probe(struct platform_device *pdev) i2c->adap.nr = pdev->id >= 0 ? pdev->id : 0; i2c->adap.owner = THIS_MODULE; - snprintf(i2c->adap.name, sizeof(i2c->adap.name), "PCA9564 at 0x%08lx", - (unsigned long) res->start); + snprintf(i2c->adap.name, sizeof(i2c->adap.name), + "PCA9564/PCA9665 at 0x%08lx", + (unsigned long) res->start); i2c->adap.algo_data = &i2c->algo_data; i2c->adap.dev.parent = &pdev->dev; i2c->adap.timeout = platform_data->timeout; @@ -246,7 +247,7 @@ e_remap: e_alloc: release_mem_region(res->start, res_len(res)); e_print: - printk(KERN_ERR "Registering PCA9564 FAILED! (%d)\n", ret); + printk(KERN_ERR "Registering PCA9564/PCA9665 FAILED! 
(%d)\n", ret); return ret; } @@ -290,7 +291,7 @@ static void __exit i2c_pca_pf_exit(void) } MODULE_AUTHOR("Wolfram Sang "); -MODULE_DESCRIPTION("I2C-PCA9564 platform driver"); +MODULE_DESCRIPTION("I2C-PCA9564/PCA9665 platform driver"); MODULE_LICENSE("GPL"); module_init(i2c_pca_pf_init); diff --git a/include/linux/i2c-algo-pca.h b/include/linux/i2c-algo-pca.h index adcb3dc7ac26..1364d62e2fbe 100644 --- a/include/linux/i2c-algo-pca.h +++ b/include/linux/i2c-algo-pca.h @@ -1,7 +1,14 @@ #ifndef _LINUX_I2C_ALGO_PCA_H #define _LINUX_I2C_ALGO_PCA_H -/* Clock speeds for the bus */ +/* Chips known to the pca algo */ +#define I2C_PCA_CHIP_9564 0x00 +#define I2C_PCA_CHIP_9665 0x01 + +/* Internal period for PCA9665 oscilator */ +#define I2C_PCA_OSC_PER 3 /* e10-8s */ + +/* Clock speeds for the bus for PCA9564*/ #define I2C_PCA_CON_330kHz 0x00 #define I2C_PCA_CON_288kHz 0x01 #define I2C_PCA_CON_217kHz 0x02 @@ -18,6 +25,26 @@ #define I2C_PCA_ADR 0x02 /* OWN ADR Read/Write */ #define I2C_PCA_CON 0x03 /* CONTROL Read/Write */ +/* PCA9665 registers */ +#define I2C_PCA_INDPTR 0x00 /* INDIRECT Pointer Write Only */ +#define I2C_PCA_IND 0x02 /* INDIRECT Read/Write */ + +/* PCA9665 indirect registers */ +#define I2C_PCA_ICOUNT 0x00 /* Byte Count for buffered mode */ +#define I2C_PCA_IADR 0x01 /* OWN ADR */ +#define I2C_PCA_ISCLL 0x02 /* SCL LOW period */ +#define I2C_PCA_ISCLH 0x03 /* SCL HIGH period */ +#define I2C_PCA_ITO 0x04 /* TIMEOUT */ +#define I2C_PCA_IPRESET 0x05 /* Parallel bus reset */ +#define I2C_PCA_IMODE 0x06 /* I2C Bus mode */ + +/* PCA9665 I2C bus mode */ +#define I2C_PCA_MODE_STD 0x00 /* Standard mode */ +#define I2C_PCA_MODE_FAST 0x01 /* Fast mode */ +#define I2C_PCA_MODE_FASTP 0x02 /* Fast Plus mode */ +#define I2C_PCA_MODE_TURBO 0x03 /* Turbo mode */ + + #define I2C_PCA_CON_AA 0x80 /* Assert Acknowledge */ #define I2C_PCA_CON_ENSIO 0x40 /* Enable */ #define I2C_PCA_CON_STA 0x20 /* Start */ @@ -31,7 +58,9 @@ struct i2c_algo_pca_data { int (*read_byte) (void *data, int reg); int (*wait_for_completion) (void *data); void (*reset_chip) (void *data); - /* i2c_clock values are defined in linux/i2c-algo-pca.h */ + /* For PCA9564, use one of the predefined frequencies: + * 330000, 288000, 217000, 146000, 88000, 59000, 44000, 36000 + * For PCA9665, use the frequency you want here. */ unsigned int i2c_clock; }; -- cgit v1.2.3-71-gd317 From 8e99ada8deaa9033600cd2c7d0a9366b0e99ab68 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 28 Mar 2009 21:34:45 +0100 Subject: i2c-algo-pca: Rework waiting for a free bus Waiting for a free bus now accepts the timeout value in jiffies and does proper checking using time_before. 
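The timeout idiom the patch adopts can be sketched as a generic helper (poll_until is hypothetical, not the driver's code): compute an absolute deadline in jiffies, then poll until either the condition holds or the deadline passes.

#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/errno.h>

/* Sketch only: jiffies-based deadline with time_before() checking. */
static int poll_until(int (*ready)(void *), void *ctx,
		      unsigned long timeout_jiffies)
{
	unsigned long deadline = jiffies + timeout_jiffies;

	while (!ready(ctx)) {
		if (!time_before(jiffies, deadline))
			return -EAGAIN;		/* deadline passed */
		msleep(10);
	}
	return 0;
}

time_before() compares jiffies values with wrap-around in mind, which is why the deadline is kept as an absolute jiffies value rather than a down-counting loop variable; it also makes HZ-independent timeouts such as the adapter's new .timeout = HZ straightforward.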
Signed-off-by: Wolfram Sang Signed-off-by: Jean Delvare --- arch/sh/boards/board-sh7785lcr.c | 2 +- drivers/i2c/algos/i2c-algo-pca.c | 17 ++++++++++------- drivers/i2c/busses/i2c-pca-isa.c | 2 +- include/linux/i2c-pca-platform.h | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/sh/boards/board-sh7785lcr.c b/arch/sh/boards/board-sh7785lcr.c index 94c0296bc35d..6f94f17adc46 100644 --- a/arch/sh/boards/board-sh7785lcr.c +++ b/arch/sh/boards/board-sh7785lcr.c @@ -229,7 +229,7 @@ static struct resource i2c_resources[] = { static struct i2c_pca9564_pf_platform_data i2c_platform_data = { .gpio = 0, .i2c_clock_speed = I2C_PCA_CON_330kHz, - .timeout = 100, + .timeout = HZ, }; static struct platform_device i2c_device = { diff --git a/drivers/i2c/algos/i2c-algo-pca.c b/drivers/i2c/algos/i2c-algo-pca.c index a8e51bd1a4f5..9e134fad7bda 100644 --- a/drivers/i2c/algos/i2c-algo-pca.c +++ b/drivers/i2c/algos/i2c-algo-pca.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -186,14 +187,16 @@ static int pca_xfer(struct i2c_adapter *i2c_adap, int numbytes = 0; int state; int ret; - int timeout = i2c_adap->timeout; + unsigned long timeout = jiffies + i2c_adap->timeout; - while ((state = pca_status(adap)) != 0xf8 && timeout--) { - msleep(10); - } - if (state != 0xf8) { - dev_dbg(&i2c_adap->dev, "bus is not idle. status is %#04x\n", state); - return -EAGAIN; + while (pca_status(adap) != 0xf8) { + if (time_before(jiffies, timeout)) { + msleep(10); + } else { + dev_dbg(&i2c_adap->dev, "bus is not idle. status is " + "%#04x\n", state); + return -EAGAIN; + } } DEB1("{{{ XFER %d messages\n", num); diff --git a/drivers/i2c/busses/i2c-pca-isa.c b/drivers/i2c/busses/i2c-pca-isa.c index 0cc8017b3f64..b9403fdfb6d8 100644 --- a/drivers/i2c/busses/i2c-pca-isa.c +++ b/drivers/i2c/busses/i2c-pca-isa.c @@ -104,7 +104,7 @@ static struct i2c_adapter pca_isa_ops = { .owner = THIS_MODULE, .algo_data = &pca_isa_data, .name = "PCA9564/PCA9665 ISA Adapter", - .timeout = 100, + .timeout = HZ, }; static int __devinit pca_isa_match(struct device *dev, unsigned int id) diff --git a/include/linux/i2c-pca-platform.h b/include/linux/i2c-pca-platform.h index 3d191873f2d1..aba33759dec4 100644 --- a/include/linux/i2c-pca-platform.h +++ b/include/linux/i2c-pca-platform.h @@ -6,7 +6,7 @@ struct i2c_pca9564_pf_platform_data { * not supplied (negative value), but it * cannot exit some error conditions then */ int i2c_clock_speed; /* values are defined in linux/i2c-algo-pca.h */ - int timeout; /* timeout = this value * 10us */ + int timeout; /* timeout in jiffies */ }; #endif /* I2C_PCA9564_PLATFORM_H */ -- cgit v1.2.3-71-gd317 From 506a8b6c27cb08998dc13069fbdf6eb7ec748b99 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Sat, 28 Mar 2009 21:34:46 +0100 Subject: i2c-piix4: Add support for the Broadcom HT1100 chipset Add support for the Broadcom HT1100 LD chipset (SMBus function.) 
Signed-off-by: Flavio Leitner Signed-off-by: Jean Delvare --- Documentation/i2c/busses/i2c-piix4 | 2 +- drivers/i2c/busses/Kconfig | 1 + drivers/i2c/busses/i2c-piix4.c | 4 +++- include/linux/pci_ids.h | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4 index ef1efa79b1df..f889481762b5 100644 --- a/Documentation/i2c/busses/i2c-piix4 +++ b/Documentation/i2c/busses/i2c-piix4 @@ -4,7 +4,7 @@ Supported adapters: * Intel 82371AB PIIX4 and PIIX4E * Intel 82443MX (440MX) Datasheet: Publicly available at the Intel website - * ServerWorks OSB4, CSB5, CSB6 and HT-1000 southbridges + * ServerWorks OSB4, CSB5, CSB6, HT-1000 and HT-1100 southbridges Datasheet: Only available via NDA from ServerWorks * ATI IXP200, IXP300, IXP400, SB600, SB700 and SB800 southbridges Datasheet: Not publicly available diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 68650643d116..da809ad0996a 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -132,6 +132,7 @@ config I2C_PIIX4 Serverworks CSB5 Serverworks CSB6 Serverworks HT-1000 + Serverworks HT-1100 SMSC Victory66 This driver can also be built as a module. If so, the module diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index 63d5e5978046..0249a7d762b9 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -20,7 +20,7 @@ /* Supports: Intel PIIX4, 440MX - Serverworks OSB4, CSB5, CSB6, HT-1000 + Serverworks OSB4, CSB5, CSB6, HT-1000, HT-1100 ATI IXP200, IXP300, IXP400, SB600, SB700, SB800 SMSC Victory66 @@ -487,6 +487,8 @@ static struct pci_device_id piix4_ids[] = { PCI_DEVICE_ID_SERVERWORKS_CSB6) }, { PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_HT1000SB) }, + { PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, + PCI_DEVICE_ID_SERVERWORKS_HT1100LD) }, { 0, } }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 5109fecde284..2c9e8080da5e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1479,6 +1479,7 @@ #define PCI_DEVICE_ID_SERVERWORKS_HT1000IDE 0x0214 #define PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2 0x0217 #define PCI_DEVICE_ID_SERVERWORKS_CSB6LPC 0x0227 +#define PCI_DEVICE_ID_SERVERWORKS_HT1100LD 0x0408 #define PCI_VENDOR_ID_SBE 0x1176 #define PCI_DEVICE_ID_SBE_WANXL100 0x0301 -- cgit v1.2.3-71-gd317 From 795e2fe0a3b69dbc040d7efcf517e0cbad6901d0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 28 Mar 2009 23:23:01 +0000 Subject: Annotate struct fs_struct's usage count restriction Annotate struct fs_struct's usage count to indicate the restrictions upon it. It may not be incremented, except by clone(CLONE_FS), as this affects the check in check_unsafe_exec() in fs/exec.c. Signed-off-by: David Howells Signed-off-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/fs_struct.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index a97c053d3a9a..18b467dbe278 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -4,7 +4,10 @@ #include struct fs_struct { - atomic_t count; + atomic_t count; /* This usage count is used by check_unsafe_exec() for + * security checking purposes - therefore it may not be + * incremented, except by clone(CLONE_FS). 
+ */ rwlock_t lock; int umask; struct path root, pwd; -- cgit v1.2.3-71-gd317 From 4b21cd4eedff2123712c2132c8c6264d40332465 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Mar 2009 23:38:40 -0700 Subject: skbuff.h: fix missing kernel-doc Add missing struct field to fix kernel-doc warning: Warning(include/linux/skbuff.h:182): No description found for parameter 'flags' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bb1981fd60f3..eb2e837afaf3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -168,6 +168,7 @@ struct skb_shared_hwtstamps { * @software: generate software time stamp * @in_progress: device driver is going to provide * hardware time stamp + * @flags: all shared_tx flags * * These flags are attached to packets as part of the * &skb_shared_info. Use skb_tx() to get a pointer. -- cgit v1.2.3-71-gd317 From e7557af56a576762a655f1aaaded253ad14c5958 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 28 Mar 2009 15:38:31 +0000 Subject: netpoll: store local and remote ip in net-endian Allows for the removal of byteswapping in some places and the removal of HIPQUAD (replaced by %pI4). Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- drivers/net/netconsole.c | 10 ++++------ include/linux/netpoll.h | 2 +- net/core/netpoll.c | 31 +++++++++++++++---------------- 3 files changed, 20 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index d304d38cd5d1..eceadf787a67 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -294,14 +294,12 @@ static ssize_t show_remote_port(struct netconsole_target *nt, char *buf) static ssize_t show_local_ip(struct netconsole_target *nt, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d.%d.%d.%d\n", - HIPQUAD(nt->np.local_ip)); + return snprintf(buf, PAGE_SIZE, "%pI4\n", &nt->np.local_ip); } static ssize_t show_remote_ip(struct netconsole_target *nt, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d.%d.%d.%d\n", - HIPQUAD(nt->np.remote_ip)); + return snprintf(buf, PAGE_SIZE, "%pI4\n", &nt->np.remote_ip); } static ssize_t show_local_mac(struct netconsole_target *nt, char *buf) @@ -438,7 +436,7 @@ static ssize_t store_local_ip(struct netconsole_target *nt, return -EINVAL; } - nt->np.local_ip = ntohl(in_aton(buf)); + nt->np.local_ip = in_aton(buf); return strnlen(buf, count); } @@ -454,7 +452,7 @@ static ssize_t store_remote_ip(struct netconsole_target *nt, return -EINVAL; } - nt->np.remote_ip = ntohl(in_aton(buf)); + nt->np.remote_ip = in_aton(buf); return strnlen(buf, count); } diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index de99025f2c5d..2524267210d3 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -18,7 +18,7 @@ struct netpoll { const char *name; void (*rx_hook)(struct netpoll *, int, char *, int); - u32 local_ip, remote_ip; + __be32 local_ip, remote_ip; u16 local_port, remote_port; u8 remote_mac[ETH_ALEN]; }; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 755414cd49d1..b5873bdff612 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -345,8 +345,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) udph->dest = htons(np->remote_port); udph->len = htons(udp_len); udph->check = 0; - udph->check = csum_tcpudp_magic(htonl(np->local_ip), - htonl(np->remote_ip), + udph->check = 
csum_tcpudp_magic(np->local_ip, + np->remote_ip, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); if (udph->check == 0) @@ -365,8 +365,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) iph->ttl = 64; iph->protocol = IPPROTO_UDP; iph->check = 0; - put_unaligned(htonl(np->local_ip), &(iph->saddr)); - put_unaligned(htonl(np->remote_ip), &(iph->daddr)); + put_unaligned(np->local_ip, &(iph->saddr)); + put_unaligned(np->remote_ip, &(iph->daddr)); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); @@ -424,7 +424,7 @@ static void arp_reply(struct sk_buff *skb) memcpy(&tip, arp_ptr, 4); /* Should we ignore arp? */ - if (tip != htonl(np->local_ip) || + if (tip != np->local_ip || ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) return; @@ -533,9 +533,9 @@ int __netpoll_rx(struct sk_buff *skb) goto out; if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) goto out; - if (np->local_ip && np->local_ip != ntohl(iph->daddr)) + if (np->local_ip && np->local_ip != iph->daddr) goto out; - if (np->remote_ip && np->remote_ip != ntohl(iph->saddr)) + if (np->remote_ip && np->remote_ip != iph->saddr) goto out; if (np->local_port && np->local_port != ntohs(uh->dest)) goto out; @@ -560,14 +560,14 @@ void netpoll_print_options(struct netpoll *np) { printk(KERN_INFO "%s: local port %d\n", np->name, np->local_port); - printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", - np->name, HIPQUAD(np->local_ip)); + printk(KERN_INFO "%s: local IP %pI4\n", + np->name, &np->local_ip); printk(KERN_INFO "%s: interface %s\n", np->name, np->dev_name); printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port); - printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n", - np->name, HIPQUAD(np->remote_ip)); + printk(KERN_INFO "%s: remote IP %pI4\n", + np->name, &np->remote_ip); printk(KERN_INFO "%s: remote ethernet address %pM\n", np->name, np->remote_mac); } @@ -589,7 +589,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; - np->local_ip = ntohl(in_aton(cur)); + np->local_ip = in_aton(cur); cur = delim; } cur++; @@ -618,7 +618,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; - np->remote_ip = ntohl(in_aton(cur)); + np->remote_ip = in_aton(cur); cur = delim + 1; if (*cur != 0) { @@ -759,10 +759,9 @@ int netpoll_setup(struct netpoll *np) goto release; } - np->local_ip = ntohl(in_dev->ifa_list->ifa_local); + np->local_ip = in_dev->ifa_list->ifa_local; rcu_read_unlock(); - printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", - np->name, HIPQUAD(np->local_ip)); + printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip); } if (np->rx_hook) { -- cgit v1.2.3-71-gd317 From 2c60b6885afc56a17b9d55b04c4328123063fc9d Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 28 Mar 2009 15:38:31 +0000 Subject: kernel: remove HIPQUAD() All users have been removed. Signed-off-by: Harvey Harrison Signed-off-by: David S. 
Miller --- include/linux/kernel.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 914918abfdd1..f81d80f47dcb 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -379,18 +379,6 @@ static inline char *pack_hex_byte(char *buf, u8 byte) ((unsigned char *)&addr)[3] #define NIPQUAD_FMT "%u.%u.%u.%u" -#if defined(__LITTLE_ENDIAN) -#define HIPQUAD(addr) \ - ((unsigned char *)&addr)[3], \ - ((unsigned char *)&addr)[2], \ - ((unsigned char *)&addr)[1], \ - ((unsigned char *)&addr)[0] -#elif defined(__BIG_ENDIAN) -#define HIPQUAD NIPQUAD -#else -#error "Please fix asm/byteorder.h" -#endif /* __LITTLE_ENDIAN */ - /* * min()/max()/clamp() macros that also do * strict type-checking.. See the -- cgit v1.2.3-71-gd317 From 503e57630309643562c12f09d4c8a96eb629ee33 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Sun, 29 Mar 2009 12:59:50 +0200 Subject: Fix build error in <linux/irq.h> <linux/irq.h> relies on <linux/topology.h> and <linux/gfp.h> having been included previously. If not, errors like the ones below will result. CC arch/mips/mti-malta/malta-int.o In file included from arch/mips/mti-malta/malta-int.c:25: include/linux/irq.h: In function ‘init_alloc_desc_masks’: include/linux/irq.h:444: error: implicit declaration of function ‘cpu_to_node’ include/linux/irq.h:446: error: ‘GFP_ATOMIC’ undeclared (first use in this function) include/linux/irq.h:446: error: (Each undeclared identifier is reported only once include/linux/irq.h:446: error: for each function it appears in.) make[3]: *** [arch/mips/mti-malta/malta-int.o] Error 1 make[2]: *** [arch/mips/mti-malta] Error 2 make[1]: *** [sub-make] Error 2 Fixed by including the two missing headers. Signed-off-by: Ralf Baechle Signed-off-by: Linus Torvalds --- include/linux/irq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 873e4ac11b81..9c62fbe2ef30 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -17,9 +17,11 @@ #include #include #include +#include #include #include #include +#include #include #include -- cgit v1.2.3-71-gd317 From af76aba00fdcfb21535c9f9872245d14097a4561 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:11 -0600 Subject: cpumask: fix seq_bitmap_*() functions. 1) seq_bitmap_list() should take a const. 2) All the seq_bitmap helpers should use cpumask_bits().
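Not part of the patch, but for context: a minimal sketch of how the seq_cpumask()/seq_cpumask_list() wrappers touched here are typically used from a seq_file show() callback. The demo_cpus_show() name is invented for illustration; cpu_online_mask and seq_printf() are assumed to be available as in mainline of this era.

	#include <linux/seq_file.h>
	#include <linux/cpumask.h>

	/* Hypothetical ->show() callback: print a cpumask two ways. */
	static int demo_cpus_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "mask: ");
		seq_cpumask(m, cpu_online_mask);	/* hex bitmap, nr_cpu_ids bits wide */
		seq_printf(m, "\nlist: ");
		seq_cpumask_list(m, cpu_online_mask);	/* range list, e.g. "0-3,8" */
		seq_printf(m, "\n");
		return 0;
	}

Passing cpu_online_mask, which is a const struct cpumask *, is exactly what the const-correct prototypes in the hunks below allow.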
Signed-off-by: Rusty Russell --- fs/seq_file.c | 2 +- include/linux/seq_file.h | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/seq_file.c b/fs/seq_file.c index a1a4cfe19210..7f40f30c55c5 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -513,7 +513,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits, } EXPORT_SYMBOL(seq_bitmap); -int seq_bitmap_list(struct seq_file *m, unsigned long *bits, +int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, unsigned int nr_bits) { if (m->count < m->size) { diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index f616f31576d7..004f3b3342c5 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -55,7 +55,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits, unsigned int nr_bits); static inline int seq_cpumask(struct seq_file *m, const struct cpumask *mask) { - return seq_bitmap(m, mask->bits, nr_cpu_ids); + return seq_bitmap(m, cpumask_bits(mask), nr_cpu_ids); } static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask) @@ -63,12 +63,13 @@ static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask) return seq_bitmap(m, mask->bits, MAX_NUMNODES); } -int seq_bitmap_list(struct seq_file *m, unsigned long *bits, +int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, unsigned int nr_bits); -static inline int seq_cpumask_list(struct seq_file *m, cpumask_t *mask) +static inline int seq_cpumask_list(struct seq_file *m, + const struct cpumask *mask) { - return seq_bitmap_list(m, mask->bits, NR_CPUS); + return seq_bitmap_list(m, cpumask_bits(mask), nr_cpu_ids); } static inline int seq_nodemask_list(struct seq_file *m, nodemask_t *mask) -- cgit v1.2.3-71-gd317 From aa85ea5b89c36c51200d795dd788139bd9b8cf50 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:15 -0600 Subject: cpumask: use new cpumask_ functions in core code. Impact: cleanup Time to clean up remaining laggards using the old cpu_ functions. 
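As an aside, a small illustrative sketch (not taken from the patch) of the old-to-new conversions this series performs. demo_walk() and its pr_info() output are invented for the example; cpumask_copy(), cpu_possible_mask, for_each_cpu(), cpumask_of() and set_cpus_allowed_ptr() are the helpers used in the hunks that follow.

	#include <linux/cpumask.h>
	#include <linux/sched.h>
	#include <linux/kernel.h>

	static void demo_walk(struct cpumask *dst)
	{
		int cpu;

		/* was: *dst = cpu_possible_map; */
		cpumask_copy(dst, cpu_possible_mask);

		/* was: for_each_cpu_mask_nr(cpu, *dst) */
		for_each_cpu(cpu, dst)
			pr_info("cpu %d is possible\n", cpu);

		/* was: set_cpus_allowed_ptr(current, &cpumask_of_cpu(0)); */
		set_cpus_allowed_ptr(current, cpumask_of(0));
	}

The new helpers take struct cpumask pointers rather than copying cpumask_t by value, which is the point of the cleanup.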
Signed-off-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Trond.Myklebust@netapp.com --- drivers/base/cpu.c | 2 +- include/linux/cpuset.h | 4 ++-- kernel/workqueue.c | 6 +++--- mm/allocpercpu.c | 2 +- mm/vmstat.c | 2 +- net/sunrpc/svc.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 5b257a57bc57..e62a4ccea54d 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -119,7 +119,7 @@ static ssize_t print_cpus_map(char *buf, const struct cpumask *map) #define print_cpus_func(type) \ static ssize_t print_cpus_##type(struct sysdev_class *class, char *buf) \ { \ - return print_cpus_map(buf, &cpu_##type##_map); \ + return print_cpus_map(buf, cpu_##type##_mask); \ } \ static struct sysdev_class_attribute attr_##type##_map = \ _SYSDEV_CLASS_ATTR(type, 0444, print_cpus_##type, NULL) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 90c6074a36ca..2e0d79678deb 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -90,12 +90,12 @@ static inline void cpuset_init_smp(void) {} static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { - *mask = cpu_possible_map; + cpumask_copy(mask, cpu_possible_mask); } static inline void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask) { - *mask = cpu_possible_map; + cpumask_copy(mask, cpu_possible_mask); } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1f0c509b40d3..9aedd9fd825b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -416,7 +416,7 @@ void flush_workqueue(struct workqueue_struct *wq) might_sleep(); lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - for_each_cpu_mask_nr(cpu, *cpu_map) + for_each_cpu(cpu, cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -547,7 +547,7 @@ static void wait_on_work(struct work_struct *work) wq = cwq->wq; cpu_map = wq_cpu_map(wq); - for_each_cpu_mask_nr(cpu, *cpu_map) + for_each_cpu(cpu, cpu_map) wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } @@ -911,7 +911,7 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu_mask_nr(cpu, *cpu_map) + for_each_cpu(cpu, cpu_map) cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); cpu_maps_update_done(); diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 1882923bc706..139d5b7b6621 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -143,7 +143,7 @@ void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; - __percpu_depopulate_mask(__pdata, &cpu_possible_map); + __percpu_depopulate_mask(__pdata, cpu_possible_mask); kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); diff --git a/mm/vmstat.c b/mm/vmstat.c index 91149746bb8d..8cd81ea1ddc1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -27,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu_mask_nr(cpu, *cpumask) { + for_each_cpu(cpu, cpumask) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c51fed4d1af1..bb507e2bb94d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -312,7 +312,7 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) switch 
(m->mode) { case SVC_POOL_PERCPU: { - set_cpus_allowed_ptr(task, &cpumask_of_cpu(node)); + set_cpus_allowed_ptr(task, cpumask_of(node)); break; } case SVC_POOL_PERNODE: -- cgit v1.2.3-71-gd317 From 6f4303fb2ec68055e793b84887a7ae0f9ea7cc2d Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 29 Jan 2009 00:15:51 +0100 Subject: HID: bring back possibility to specify vid/pid ignore on module load When hid quirks were converted to specialized driver, the HID_QUIRK_IGNORE has been moved completely, as the hid_ignore_list[] has been moved into the generic code. However userspace already got used to the possibility that modprobing usbhid with 'quirks=vid:pid:0x4' makes the device ignored by usbhid driver. So keep this quirk flag in place for backwards compatibility. Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 3 +++ include/linux/hid.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index f0a0f72238ab..eed05a3017e5 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -711,6 +711,9 @@ static int usbhid_parse(struct hid_device *hid) quirks = usbhid_lookup_quirk(le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct)); + if (quirks & HID_QUIRK_IGNORE) + return -ENODEV; + /* Many keyboards and mice don't like to be polled for reports, * so we will always set the HID_QUIRK_NOGET flag for them. */ if (interface->desc.bInterfaceSubClass == USB_INTERFACE_SUBCLASS_BOOT) { diff --git a/include/linux/hid.h b/include/linux/hid.h index fa8ee9cef7be..a46cda488695 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -270,6 +270,7 @@ struct hid_item { #define HID_QUIRK_INVERT 0x00000001 #define HID_QUIRK_NOTOUCH 0x00000002 +#define HID_QUIRK_IGNORE 0x00000004 #define HID_QUIRK_NOGET 0x00000008 #define HID_QUIRK_BADPAD 0x00000020 #define HID_QUIRK_MULTI_INPUT 0x00000040 -- cgit v1.2.3-71-gd317 From afa5eb7c68689ced4284f01c96feed44a2d0a127 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 18 Mar 2009 09:13:37 +0100 Subject: HID: remove compat stuff This removal was scheduled and there is no problem with later distros to adapt for the new bus, thanks to aliases. module-init-tools map files are deprecated nowadays, so that the patch which introduced hid ones into the m-i-t won't be accepted and hence there is no reason for leaving compat stuff in. 
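For illustration only (the VID/PID values below are placeholders, not taken from this patch): with HID_QUIRK_IGNORE restored, a device can again be hidden from the HID layer by loading usbhid with something like

	modprobe usbhid quirks=0x1234:0x5678:0x0004

where 0x0004 is the HID_QUIRK_IGNORE value re-added to include/linux/hid.h below, causing usbhid_parse() to return -ENODEV for that device as in the hunk below. When usbhid is built in, the same string can typically be passed on the kernel command line as usbhid.quirks=.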
Signed-off-by: Jiri Slaby Cc: Jiri Kosina Signed-off-by: Jiri Kosina --- Documentation/feature-removal-schedule.txt | 7 --- drivers/hid/Kconfig | 12 ----- drivers/hid/Makefile | 4 -- drivers/hid/hid-a4tech.c | 2 - drivers/hid/hid-apple.c | 2 - drivers/hid/hid-belkin.c | 2 - drivers/hid/hid-cherry.c | 2 - drivers/hid/hid-chicony.c | 2 - drivers/hid/hid-core.c | 21 -------- drivers/hid/hid-cypress.c | 2 - drivers/hid/hid-drff.c | 2 - drivers/hid/hid-dummy.c | 87 ------------------------------ drivers/hid/hid-ezkey.c | 2 - drivers/hid/hid-gaff.c | 2 - drivers/hid/hid-gyration.c | 2 - drivers/hid/hid-kensington.c | 2 - drivers/hid/hid-kye.c | 2 - drivers/hid/hid-lg.c | 2 - drivers/hid/hid-microsoft.c | 2 - drivers/hid/hid-monterey.c | 2 - drivers/hid/hid-ntrig.c | 2 - drivers/hid/hid-petalynx.c | 2 - drivers/hid/hid-pl.c | 2 - drivers/hid/hid-samsung.c | 2 - drivers/hid/hid-sony.c | 2 - drivers/hid/hid-sunplus.c | 2 - drivers/hid/hid-tmff.c | 2 - drivers/hid/hid-topseed.c | 2 - drivers/hid/hid-zpff.c | 2 - include/linux/hid.h | 16 ------ 30 files changed, 195 deletions(-) delete mode 100644 drivers/hid/hid-dummy.c (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 1135996bec8b..fc5e85a5901c 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -273,13 +273,6 @@ Who: Glauber Costa --------------------------- -What: remove HID compat support -When: 2.6.29 -Why: needed only as a temporary solution until distros fix themselves up -Who: Jiri Slaby - ---------------------------- - What: print_fn_descriptor_symbol() When: October 2009 Why: The %pF vsprintf format provides the same functionality in a diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 88e16ef93247..63a2564f0f81 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -70,18 +70,6 @@ source "drivers/hid/usbhid/Kconfig" menu "Special HID drivers" depends on HID -config HID_COMPAT - bool "Load all HID drivers on hid core load" - default y - ---help--- - Compatible option for older userspace. If you have system without udev - support of module loading through aliases and also old - module-init-tools which can't handle hid bus, choose Y here. Otherwise - say N. If you say N and your userspace is old enough, the only - functionality you lose is modules autoloading. - - If unsure, say Y. 
- config HID_A4TECH tristate "A4 tech" if EMBEDDED depends on USB_HID diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index e6b72ed0d70a..1f7cb0fd4505 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -8,10 +8,6 @@ obj-$(CONFIG_HID) += hid.o hid-$(CONFIG_HID_DEBUG) += hid-debug.o hid-$(CONFIG_HIDRAW) += hidraw.o -ifdef CONFIG_HID_COMPAT -obj-m += hid-dummy.o -endif - hid-logitech-objs := hid-lg.o ifdef CONFIG_LOGITECH_FF hid-logitech-objs += hid-lgff.o diff --git a/drivers/hid/hid-a4tech.c b/drivers/hid/hid-a4tech.c index ebca00e6c103..42ea359e94cf 100644 --- a/drivers/hid/hid-a4tech.c +++ b/drivers/hid/hid-a4tech.c @@ -158,5 +158,3 @@ static void a4_exit(void) module_init(a4_init); module_exit(a4_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(a4tech); diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index cab3be7ef0ab..7359d9d88e46 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -474,5 +474,3 @@ static void apple_exit(void) module_init(apple_init); module_exit(apple_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(apple); diff --git a/drivers/hid/hid-belkin.c b/drivers/hid/hid-belkin.c index 12c8a9ba6ed6..2f6723133a4b 100644 --- a/drivers/hid/hid-belkin.c +++ b/drivers/hid/hid-belkin.c @@ -101,5 +101,3 @@ static void belkin_exit(void) module_init(belkin_init); module_exit(belkin_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(belkin); diff --git a/drivers/hid/hid-cherry.c b/drivers/hid/hid-cherry.c index b833b9769aba..ab8209e7e45c 100644 --- a/drivers/hid/hid-cherry.c +++ b/drivers/hid/hid-cherry.c @@ -83,5 +83,3 @@ static void ch_exit(void) module_init(ch_init); module_exit(ch_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(cherry); diff --git a/drivers/hid/hid-chicony.c b/drivers/hid/hid-chicony.c index a54d4096e0f7..7f91076d8493 100644 --- a/drivers/hid/hid-chicony.c +++ b/drivers/hid/hid-chicony.c @@ -76,5 +76,3 @@ static void ch_exit(void) module_init(ch_init); module_exit(ch_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(chicony); diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index b96fbd5dab55..e56f8d5d3a50 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1819,15 +1819,6 @@ void hid_unregister_driver(struct hid_driver *hdrv) } EXPORT_SYMBOL_GPL(hid_unregister_driver); -#ifdef CONFIG_HID_COMPAT -static void hid_compat_load(struct work_struct *ws) -{ - request_module("hid-dummy"); -} -static DECLARE_WORK(hid_compat_work, hid_compat_load); -static struct workqueue_struct *hid_compat_wq; -#endif - static int __init hid_init(void) { int ret; @@ -1842,15 +1833,6 @@ static int __init hid_init(void) if (ret) goto err_bus; -#ifdef CONFIG_HID_COMPAT - hid_compat_wq = create_singlethread_workqueue("hid_compat"); - if (!hid_compat_wq) { - hidraw_exit(); - goto err; - } - queue_work(hid_compat_wq, &hid_compat_work); -#endif - return 0; err_bus: bus_unregister(&hid_bus_type); @@ -1860,9 +1842,6 @@ err: static void __exit hid_exit(void) { -#ifdef CONFIG_HID_COMPAT - destroy_workqueue(hid_compat_wq); -#endif hidraw_exit(); bus_unregister(&hid_bus_type); } diff --git a/drivers/hid/hid-cypress.c b/drivers/hid/hid-cypress.c index 5d69d27b935d..9d6d3b91773b 100644 --- a/drivers/hid/hid-cypress.c +++ b/drivers/hid/hid-cypress.c @@ -154,5 +154,3 @@ static void cp_exit(void) module_init(cp_init); module_exit(cp_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(cypress); diff --git a/drivers/hid/hid-drff.c b/drivers/hid/hid-drff.c index 785d2492b5ef..34f3eb65100b 
100644 --- a/drivers/hid/hid-drff.c +++ b/drivers/hid/hid-drff.c @@ -186,5 +186,3 @@ static void __exit dr_exit(void) module_init(dr_init); module_exit(dr_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(dragonrise); diff --git a/drivers/hid/hid-dummy.c b/drivers/hid/hid-dummy.c deleted file mode 100644 index 74d765f38624..000000000000 --- a/drivers/hid/hid-dummy.c +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include - -static int __init hid_dummy_init(void) -{ -#ifdef CONFIG_HID_A4TECH_MODULE - HID_COMPAT_CALL_DRIVER(a4tech); -#endif -#ifdef CONFIG_HID_APPLE_MODULE - HID_COMPAT_CALL_DRIVER(apple); -#endif -#ifdef CONFIG_HID_BELKIN_MODULE - HID_COMPAT_CALL_DRIVER(belkin); -#endif -#ifdef CONFIG_HID_BRIGHT_MODULE - HID_COMPAT_CALL_DRIVER(bright); -#endif -#ifdef CONFIG_HID_CHERRY_MODULE - HID_COMPAT_CALL_DRIVER(cherry); -#endif -#ifdef CONFIG_HID_CHICONY_MODULE - HID_COMPAT_CALL_DRIVER(chicony); -#endif -#ifdef CONFIG_HID_CYPRESS_MODULE - HID_COMPAT_CALL_DRIVER(cypress); -#endif -#ifdef CONFIG_HID_DELL_MODULE - HID_COMPAT_CALL_DRIVER(dell); -#endif -#ifdef CONFIG_DRAGONRISE_FF_MODULE - HID_COMPAT_CALL_DRIVER(dragonrise); -#endif -#ifdef CONFIG_HID_EZKEY_MODULE - HID_COMPAT_CALL_DRIVER(ezkey); -#endif -#ifdef CONFIG_HID_KYE_MODULE - HID_COMPAT_CALL_DRIVER(kye); -#endif -#ifdef CONFIG_HID_GYRATION_MODULE - HID_COMPAT_CALL_DRIVER(gyration); -#endif -#ifdef CONFIG_HID_KENSINGTON_MODULE - HID_COMPAT_CALL_DRIVER(kensington); -#endif -#ifdef CONFIG_HID_LOGITECH_MODULE - HID_COMPAT_CALL_DRIVER(logitech); -#endif -#ifdef CONFIG_HID_MICROSOFT_MODULE - HID_COMPAT_CALL_DRIVER(microsoft); -#endif -#ifdef CONFIG_HID_MONTEREY_MODULE - HID_COMPAT_CALL_DRIVER(monterey); -#endif -#ifdef CONFIG_HID_NTRIG_MODULE - HID_COMPAT_CALL_DRIVER(ntrig); -#endif -#ifdef CONFIG_HID_PANTHERLORD_MODULE - HID_COMPAT_CALL_DRIVER(pantherlord); -#endif -#ifdef CONFIG_HID_PETALYNX_MODULE - HID_COMPAT_CALL_DRIVER(petalynx); -#endif -#ifdef CONFIG_HID_SAMSUNG_MODULE - HID_COMPAT_CALL_DRIVER(samsung); -#endif -#ifdef CONFIG_HID_SONY_MODULE - HID_COMPAT_CALL_DRIVER(sony); -#endif -#ifdef CONFIG_HID_SUNPLUS_MODULE - HID_COMPAT_CALL_DRIVER(sunplus); -#endif -#ifdef CONFIG_GREENASIA_FF_MODULE - HID_COMPAT_CALL_DRIVER(greenasia); -#endif -#ifdef CONFIG_THRUSTMASTER_FF_MODULE - HID_COMPAT_CALL_DRIVER(thrustmaster); -#endif -#ifdef CONFIG_ZEROPLUS_FF_MODULE - HID_COMPAT_CALL_DRIVER(zeroplus); -#endif - - return -EIO; -} -module_init(hid_dummy_init); - -MODULE_LICENSE("GPL"); diff --git a/drivers/hid/hid-ezkey.c b/drivers/hid/hid-ezkey.c index deb42f931b7e..0a1fe054799b 100644 --- a/drivers/hid/hid-ezkey.c +++ b/drivers/hid/hid-ezkey.c @@ -91,5 +91,3 @@ static void ez_exit(void) module_init(ez_init); module_exit(ez_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(ezkey); diff --git a/drivers/hid/hid-gaff.c b/drivers/hid/hid-gaff.c index 71211f6a4f02..510ad3ab8d33 100644 --- a/drivers/hid/hid-gaff.c +++ b/drivers/hid/hid-gaff.c @@ -181,5 +181,3 @@ static void __exit ga_exit(void) module_init(ga_init); module_exit(ga_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(greenasia); diff --git a/drivers/hid/hid-gyration.c b/drivers/hid/hid-gyration.c index 04a0afec52ac..d42d222097a8 100644 --- a/drivers/hid/hid-gyration.c +++ b/drivers/hid/hid-gyration.c @@ -94,5 +94,3 @@ static void gyration_exit(void) module_init(gyration_init); module_exit(gyration_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(gyration); diff --git a/drivers/hid/hid-kensington.c b/drivers/hid/hid-kensington.c index 
747fee5b2a73..7353bd79cbe9 100644 --- a/drivers/hid/hid-kensington.c +++ b/drivers/hid/hid-kensington.c @@ -61,5 +61,3 @@ static void ks_exit(void) module_init(ks_init); module_exit(ks_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(kensington); diff --git a/drivers/hid/hid-kye.c b/drivers/hid/hid-kye.c index ea7f412e31a9..72ee3fec56d9 100644 --- a/drivers/hid/hid-kye.c +++ b/drivers/hid/hid-kye.c @@ -67,5 +67,3 @@ static void kye_exit(void) module_init(kye_init); module_exit(kye_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(kye); diff --git a/drivers/hid/hid-lg.c b/drivers/hid/hid-lg.c index 83e07c9f4144..7b80cb694982 100644 --- a/drivers/hid/hid-lg.c +++ b/drivers/hid/hid-lg.c @@ -326,5 +326,3 @@ static void lg_exit(void) module_init(lg_init); module_exit(lg_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(logitech); diff --git a/drivers/hid/hid-microsoft.c b/drivers/hid/hid-microsoft.c index 25b10dcad90d..5e9e37a0506d 100644 --- a/drivers/hid/hid-microsoft.c +++ b/drivers/hid/hid-microsoft.c @@ -210,5 +210,3 @@ static void ms_exit(void) module_init(ms_init); module_exit(ms_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(microsoft); diff --git a/drivers/hid/hid-monterey.c b/drivers/hid/hid-monterey.c index f3a85a065f18..240f87618be6 100644 --- a/drivers/hid/hid-monterey.c +++ b/drivers/hid/hid-monterey.c @@ -78,5 +78,3 @@ static void mr_exit(void) module_init(mr_init); module_exit(mr_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(monterey); diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c index db44fbd7bdf6..c5b252be9c21 100644 --- a/drivers/hid/hid-ntrig.c +++ b/drivers/hid/hid-ntrig.c @@ -78,5 +78,3 @@ static void ntrig_exit(void) module_init(ntrig_init); module_exit(ntrig_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(ntrig); diff --git a/drivers/hid/hid-petalynx.c b/drivers/hid/hid-petalynx.c index 10945fe12d50..2e83e8ff891a 100644 --- a/drivers/hid/hid-petalynx.c +++ b/drivers/hid/hid-petalynx.c @@ -118,5 +118,3 @@ static void pl_exit(void) module_init(pl_init); module_exit(pl_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(petalynx); diff --git a/drivers/hid/hid-pl.c b/drivers/hid/hid-pl.c index 9ad76bf71186..4db9a3483760 100644 --- a/drivers/hid/hid-pl.c +++ b/drivers/hid/hid-pl.c @@ -230,5 +230,3 @@ static void pl_exit(void) module_init(pl_init); module_exit(pl_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(pantherlord); diff --git a/drivers/hid/hid-samsung.c b/drivers/hid/hid-samsung.c index 15f3c0492450..07083aa6c19a 100644 --- a/drivers/hid/hid-samsung.c +++ b/drivers/hid/hid-samsung.c @@ -96,5 +96,3 @@ static void samsung_exit(void) module_init(samsung_init); module_exit(samsung_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(samsung); diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index dd5a3979a4de..c2599388a350 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -148,5 +148,3 @@ static void sony_exit(void) module_init(sony_init); module_exit(sony_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(sony); diff --git a/drivers/hid/hid-sunplus.c b/drivers/hid/hid-sunplus.c index 5ba68f7dbb78..e0a8fd36a85b 100644 --- a/drivers/hid/hid-sunplus.c +++ b/drivers/hid/hid-sunplus.c @@ -78,5 +78,3 @@ static void sp_exit(void) module_init(sp_init); module_exit(sp_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(sunplus); diff --git a/drivers/hid/hid-tmff.c b/drivers/hid/hid-tmff.c index 1b7cba0f7e1f..7c1f7b50330c 100644 --- a/drivers/hid/hid-tmff.c +++ 
b/drivers/hid/hid-tmff.c @@ -265,5 +265,3 @@ static void tm_exit(void) module_init(tm_init); module_exit(tm_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(thrustmaster); diff --git a/drivers/hid/hid-topseed.c b/drivers/hid/hid-topseed.c index cca64a0564a9..152ccfabeba5 100644 --- a/drivers/hid/hid-topseed.c +++ b/drivers/hid/hid-topseed.c @@ -73,5 +73,3 @@ static void ts_exit(void) module_init(ts_init); module_exit(ts_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(topseed); diff --git a/drivers/hid/hid-zpff.c b/drivers/hid/hid-zpff.c index ea82f3718b21..85a198a18537 100644 --- a/drivers/hid/hid-zpff.c +++ b/drivers/hid/hid-zpff.c @@ -158,5 +158,3 @@ static void zp_exit(void) module_init(zp_init); module_exit(zp_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(zeroplus); diff --git a/include/linux/hid.h b/include/linux/hid.h index a46cda488695..a46cbea71d65 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -792,21 +792,5 @@ dbg_hid(const char *fmt, ...) __FILE__ , ## arg) #endif /* HID_FF */ -#ifdef __KERNEL__ -#ifdef CONFIG_HID_COMPAT -#define HID_COMPAT_LOAD_DRIVER(name) \ -/* prototype to avoid sparse warning */ \ -extern void hid_compat_##name(void); \ -void hid_compat_##name(void) { } \ -EXPORT_SYMBOL(hid_compat_##name) -#else -#define HID_COMPAT_LOAD_DRIVER(name) -#endif /* HID_COMPAT */ -#define HID_COMPAT_CALL_DRIVER(name) do { \ - extern void hid_compat_##name(void); \ - hid_compat_##name(); \ -} while (0) -#endif /* __KERNEL__ */ - #endif -- cgit v1.2.3-71-gd317 From 877d03105d04b2c13e241130277fa69c8d2564f0 Mon Sep 17 00:00:00 2001 From: Nick Andrew Date: Mon, 26 Jan 2009 11:06:57 +0100 Subject: trivial: Fix misspelling of firmware Fix misspelling of firmware. Signed-off-by: Nick Andrew Signed-off-by: Jiri Kosina --- Documentation/ia64/kvm.txt | 2 +- Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt | 2 +- arch/mips/sgi-ip27/ip27-smp.c | 2 +- arch/sparc/kernel/head_64.S | 2 +- drivers/net/sb1250-mac.c | 2 +- drivers/net/tg3.c | 2 +- drivers/net/wireless/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/ipw2x00/ipw2200.c | 2 +- drivers/net/wireless/iwlwifi/iwl-agn.c | 2 +- drivers/net/wireless/iwlwifi/iwl3945-base.c | 2 +- drivers/net/wireless/libertas/cmd.c | 2 +- drivers/pci/pci.c | 2 +- drivers/platform/x86/thinkpad_acpi.c | 2 +- drivers/staging/otus/hal/hpmain.c | 2 +- drivers/usb/atm/ueagle-atm.c | 2 +- drivers/usb/serial/ChangeLog.history | 2 +- include/linux/libata.h | 2 +- kernel/power/disk.c | 4 ++-- sound/oss/pss.c | 2 +- sound/sh/aica.c | 2 +- 20 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt index 84f7cb3d5bec..ffb5c80bec3e 100644 --- a/Documentation/ia64/kvm.txt +++ b/Documentation/ia64/kvm.txt @@ -42,7 +42,7 @@ Note: For step 2, please make sure that host page size == TARGET_PAGE_SIZE of qe hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries. - (3) Rename the firware you owned to Flash.fd, and copy it to /usr/local/share/qemu + (3) Rename the firmware you owned to Flash.fd, and copy it to /usr/local/share/qemu 4. Boot up Linux or Windows guests: 4.1 Create or install a image for guest boot. If you have xen experience, it should be easy. 
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt index 6c238f59b2a9..249db3a15d15 100644 --- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt +++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt @@ -1,6 +1,6 @@ * Uploaded QE firmware - If a new firwmare has been uploaded to the QE (usually by the + If a new firmware has been uploaded to the QE (usually by the boot loader), then a 'firmware' child node should be added to the QE node. This node provides information on the uploaded firmware that device drivers may need. diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c index 5b47d6b65275..cbcd7eb83bd1 100644 --- a/arch/mips/sgi-ip27/ip27-smp.c +++ b/arch/mips/sgi-ip27/ip27-smp.c @@ -221,7 +221,7 @@ static void __init ip27_smp_setup(void) * Assumption to be fixed: we're always booted on logical / physical * processor 0. While we're always running on logical processor 0 * this still means this is physical processor zero; it might for - * example be disabled in the firwware. + * example be disabled in the firmware. */ alloc_cpupda(0, 0); } diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index a46c3a21e26d..3a1b7bf03cff 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -686,7 +686,7 @@ tlb_fixup_done: * point. * * There used to be enormous complexity wrt. transferring - * over from the firwmare's trap table to the Linux kernel's. + * over from the firmware's trap table to the Linux kernel's. * For example, there was a chicken & egg problem wrt. building * the OBP page tables, yet needing to be on the Linux kernel * trap table (to translate PAGE_OFFSET addresses) in order to diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c index 88dd2e09832f..ce7551e17ba7 100644 --- a/drivers/net/sb1250-mac.c +++ b/drivers/net/sb1250-mac.c @@ -2299,7 +2299,7 @@ static int sbmac_init(struct platform_device *pldev, long long base) eaddr = sc->sbm_hwaddr; /* - * Read the ethernet address. The firwmare left this programmed + * Read the ethernet address. The firmware left this programmed * for us in the ethernet address register for each mac. */ diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index f7efcecc4108..ed60b18addac 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -11225,7 +11225,7 @@ static int __devinit tg3_phy_probe(struct tg3 *tp) return tg3_phy_init(tp); /* Reading the PHY ID register can conflict with ASF - * firwmare access to the PHY hardware. + * firmware access to the PHY hardware. 
*/ err = 0; if ((tp->tg3_flags & TG3_FLAG_ENABLE_ASF) || diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c index 115b70487502..f4e963ba768b 100644 --- a/drivers/net/wireless/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/ipw2x00/ipw2100.c @@ -2362,7 +2362,7 @@ static void ipw2100_corruption_detected(struct ipw2100_priv *priv, int i) i * sizeof(struct ipw2100_status)); #ifdef IPW2100_DEBUG_C3 - /* Halt the fimrware so we can get a good image */ + /* Halt the firmware so we can get a good image */ write_register(priv->net_dev, IPW_REG_RESET_REG, IPW_AUX_HOST_RESET_REG_STOP_MASTER); j = 5; diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c index b3449948a25a..f6174fdc12bf 100644 --- a/drivers/net/wireless/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/ipw2x00/ipw2200.c @@ -8844,7 +8844,7 @@ static int ipw_wx_set_mode(struct net_device *dev, #endif /* CONFIG_IPW2200_MONITOR */ /* Free the existing firmware and reset the fw_loaded - * flag so ipw_load() will bring in the new firmawre */ + * flag so ipw_load() will bring in the new firmware */ free_firmware(); priv->ieee->iw_mode = wrqu->mode; diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index 663dc83be501..3889158b359c 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -1337,7 +1337,7 @@ static int iwl_read_ucode(struct iwl_priv *priv) /* api_ver should match the api version forming part of the * firmware filename ... but we don't check for that and only rely - * on the API version read from firware header from here on forward */ + * on the API version read from firmware header from here on forward */ if (api_ver < api_min || api_ver > api_max) { IWL_ERR(priv, "Driver unable to support your firmware API. " diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index a71b08ca7c71..9d5f97dd7c73 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -2562,7 +2562,7 @@ static int iwl3945_read_ucode(struct iwl_priv *priv) /* api_ver should match the api version forming part of the * firmware filename ... but we don't check for that and only rely - * on the API version read from firware header from here on forward */ + * on the API version read from firmware header from here on forward */ if (api_ver < api_min || api_ver > api_max) { IWL_ERR(priv, "Driver unable to support your firmware API. " diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c index 639dd02d3d31..8c3605cdc64c 100644 --- a/drivers/net/wireless/libertas/cmd.c +++ b/drivers/net/wireless/libertas/cmd.c @@ -1649,7 +1649,7 @@ static struct cmd_ctrl_node *lbs_get_cmd_ctrl_node(struct lbs_private *priv) /** * @brief This function executes next command in command - * pending queue. It will put fimware back to PS mode + * pending queue. It will put firmware back to PS mode * if applicable. * * @param priv A pointer to struct lbs_private structure diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 6d6120007af4..dab33a21d49a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -550,7 +550,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state) * @dev: PCI device to handle. * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. 
* - * Transition a device to a new power state, using the platform formware and/or + * Transition a device to a new power state, using the platform firmware and/or * the device's PCI PM registers. * * RETURN VALUE: diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index d2433204a40c..814cb6520673 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -5811,7 +5811,7 @@ static struct ibm_struct volume_driver_data = { * ThinkPads from this same time period (and earlier) probably lack the * tachometer as well. * - * Unfortunately a lot of ThinkPads with new-style ECs but whose firwmare + * Unfortunately a lot of ThinkPads with new-style ECs but whose firmware * was never fixed by IBM to report the EC firmware version string * probably support the tachometer (like the early X models), so * detecting it is quite hard. We need more data to know for sure. diff --git a/drivers/staging/otus/hal/hpmain.c b/drivers/staging/otus/hal/hpmain.c index 2e65c466aae8..dab278326931 100644 --- a/drivers/staging/otus/hal/hpmain.c +++ b/drivers/staging/otus/hal/hpmain.c @@ -152,7 +152,7 @@ u16_t zfHpInit(zdev_t* dev, u32_t frequency) else { #ifndef ZM_OTUS_LINUX_PHASE_2 - /* donwload the normal frimware */ + /* download the normal firmware */ if ((ret = zfFirmwareDownload(dev, (u32_t*)zcFwImage, (u32_t)zcFwImageSize, ZM_FIRMWARE_WLAN_ADDR)) != ZM_SUCCESS) { diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index b6483dd98acc..9cf9ff69e3e3 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -626,7 +626,7 @@ static void uea_upload_pre_firmware(const struct firmware *fw_entry, void *conte goto err_fw_corrupted; /* - * Start to upload formware : send reset + * Start to upload firmware : send reset */ value = 1; ret = uea_send_modem_cmd(usb, F8051_USBCS, sizeof(value), &value); diff --git a/drivers/usb/serial/ChangeLog.history b/drivers/usb/serial/ChangeLog.history index c1b279939bbf..f13fd488ebec 100644 --- a/drivers/usb/serial/ChangeLog.history +++ b/drivers/usb/serial/ChangeLog.history @@ -715,7 +715,7 @@ io_edgeport.c Change Log comments: 0.2 (01/30/2000) greg kroah-hartman Milestone 1 release. - Device is found by USB subsystem, enumerated, fimware is downloaded + Device is found by USB subsystem, enumerated, firmware is downloaded and the descriptors are printed to the debug log, config is set, and green light starts to blink. Open port works, and data can be sent and received at the default settings of the UART. Loopback connector diff --git a/include/linux/libata.h b/include/linux/libata.h index 76262d83656b..b450a2628855 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -379,7 +379,7 @@ enum { ATA_HORKAGE_BRIDGE_OK = (1 << 10), /* no bridge limits */ ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands not multiple of 16 bytes */ - ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firwmare update warning */ + ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */ ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */ /* DMA mask for user DMA control: User visible values; DO NOT diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 4a4a206b1979..9d1c1a0de350 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -265,7 +265,7 @@ static int create_image(int platform_mode) * hibernation_snapshot - quiesce devices and create the hibernation * snapshot image. 
* @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the power transition. + * prepare the platform firmware for the power transition. * * Must be called with pm_mutex held */ @@ -378,7 +378,7 @@ static int resume_target_kernel(void) * hibernation_restore - quiesce devices and restore the hibernation * snapshot image. If successful, control returns in hibernation_snaphot() * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the transition. + * prepare the platform firmware for the transition. * * Must be called with pm_mutex held */ diff --git a/sound/oss/pss.c b/sound/oss/pss.c index 16517a5a1301..83f5ee236b12 100644 --- a/sound/oss/pss.c +++ b/sound/oss/pss.c @@ -46,7 +46,7 @@ * load the driver as it did in previous versions. * 04-07-1999: Anthony Barbachan * Added module parameter pss_firmware to allow the user to tell - * the driver where the fireware file is located. The default + * the driver where the firmware file is located. The default * setting is the previous hardcoded setting "/etc/sound/pss_synth". * 00-03-03: Christoph Hellwig * Adapted to module_init/module_exit diff --git a/sound/sh/aica.c b/sound/sh/aica.c index f551233c5a08..583a3693df75 100644 --- a/sound/sh/aica.c +++ b/sound/sh/aica.c @@ -565,7 +565,7 @@ static int load_aica_firmware(void) err = request_firmware(&fw_entry, "aica_firmware.bin", &pd->dev); if (unlikely(err)) return err; - /* write firware into memory */ + /* write firmware into memory */ spu_disable(); spu_memload(0, fw_entry->data, fw_entry->size); spu_enable(); -- cgit v1.2.3-71-gd317 From 5243ef8b54a927cae23216253e4e3f03af6f1446 Mon Sep 17 00:00:00 2001 From: Mark Vels Date: Sun, 18 Jan 2009 18:42:45 +0100 Subject: trivial: PWM: fix of #endif comment Signed-off-by: Mark Vels Signed-off-by: Jiri Kosina --- include/linux/pwm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 3945f803d514..7c775751392c 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -28,4 +28,4 @@ int pwm_enable(struct pwm_device *pwm); */ void pwm_disable(struct pwm_device *pwm); -#endif /* __ASM_ARCH_PWM_H */ +#endif /* __LINUX_PWM_H */ -- cgit v1.2.3-71-gd317 From 21acb9caa2e30b100e9a1943d995bb99d40f4035 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 4 Feb 2009 10:12:08 +0100 Subject: trivial: fix where cgroup documentation is not correctly referred to cgroup documentation was moved to Documentation/cgroups/. There are some places that still refer to Documentation/controllers/, Documentation/cgroups.txt and Documentation/cpusets.txt. Fix those. 
Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Jiri Kosina --- Documentation/00-INDEX | 4 ++-- Documentation/cgroups/00-INDEX | 18 ++++++++++++++++++ Documentation/kernel-parameters.txt | 4 ++-- Documentation/scheduler/sched-rt-group.txt | 2 +- Documentation/vm/numa_memory_policy.txt | 3 ++- Documentation/vm/page_migration | 3 ++- Documentation/x86/x86_64/fake-numa-for-cpusets | 5 +++-- include/linux/cgroup.h | 5 ++++- init/Kconfig | 2 +- 9 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 Documentation/cgroups/00-INDEX (limited to 'include/linux') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 2a39aeba1464..d05737aaa84b 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -86,6 +86,8 @@ cachetlb.txt - describes the cache/TLB flushing interfaces Linux uses. cdrom/ - directory with information on the CD-ROM drivers that Linux has. +cgroups/ + - cgroups features, including cpusets and memory controller. connector/ - docs on the netlink based userspace<->kernel space communication mod. console/ @@ -98,8 +100,6 @@ cpu-load.txt - document describing how CPU load statistics are collected. cpuidle/ - info on CPU_IDLE, CPU idle state management subsystem. -cpusets.txt - - documents the cpusets feature; assign CPUs and Mem to a set of tasks. cputopology.txt - documentation on how CPU topology info is exported via sysfs. cris/ diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX new file mode 100644 index 000000000000..3f58fa3d6d00 --- /dev/null +++ b/Documentation/cgroups/00-INDEX @@ -0,0 +1,18 @@ +00-INDEX + - this file +cgroups.txt + - Control Groups definition, implementation details, examples and API. +cpuacct.txt + - CPU Accounting Controller; account CPU usage for groups of tasks. +cpusets.txt + - documents the cpusets feature; assign CPUs and Mem to a set of tasks. +devices.txt + - Device Whitelist Controller; description, interface and security. +freezer-subsystem.txt + - checkpointing; rationale to not use signals, interface. +memcg_test.txt + - Memory Resource Controller; implementation details. +memory.txt + - Memory Resource Controller; design, accounting, interface, testing. +resource_counter.txt + - Resource Counter API. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index be3bde51b564..755def2cb071 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1593,7 +1593,7 @@ and is between 256 and 4096 characters. It is defined in the file nosoftlockup [KNL] Disable the soft-lockup detector. noswapaccount [KNL] Disable accounting of swap in memory resource - controller. (See Documentation/controllers/memory.txt) + controller. (See Documentation/cgroups/memory.txt) nosync [HW,M68K] Disables sync negotiation for all devices. @@ -1932,7 +1932,7 @@ and is between 256 and 4096 characters. It is defined in the file relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. - See Documentation/cpusets.txt. + See Documentation/cgroups/cpusets.txt. 
reserve= [KNL,BUGS] Force the kernel to ignore some iomem area diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 3ef339f491e0..5ba4d3fc625a 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -126,7 +126,7 @@ This uses the /cgroup virtual file system and "/cgroup//cpu.rt_runtime_u to control the CPU time reserved for each control group instead. For more information on working with control groups, you should read -Documentation/cgroups.txt as well. +Documentation/cgroups/cgroups.txt as well. Group settings are checked against the following limits in order to keep the configuration schedulable: diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt index 6aaaeb38730c..be45dbb9d7f2 100644 --- a/Documentation/vm/numa_memory_policy.txt +++ b/Documentation/vm/numa_memory_policy.txt @@ -8,7 +8,8 @@ The current memory policy support was added to Linux 2.6 around May 2004. This document attempts to describe the concepts and APIs of the 2.6 memory policy support. -Memory policies should not be confused with cpusets (Documentation/cpusets.txt) +Memory policies should not be confused with cpusets +(Documentation/cgroups/cpusets.txt) which is an administrative mechanism for restricting the nodes from which memory may be allocated by a set of processes. Memory policies are a programming interface that a NUMA-aware application can take advantage of. When diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index d5fdfd34bbaf..6513fe2d90b8 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -37,7 +37,8 @@ locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to -move pages when a task is moved to another cpuset (See ../cpusets.txt). +move pages when a task is moved to another cpuset (See +Documentation/cgroups/cpusets.txt). Cpusets allows the automation of process locality. If a task is moved to a new cpuset then also all its pages are moved with it so that the performance of the process does not sink dramatically. Also the pages diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets b/Documentation/x86/x86_64/fake-numa-for-cpusets index 33bb56655991..0f11d9becb0b 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets @@ -7,7 +7,8 @@ you can create fake NUMA nodes that represent contiguous chunks of memory and assign them to cpusets and their attached tasks. This is a way of limiting the amount of system memory that are available to a certain class of tasks. -For more information on the features of cpusets, see Documentation/cpusets.txt. +For more information on the features of cpusets, see +Documentation/cgroups/cpusets.txt. There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. @@ -32,7 +33,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg: On node 3 totalpages: 131072 Now following the instructions for mounting the cpusets filesystem from -Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory +Documentation/cgroups/cpusets.txt, you can assign fake nodes (i.e. 
contiguous memory address spaces) to individual cpusets: [root@xroads /]# mkdir exampleset diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 499900d0cee7..b837631fe499 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -342,7 +342,10 @@ int cgroup_task_count(const struct cgroup *cgrp); /* Return true if the cgroup is a descendant of the current cgroup */ int cgroup_is_descendant(const struct cgroup *cgrp); -/* Control Group subsystem type. See Documentation/cgroups.txt for details */ +/* + * Control Group subsystem type. + * See Documentation/cgroups/cgroups.txt for details + */ struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, diff --git a/init/Kconfig b/init/Kconfig index bcffc0e47647..99eb4196bd0a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -565,7 +565,7 @@ config CGROUP_MEM_RES_CTLR select MM_OWNER help Provides a memory resource controller that manages both anonymous - memory and page cache. (See Documentation/controllers/memory.txt) + memory and page cache. (See Documentation/cgroups/memory.txt) Note that setting this option increases fixed memory overhead associated with each page of memory in the system. By this, -- cgit v1.2.3-71-gd317 From 39db4b8dd16b6d3b56fd3155f309e0eec8481c9a Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 9 Feb 2009 23:07:35 +0100 Subject: trivial: wusb: Storage class should be before const qualifier The C99 specification states in section 6.11.5: The placement of a storage-class specifier other than at the beginning of the declaration specifiers in a declaration is an obsolescent feature. Acked-by: David Vrabel Signed-off-by: Tobias Klauser Signed-off-by: Jiri Kosina --- include/linux/usb/wusb.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/wusb.h b/include/linux/usb/wusb.h index 5f401b644ed5..429c631d2aad 100644 --- a/include/linux/usb/wusb.h +++ b/include/linux/usb/wusb.h @@ -80,8 +80,7 @@ struct wusb_ckhdid { u8 data[16]; } __attribute__((packed)); -const static -struct wusb_ckhdid wusb_ckhdid_zero = { .data = { 0 } }; +static const struct wusb_ckhdid wusb_ckhdid_zero = { .data = { 0 } }; #define WUSB_CKHDID_STRSIZE (3 * sizeof(struct wusb_ckhdid) + 1) -- cgit v1.2.3-71-gd317 From 0cbfdc8648115b2e8451ae9122311d01d2722005 Mon Sep 17 00:00:00 2001 From: Kazuo Moriwaka Date: Tue, 3 Mar 2009 13:01:23 +0100 Subject: trivial: jbd header comment typo fix for JBD_PARANOID_IOFAIL jbd header comment typo fix. Signed-off-by: Kazuo Moriwaka Signed-off-by: Jiri Kosina --- include/linux/jbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 64246dce5663..e8ca681d8acd 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -35,7 +35,7 @@ #define journal_oom_retry 1 /* - * Define JBD_PARANIOD_IOFAIL to cause a kernel BUG() if ext3 finds + * Define JBD_PARANOID_IOFAIL to cause a kernel BUG() if ext3 finds * certain classes of error which can occur due to failed IOs. Under * normal use we want ext3 to continue after such errors, because * hardware _can_ fail, but for debugging purposes when running tests on -- cgit v1.2.3-71-gd317 From 4a6a4499693a419a20559c41e33a7bd70bf20a6f Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 27 Mar 2009 12:24:31 -0600 Subject: Fix a lockdep warning in fasync_helper() Lockdep gripes if file->f_lock is taken in a no-IRQ situation, since that is not always the case. 
We don't really want to disable IRQs for every acquisition of f_lock; instead, just move it outside of fasync_lock. Reported-by: Bartlomiej Zolnierkiewicz Reported-by: Larry Finger Reported-by: Wu Fengguang Signed-off-by: Jonathan Corbet --- fs/fcntl.c | 10 +++++++--- include/linux/fs.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/fcntl.c b/fs/fcntl.c index d865ca66ccba..cc8e4de2fee5 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -531,6 +531,12 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap if (!new) return -ENOMEM; } + + /* + * We need to take f_lock first since it's not an IRQ-safe + * lock. + */ + spin_lock(&filp->f_lock); write_lock_irq(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file == filp) { @@ -555,14 +561,12 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap result = 1; } out: - /* Fix up FASYNC bit while still holding fasync_lock */ - spin_lock(&filp->f_lock); if (on) filp->f_flags |= FASYNC; else filp->f_flags &= ~FASYNC; - spin_unlock(&filp->f_lock); write_unlock_irq(&fasync_lock); + spin_unlock(&filp->f_lock); return result; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 7428c6d35e65..2f13c1d77812 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -848,7 +848,7 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; - spinlock_t f_lock; /* f_ep_links, f_flags */ + spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; -- cgit v1.2.3-71-gd317 From 553d0d839b93550780d1b46e6bcd01a3c5c5883e Mon Sep 17 00:00:00 2001 From: Kyle Guinn Date: Fri, 16 Jan 2009 05:28:38 -0300 Subject: V4L/DVB (10365): Add Mars-Semi MR97310A format The MR97310A is a dual-mode webcam controller that provides compressed BGGR Bayer frames. The decompression algorithm for still images is the same as for video, and is currently implemented in libgphoto2. Signed-off-by: Kyle Guinn Signed-off-by: Jean-Francois Moine Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 5571dbe1c0ad..74aea975b305 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -344,6 +344,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_SPCA508 v4l2_fourcc('S', '5', '0', '8') /* YUVY per line */ #define V4L2_PIX_FMT_SPCA561 v4l2_fourcc('S', '5', '6', '1') /* compressed GBRG bayer */ #define V4L2_PIX_FMT_PAC207 v4l2_fourcc('P', '2', '0', '7') /* compressed BGGR bayer */ +#define V4L2_PIX_FMT_MR97310A v4l2_fourcc('M', '3', '1', '0') /* compressed BGGR bayer */ #define V4L2_PIX_FMT_PJPG v4l2_fourcc('P', 'J', 'P', 'G') /* Pixart 73xx JPEG */ #define V4L2_PIX_FMT_YVYU v4l2_fourcc('Y', 'V', 'Y', 'U') /* 16 YVU 4:2:2 */ -- cgit v1.2.3-71-gd317 From 2c32cc0c1f54d62c7e9ab81d1c1a3aa5b9efd73d Mon Sep 17 00:00:00 2001 From: Sergio Aguirre Date: Tue, 20 Jan 2009 18:34:43 -0300 Subject: V4L/DVB (10575): V4L2: Add COLORFX user control This is a common feature on many cameras. 
the options are: Default colors, B & W, Sepia Signed-off-by: Sergio Aguirre Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 74aea975b305..c64e76a087b4 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -880,8 +880,15 @@ enum v4l2_power_line_frequency { #define V4L2_CID_BACKLIGHT_COMPENSATION (V4L2_CID_BASE+28) #define V4L2_CID_CHROMA_AGC (V4L2_CID_BASE+29) #define V4L2_CID_COLOR_KILLER (V4L2_CID_BASE+30) +#define V4L2_CID_COLORFX (V4L2_CID_BASE+31) +enum v4l2_colorfx { + V4L2_COLORFX_NONE = 0, + V4L2_COLORFX_BW = 1, + V4L2_COLORFX_SEPIA = 2, +}; + /* last CID + 1 */ -#define V4L2_CID_LASTP1 (V4L2_CID_BASE+31) +#define V4L2_CID_LASTP1 (V4L2_CID_BASE+32) /* MPEG-class control IDs defined by V4L2 */ #define V4L2_CID_MPEG_BASE (V4L2_CTRL_CLASS_MPEG | 0x900) -- cgit v1.2.3-71-gd317 From 1a367f3bc3a750b839c5711ecd0c9941e2c5aafa Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 20 Feb 2009 05:55:39 -0300 Subject: V4L/DVB (10686): v4l2: add V4L2_CTRL_FLAG_WRITE_ONLY flag. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index c64e76a087b4..11b8b3ec77b4 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -830,6 +830,7 @@ struct v4l2_querymenu { #define V4L2_CTRL_FLAG_UPDATE 0x0008 #define V4L2_CTRL_FLAG_INACTIVE 0x0010 #define V4L2_CTRL_FLAG_SLIDER 0x0020 +#define V4L2_CTRL_FLAG_WRITE_ONLY 0x0040 /* Query flag, to be ORed with the control ID */ #define V4L2_CTRL_FLAG_NEXT_CTRL 0x80000000 -- cgit v1.2.3-71-gd317 From cc1139c7cdc1455fdf460c33fe63a36524753834 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 26 Feb 2009 23:08:22 -0300 Subject: V4L/DVB(10738a): remove include/linux/video_encoder.h include/linux/video_encoder.h is not used anymore by a v4l driver. Let's remove it and its occurences. Signed-off-by: Mauro Carvalho Chehab --- Documentation/ioctl/ioctl-number.txt | 1 - include/linux/Kbuild | 1 - include/linux/video_encoder.h | 23 ----------------------- 3 files changed, 25 deletions(-) delete mode 100644 include/linux/video_encoder.h (limited to 'include/linux') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index f1d639903325..3a311fe952ed 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -125,7 +125,6 @@ Code Seq# Include File Comments 'd' 00-DF linux/video_decoder.h conflict! 'd' F0-FF linux/digi1.h 'e' all linux/digi1.h conflict! -'e' 00-1F linux/video_encoder.h conflict! 'e' 00-1F net/irda/irtty.h conflict! 
'f' 00-1F linux/ext2_fs.h 'h' 00-7F Charon filesystem diff --git a/include/linux/Kbuild b/include/linux/Kbuild index e9581fd9fb66..da7ff0ba3860 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -159,7 +159,6 @@ header-y += un.h header-y += utime.h header-y += veth.h header-y += video_decoder.h -header-y += video_encoder.h header-y += videotext.h header-y += x25.h diff --git a/include/linux/video_encoder.h b/include/linux/video_encoder.h deleted file mode 100644 index b7b6423bbb8a..000000000000 --- a/include/linux/video_encoder.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _LINUX_VIDEO_ENCODER_H -#define _LINUX_VIDEO_ENCODER_H - -#include - -struct video_encoder_capability { /* this name is too long */ - __u32 flags; -#define VIDEO_ENCODER_PAL 1 /* can encode PAL signal */ -#define VIDEO_ENCODER_NTSC 2 /* can encode NTSC */ -#define VIDEO_ENCODER_SECAM 4 /* can encode SECAM */ -#define VIDEO_ENCODER_CCIR 16 /* CCIR-601 pixel rate (720 pixels per line) instead of square pixel rate */ - int inputs; /* number of inputs */ - int outputs; /* number of outputs */ -}; - -#define ENCODER_GET_CAPABILITIES _IOR('e', 1, struct video_encoder_capability) -#define ENCODER_SET_NORM _IOW('e', 2, int) -#define ENCODER_SET_INPUT _IOW('e', 3, int) /* 0 <= input < #inputs */ -#define ENCODER_SET_OUTPUT _IOW('e', 4, int) /* 0 <= output < #outputs */ -#define ENCODER_ENABLE_OUTPUT _IOW('e', 5, int) /* boolean output enable control */ - - -#endif -- cgit v1.2.3-71-gd317 From 42d12f5aa105af08bc0ed0580e32156a1a325c6b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 10 Mar 2009 05:02:28 -0300 Subject: V4L/DVB (10870a): remove all references for video_decoder.h changeset 04934e44e3784a1b969582e2d59afcec278470c6 removed the last implementation that were still using the V4L1 obsoleted header. Now, video_decoder.h is not used anymore by any driver. Let's remove it and all references for it in Kernel. Signed-off-by: Mauro Carvalho Chehab --- Documentation/feature-removal-schedule.txt | 8 ++--- Documentation/ioctl/ioctl-number.txt | 1 - drivers/media/video/mxb.c | 1 - include/linux/Kbuild | 1 - include/linux/video_decoder.h | 48 ------------------------------ 5 files changed, 4 insertions(+), 55 deletions(-) delete mode 100644 include/linux/video_decoder.h (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 1135996bec8b..5e02b83ac12b 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -64,10 +64,10 @@ Who: Pavel Machek --------------------------- -What: Video4Linux API 1 ioctls and video_decoder.h from Video devices. -When: December 2008 -Files: include/linux/video_decoder.h include/linux/videodev.h -Check: include/linux/video_decoder.h include/linux/videodev.h +What: Video4Linux API 1 ioctls and from Video devices. +When: July 2009 +Files: include/linux/videodev.h +Check: include/linux/videodev.h Why: V4L1 AP1 was replaced by V4L2 API during migration from 2.4 to 2.6 series. The old API have lots of drawbacks and don't provide enough means to work with all video and audio standards. The newer API is diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 3a311fe952ed..1f779a25c703 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -122,7 +122,6 @@ Code Seq# Include File Comments 'c' 00-7F linux/coda.h conflict! 
'c' 80-9F arch/s390/include/asm/chsc.h 'd' 00-FF linux/char/drm/drm/h conflict! -'d' 00-DF linux/video_decoder.h conflict! 'd' F0-FF linux/digi1.h 'e' all linux/digi1.h conflict! 'e' 00-1F net/irda/irtty.h conflict! diff --git a/drivers/media/video/mxb.c b/drivers/media/video/mxb.c index 996011f2aba5..84aec62e8452 100644 --- a/drivers/media/video/mxb.c +++ b/drivers/media/video/mxb.c @@ -25,7 +25,6 @@ #include #include -#include #include #include diff --git a/include/linux/Kbuild b/include/linux/Kbuild index da7ff0ba3860..a67b6227d272 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -158,7 +158,6 @@ header-y += ultrasound.h header-y += un.h header-y += utime.h header-y += veth.h -header-y += video_decoder.h header-y += videotext.h header-y += x25.h diff --git a/include/linux/video_decoder.h b/include/linux/video_decoder.h deleted file mode 100644 index e26c0c86a6ea..000000000000 --- a/include/linux/video_decoder.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef _LINUX_VIDEO_DECODER_H -#define _LINUX_VIDEO_DECODER_H - -#include - -#define HAVE_VIDEO_DECODER 1 - -struct video_decoder_capability { /* this name is too long */ - __u32 flags; -#define VIDEO_DECODER_PAL 1 /* can decode PAL signal */ -#define VIDEO_DECODER_NTSC 2 /* can decode NTSC */ -#define VIDEO_DECODER_SECAM 4 /* can decode SECAM */ -#define VIDEO_DECODER_AUTO 8 /* can autosense norm */ -#define VIDEO_DECODER_CCIR 16 /* CCIR-601 pixel rate (720 pixels per line) instead of square pixel rate */ - int inputs; /* number of inputs */ - int outputs; /* number of outputs */ -}; - -/* -DECODER_GET_STATUS returns the following flags. The only one you need is -DECODER_STATUS_GOOD, the others are just nice things to know. -*/ -#define DECODER_STATUS_GOOD 1 /* receiving acceptable input */ -#define DECODER_STATUS_COLOR 2 /* receiving color information */ -#define DECODER_STATUS_PAL 4 /* auto detected */ -#define DECODER_STATUS_NTSC 8 /* auto detected */ -#define DECODER_STATUS_SECAM 16 /* auto detected */ - -struct video_decoder_init { - unsigned char len; - const unsigned char *data; -}; - -#define DECODER_GET_CAPABILITIES _IOR('d', 1, struct video_decoder_capability) -#define DECODER_GET_STATUS _IOR('d', 2, int) -#define DECODER_SET_NORM _IOW('d', 3, int) -#define DECODER_SET_INPUT _IOW('d', 4, int) /* 0 <= input < #inputs */ -#define DECODER_SET_OUTPUT _IOW('d', 5, int) /* 0 <= output < #outputs */ -#define DECODER_ENABLE_OUTPUT _IOW('d', 6, int) /* boolean output enable control */ -#define DECODER_SET_PICTURE _IOW('d', 7, struct video_picture) -#define DECODER_SET_GPIO _IOW('d', 8, int) /* switch general purpose pin */ -#define DECODER_INIT _IOW('d', 9, struct video_decoder_init) /* init internal registers at once */ -#define DECODER_SET_VBI_BYPASS _IOW('d', 10, int) /* switch vbi bypass */ - -#define DECODER_DUMP _IO('d', 192) /* debug hook */ - - -#endif -- cgit v1.2.3-71-gd317 From 7e0a16f6118a297dd467c1e5a0908429fcdf56af Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 10 Mar 2009 05:31:34 -0300 Subject: V4L/DVB (10907): avoid loading the entire videodev.h header on V4L2 drivers Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/bt8xx/bttv.h | 2 +- drivers/media/video/bt8xx/bttvp.h | 1 - drivers/media/video/cpia2/cpia2_v4l.c | 1 + drivers/media/video/cx23885/cx23885-video.c | 5 ----- drivers/media/video/cx88/cx88-video.c | 5 ----- drivers/media/video/msp3400-driver.c | 2 +- drivers/media/video/ov7670.c | 2 +- drivers/media/video/saa7134/saa7134-video.c | 5 ----- drivers/media/video/saa7146.h | 2 -- 
drivers/media/video/v4l2-ioctl.c | 1 + drivers/media/video/vivi.c | 4 ---- drivers/media/video/w9966.c | 2 +- drivers/media/video/w9968cf.c | 1 + drivers/media/video/zoran/zoran_driver.c | 2 +- include/linux/videodev.h | 18 ++++++++++++++++++ include/media/v4l2-ioctl.h | 1 + 16 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/video/bt8xx/bttv.h b/drivers/media/video/bt8xx/bttv.h index 737a464606a9..e08719b378bd 100644 --- a/drivers/media/video/bt8xx/bttv.h +++ b/drivers/media/video/bt8xx/bttv.h @@ -14,7 +14,7 @@ #ifndef _BTTV_H_ #define _BTTV_H_ -#include +#include #include #include #include diff --git a/drivers/media/video/bt8xx/bttvp.h b/drivers/media/video/bt8xx/bttvp.h index b8274d233fd0..2c0a2cc61d03 100644 --- a/drivers/media/video/bt8xx/bttvp.h +++ b/drivers/media/video/bt8xx/bttvp.h @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/cpia2/cpia2_v4l.c b/drivers/media/video/cpia2/cpia2_v4l.c index 9c25894fdd8e..d4099f5312ac 100644 --- a/drivers/media/video/cpia2/cpia2_v4l.c +++ b/drivers/media/video/cpia2/cpia2_v4l.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "cpia2.h" diff --git a/drivers/media/video/cx23885/cx23885-video.c b/drivers/media/video/cx23885/cx23885-video.c index c2ed2505b725..726602935353 100644 --- a/drivers/media/video/cx23885/cx23885-video.c +++ b/drivers/media/video/cx23885/cx23885-video.c @@ -35,11 +35,6 @@ #include #include -#ifdef CONFIG_VIDEO_V4L1_COMPAT -/* Include V4L1 specific functions. Should be removed soon */ -#include -#endif - MODULE_DESCRIPTION("v4l2 driver module for cx23885 based TV cards"); MODULE_AUTHOR("Steven Toth "); MODULE_LICENSE("GPL"); diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c index 2092e439ef00..5b0fbc602f3e 100644 --- a/drivers/media/video/cx88/cx88-video.c +++ b/drivers/media/video/cx88/cx88-video.c @@ -41,11 +41,6 @@ #include #include -#ifdef CONFIG_VIDEO_V4L1_COMPAT -/* Include V4L1 specific functions. Should be removed soon */ -#include -#endif - MODULE_DESCRIPTION("v4l2 driver module for cx2388x based TV cards"); MODULE_AUTHOR("Gerd Knorr [SuSE Labs]"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/video/msp3400-driver.c b/drivers/media/video/msp3400-driver.c index d972828d1cbe..bca768a1f34c 100644 --- a/drivers/media/video/msp3400-driver.c +++ b/drivers/media/video/msp3400-driver.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/media/video/ov7670.c b/drivers/media/video/ov7670.c index 05c14a29375a..003120c07482 100644 --- a/drivers/media/video/ov7670.c +++ b/drivers/media/video/ov7670.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/media/video/saa7134/saa7134-video.c b/drivers/media/video/saa7134/saa7134-video.c index aa7fa1f73a56..6a4ae89a81a9 100644 --- a/drivers/media/video/saa7134/saa7134-video.c +++ b/drivers/media/video/saa7134/saa7134-video.c @@ -31,11 +31,6 @@ #include "saa7134.h" #include -#ifdef CONFIG_VIDEO_V4L1_COMPAT -/* Include V4L1 specific functions. 
Should be removed soon */ -#include -#endif - /* ------------------------------------------------------------------ */ unsigned int video_debug; diff --git a/drivers/media/video/saa7146.h b/drivers/media/video/saa7146.h index 2830b5e33aec..9fadb331a40b 100644 --- a/drivers/media/video/saa7146.h +++ b/drivers/media/video/saa7146.h @@ -25,8 +25,6 @@ #include #include -#include - #ifndef O_NONCAP #define O_NONCAP O_TRUNC #endif diff --git a/drivers/media/video/v4l2-ioctl.c b/drivers/media/video/v4l2-ioctl.c index 6a7955547474..583f9c158e63 100644 --- a/drivers/media/video/v4l2-ioctl.c +++ b/drivers/media/video/v4l2-ioctl.c @@ -17,6 +17,7 @@ #include #define __OLD_VIDIOC_ /* To allow fixing old calls */ +#include #include #ifdef CONFIG_VIDEO_V4L1 diff --git a/drivers/media/video/vivi.c b/drivers/media/video/vivi.c index 616eb1a8dbee..980620f411f0 100644 --- a/drivers/media/video/vivi.c +++ b/drivers/media/video/vivi.c @@ -28,10 +28,6 @@ #include #include #include -#ifdef CONFIG_VIDEO_V4L1_COMPAT -/* Include V4L1 specific functions. Should be removed soon */ -#include -#endif #include #include #include diff --git a/drivers/media/video/w9966.c b/drivers/media/video/w9966.c index 038ff32b01b8..dcade619cbd8 100644 --- a/drivers/media/video/w9966.c +++ b/drivers/media/video/w9966.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/media/video/w9968cf.c b/drivers/media/video/w9968cf.c index fd5c4c87a73b..2a25580a4b66 100644 --- a/drivers/media/video/w9968cf.c +++ b/drivers/media/video/w9968cf.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include "w9968cf.h" diff --git a/drivers/media/video/zoran/zoran_driver.c b/drivers/media/video/zoran/zoran_driver.c index 2dd8d90aedf9..b7f03d163730 100644 --- a/drivers/media/video/zoran/zoran_driver.c +++ b/drivers/media/video/zoran/zoran_driver.c @@ -59,7 +59,7 @@ #include -#include +#include #include #include #include "videocodec.h" diff --git a/include/linux/videodev.h b/include/linux/videodev.h index 837f392fbe97..b19eab140977 100644 --- a/include/linux/videodev.h +++ b/include/linux/videodev.h @@ -16,6 +16,23 @@ #include #include +#if defined(__MIN_V4L1) && defined (__KERNEL__) + +/* + * Used by those V4L2 core functions that need a minimum V4L1 support, + * in order to allow V4L1 Compatibilty code compilation. + */ + +struct video_mbuf +{ + int size; /* Total memory to map */ + int frames; /* Frames */ + int offsets[VIDEO_MAX_FRAME]; +}; + +#define VIDIOCGMBUF _IOR('v',20, struct video_mbuf) /* Memory map buffer info */ + +#else #if defined(CONFIG_VIDEO_V4L1_COMPAT) || !defined (__KERNEL__) #define VID_TYPE_CAPTURE 1 /* Can capture */ @@ -312,6 +329,7 @@ struct video_code #define VID_PLAY_END_MARK 14 #endif /* CONFIG_VIDEO_V4L1_COMPAT */ +#endif /* __MIN_V4L1 */ #endif /* __LINUX_VIDEODEV_H */ diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h index a8b4c0b678ec..7a4529defa88 100644 --- a/include/media/v4l2-ioctl.h +++ b/include/media/v4l2-ioctl.h @@ -15,6 +15,7 @@ #include #include /* need __user */ #ifdef CONFIG_VIDEO_V4L1_COMPAT +#define __MIN_V4L1 #include #else #include -- cgit v1.2.3-71-gd317 From fbc9fa4e8781170e2fbca2859feda114d4758132 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 6 Mar 2009 09:55:42 -0300 Subject: V4L/DVB (10910): videodev2.h: remove deprecated VIDIOC_G_CHIP_IDENT_OLD As announced VIDIOC_G_CHIP_IDENT_OLD is now removed for 2.6.30. 
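Applications that still relied on the old ioctl should switch to VIDIOC_DBG_G_CHIP_IDENT. The minimal userspace sketch below shows the replacement call; the device node name, the omission of error handling and the exact match-type constant are illustrative assumptions based on the v4l2_dbg_* API of this kernel series, not part of this patch:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

/* Query the host/bridge chip identity with the supported debug ioctl.
 * /dev/video0 is only an example node; error handling is minimal. */
int main(void)
{
        struct v4l2_dbg_chip_ident chip;
        int fd = open("/dev/video0", O_RDWR);

        if (fd < 0)
                return 1;

        memset(&chip, 0, sizeof(chip));
        chip.match.type = V4L2_CHIP_MATCH_HOST;
        chip.match.addr = 0;

        if (ioctl(fd, VIDIOC_DBG_G_CHIP_IDENT, &chip) == 0)
                printf("ident=%u revision=0x%x\n", chip.ident, chip.revision);

        close(fd);
        return 0;
}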
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/v4l2-compat-ioctl32.c | 1 - drivers/media/video/v4l2-ioctl.c | 5 ----- include/linux/videodev2.h | 10 ---------- 3 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/video/v4l2-compat-ioctl32.c b/drivers/media/video/v4l2-compat-ioctl32.c index 110376be5d2b..0056b115b42e 100644 --- a/drivers/media/video/v4l2-compat-ioctl32.c +++ b/drivers/media/video/v4l2-compat-ioctl32.c @@ -1047,7 +1047,6 @@ long v4l2_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg) case VIDIOC_DBG_S_REGISTER: case VIDIOC_DBG_G_REGISTER: case VIDIOC_DBG_G_CHIP_IDENT: - case VIDIOC_G_CHIP_IDENT_OLD: case VIDIOC_S_HW_FREQ_SEEK: ret = do_video_ioctl(file, cmd, arg); break; diff --git a/drivers/media/video/v4l2-ioctl.c b/drivers/media/video/v4l2-ioctl.c index 583f9c158e63..df8d1ff1a577 100644 --- a/drivers/media/video/v4l2-ioctl.c +++ b/drivers/media/video/v4l2-ioctl.c @@ -1692,11 +1692,6 @@ static long __video_do_ioctl(struct file *file, dbgarg(cmd, "chip_ident=%u, revision=0x%x\n", p->ident, p->revision); break; } - case VIDIOC_G_CHIP_IDENT_OLD: - printk(KERN_ERR "VIDIOC_G_CHIP_IDENT has been deprecated and will disappear in 2.6.30.\n"); - printk(KERN_ERR "It is a debugging ioctl and must not be used in applications!\n"); - return -EINVAL; - case VIDIOC_S_HW_FREQ_SEEK: { struct v4l2_hw_freq_seek *p = arg; diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 11b8b3ec77b4..78ba0755ffb3 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1412,14 +1412,6 @@ struct v4l2_dbg_chip_ident { __u32 revision; /* chip revision, chip specific */ } __attribute__ ((packed)); -/* VIDIOC_G_CHIP_IDENT_OLD: Deprecated, do not use */ -struct v4l2_chip_ident_old { - __u32 match_type; /* Match type */ - __u32 match_chip; /* Match this chip, meaning determined by match_type */ - __u32 ident; /* chip identifier as specified in */ - __u32 revision; /* chip revision, chip specific */ -}; - /* * I O C T L C O D E S F O R V I D E O D E V I C E S * @@ -1497,8 +1489,6 @@ struct v4l2_chip_ident_old { /* Experimental, meant for debugging, testing and internal use. Never use this ioctl in applications! */ #define VIDIOC_DBG_G_CHIP_IDENT _IOWR('V', 81, struct v4l2_dbg_chip_ident) -/* This is deprecated and will go away in 2.6.30 */ -#define VIDIOC_G_CHIP_IDENT_OLD _IOWR('V', 81, struct v4l2_chip_ident_old) #endif #define VIDIOC_S_HW_FREQ_SEEK _IOW('V', 82, struct v4l2_hw_freq_seek) -- cgit v1.2.3-71-gd317 From 6273fda6e32e2cd9a478545d0cbc15ac497b1f4b Mon Sep 17 00:00:00 2001 From: Andy Walls Date: Sat, 14 Mar 2009 17:06:07 -0300 Subject: V4L/DVB (11042): v4l2-api: Add definitions for V4L2_MPEG_STREAM_VBI_FMT_IVTV payloads This addition to the v4l2-api add definitions for the constants and data structures used for sliced VBI data insertion into MPEG streams triggered by V4L2_MPEG_STREAM_VBI_FMT_IVTV. This simply declares what the ivtv and cx18 drivers and MythTV have already been doing and provides a proper data structure definition to user space. 
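To make the payload layout concrete, the sketch below shows how a userspace demultiplexer might interpret one such Private Stream 1 payload using the structures added by this patch. The helper name and the assumption that the caller has already stripped the MPEG-2 pack and PES headers are illustrative only:

#include <stdio.h>
#include <string.h>
#include <endian.h>
#include <linux/videodev2.h>

/*
 * Illustrative parser for one V4L2_MPEG_STREAM_VBI_FMT_IVTV payload.
 * 'buf' points at the payload of the MPEG-2 Private Stream 1 PES
 * packet (pack/PES headers already removed by the caller).
 */
static void parse_ivtv_vbi(const unsigned char *buf)
{
        const struct v4l2_mpeg_vbi_fmt_ivtv *vbi = (const void *)buf;
        int i;

        if (!memcmp(vbi->magic, V4L2_MPEG_VBI_IVTV_MAGIC0, 4)) {
                /* "itv0": only the lines flagged in linemask[] are stored,
                 * packed back to back at the start of line[]. */
                int n = 0;

                for (i = 0; i < 64; i++)
                        if (le32toh(vbi->itv0.linemask[i / 32]) & (1u << (i % 32)))
                                n++;

                for (i = 0; i < n && i < 35; i++)
                        printf("itv0 line %d: service id %u\n",
                               i, vbi->itv0.line[i].id);
        } else if (!memcmp(vbi->magic, V4L2_MPEG_VBI_IVTV_MAGIC1, 4)) {
                /* "ITV0": all 36 line slots are always present. */
                for (i = 0; i < 36; i++)
                        printf("ITV0 line %d: service id %u\n",
                               i, vbi->ITV0.line[i].id);
        }
}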
Signed-off-by: Andy Walls Signed-off-by: Mauro Carvalho Chehab --- include/linux/ivtv.h | 10 +++++----- include/linux/videodev2.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ivtv.h b/include/linux/ivtv.h index f2720280b9ec..062d20f74322 100644 --- a/include/linux/ivtv.h +++ b/include/linux/ivtv.h @@ -60,10 +60,10 @@ struct ivtv_dma_frame { #define IVTV_IOC_DMA_FRAME _IOW ('V', BASE_VIDIOC_PRIVATE+0, struct ivtv_dma_frame) -/* These are the VBI types as they appear in the embedded VBI private packets. */ -#define IVTV_SLICED_TYPE_TELETEXT_B (1) -#define IVTV_SLICED_TYPE_CAPTION_525 (4) -#define IVTV_SLICED_TYPE_WSS_625 (5) -#define IVTV_SLICED_TYPE_VPS (7) +/* Deprecated defines: applications should use the defines from videodev2.h */ +#define IVTV_SLICED_TYPE_TELETEXT_B V4L2_MPEG_VBI_IVTV_TELETEXT_B +#define IVTV_SLICED_TYPE_CAPTION_525 V4L2_MPEG_VBI_IVTV_CAPTION_525 +#define IVTV_SLICED_TYPE_WSS_625 V4L2_MPEG_VBI_IVTV_WSS_625 +#define IVTV_SLICED_TYPE_VPS V4L2_MPEG_VBI_IVTV_VPS #endif /* _LINUX_IVTV_H */ diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 78ba0755ffb3..61f1a4921afd 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1347,6 +1347,53 @@ struct v4l2_sliced_vbi_data { __u8 data[48]; }; +/* + * Sliced VBI data inserted into MPEG Streams + */ + +/* + * V4L2_MPEG_STREAM_VBI_FMT_IVTV: + * + * Structure of payload contained in an MPEG 2 Private Stream 1 PES Packet in an + * MPEG-2 Program Pack that contains V4L2_MPEG_STREAM_VBI_FMT_IVTV Sliced VBI + * data + * + * Note, the MPEG-2 Program Pack and Private Stream 1 PES packet header + * definitions are not included here. See the MPEG-2 specifications for details + * on these headers. + */ + +/* Line type IDs */ +#define V4L2_MPEG_VBI_IVTV_TELETEXT_B (1) +#define V4L2_MPEG_VBI_IVTV_CAPTION_525 (4) +#define V4L2_MPEG_VBI_IVTV_WSS_625 (5) +#define V4L2_MPEG_VBI_IVTV_VPS (7) + +struct v4l2_mpeg_vbi_itv0_line { + __u8 id; /* One of V4L2_MPEG_VBI_IVTV_* above */ + __u8 data[42]; /* Sliced VBI data for the line */ +} __attribute__ ((packed)); + +struct v4l2_mpeg_vbi_itv0 { + __le32 linemask[2]; /* Bitmasks of VBI service lines present */ + struct v4l2_mpeg_vbi_itv0_line line[35]; +} __attribute__ ((packed)); + +struct v4l2_mpeg_vbi_ITV0 { + struct v4l2_mpeg_vbi_itv0_line line[36]; +} __attribute__ ((packed)); + +#define V4L2_MPEG_VBI_IVTV_MAGIC0 "itv0" +#define V4L2_MPEG_VBI_IVTV_MAGIC1 "ITV0" + +struct v4l2_mpeg_vbi_fmt_ivtv { + __u8 magic[4]; + union { + struct v4l2_mpeg_vbi_itv0 itv0; + struct v4l2_mpeg_vbi_ITV0 ITV0; + }; +} __attribute__ ((packed)); + /* * A G G R E G A T E S T R U C T U R E S */ -- cgit v1.2.3-71-gd317 From 968cf78285ef03672ae514e9ad7a60919eb97551 Mon Sep 17 00:00:00 2001 From: Devin Heitmueller Date: Wed, 11 Mar 2009 03:00:38 -0300 Subject: V4L/DVB (11065): au8522: add support for analog side of demodulator Add support for the analog functionality in the au8522 analog/digital demodulator Thanks to Michael Krufky and Steven Toth for providing sample hardware, engineering level support, and testing. 
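As a rough illustration of how a bridge driver is expected to drive the new analog side, the sketch below routes composite video on channel 1 through the s_routing subdev ops added here. It assumes the bridge has already registered the au8522 subdevice (not shown) and that struct v4l2_routing and v4l2_subdev_call are available from the v4l2 core headers of this kernel generation; treat it as an approximation, not code taken from this patch:

#include <media/v4l2-common.h>
#include <media/v4l2-device.h>
#include "au8522.h"

/*
 * Sketch: select composite video on channel 1.  'sd' is the au8522
 * v4l2_subdev the bridge registered earlier (registration not shown).
 * Baseband composite carries no SIF audio, so audio is switched off.
 */
static int bridge_select_composite(struct v4l2_subdev *sd)
{
        struct v4l2_routing route;
        int err;

        route.input = AU8522_COMPOSITE_CH1;
        route.output = 0;
        err = v4l2_subdev_call(sd, video, s_routing, &route);
        if (err)
                return err;

        route.input = AU8522_AUDIO_NONE;
        route.output = 0;
        return v4l2_subdev_call(sd, audio, s_routing, &route);
}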
Signed-off-by: Devin Heitmueller Signed-off-by: Michael Krufky [mchehab: renamed drivers/media/video/au8522_decoder.c as drivers/media/dvb/frontends/au8522_decoder.c to avoid breaking bisect] Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb/frontends/Makefile | 2 +- drivers/media/dvb/frontends/au8522.h | 16 + drivers/media/dvb/frontends/au8522_decoder.c | 839 +++++++++++++++++++++++++++ drivers/media/dvb/frontends/au8522_priv.h | 347 +++++++++++ include/linux/i2c-id.h | 1 + 5 files changed, 1204 insertions(+), 1 deletion(-) create mode 100644 drivers/media/dvb/frontends/au8522_decoder.c (limited to 'include/linux') diff --git a/drivers/media/dvb/frontends/Makefile b/drivers/media/dvb/frontends/Makefile index 742826523df5..65a336aa1db6 100644 --- a/drivers/media/dvb/frontends/Makefile +++ b/drivers/media/dvb/frontends/Makefile @@ -8,7 +8,7 @@ EXTRA_CFLAGS += -Idrivers/media/common/tuners/ s921-objs := s921_module.o s921_core.o stb0899-objs = stb0899_drv.o stb0899_algo.o stv0900-objs = stv0900_core.o stv0900_sw.o -au8522-objs = au8522_dig.o +au8522-objs = au8522_dig.o au8522_decoder.o obj-$(CONFIG_DVB_PLL) += dvb-pll.o obj-$(CONFIG_DVB_STV0299) += stv0299.o diff --git a/drivers/media/dvb/frontends/au8522.h b/drivers/media/dvb/frontends/au8522.h index 7b94f554a093..565dcf31af57 100644 --- a/drivers/media/dvb/frontends/au8522.h +++ b/drivers/media/dvb/frontends/au8522.h @@ -74,6 +74,22 @@ struct dvb_frontend *au8522_attach(const struct au8522_config *config, } #endif /* CONFIG_DVB_AU8522 */ +/* Other modes may need to be added later */ +enum au8522_video_input { + AU8522_COMPOSITE_CH1 = 1, + AU8522_COMPOSITE_CH2, + AU8522_COMPOSITE_CH3, + AU8522_COMPOSITE_CH4, + AU8522_COMPOSITE_CH4_SIF, + AU8522_SVIDEO_CH13, + AU8522_SVIDEO_CH24, +}; + +enum au8522_audio_input { + AU8522_AUDIO_NONE, + AU8522_AUDIO_SIF, +}; + #endif /* __AU8522_H__ */ /* diff --git a/drivers/media/dvb/frontends/au8522_decoder.c b/drivers/media/dvb/frontends/au8522_decoder.c new file mode 100644 index 000000000000..e2927c145cd8 --- /dev/null +++ b/drivers/media/dvb/frontends/au8522_decoder.c @@ -0,0 +1,839 @@ +/* + * Auvitek AU8522 QAM/8VSB demodulator driver and video decoder + * + * Copyright (C) 2009 Devin Heitmueller + * Copyright (C) 2005-2008 Auvitek International, Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * As published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +/* Developer notes: + * + * VBI support is not yet working + * Saturation and hue setting are not yet working + * Enough is implemented here for CVBS and S-Video inputs, but the actual + * analog demodulator code isn't implemented (not needed for xc5000 since it + * has its own demodulator and outputs CVBS) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "au8522.h" +#include "au8522_priv.h" + +MODULE_AUTHOR("Devin Heitmueller"); +MODULE_LICENSE("GPL"); + +static int au8522_analog_debug; + +static unsigned short normal_i2c[] = { 0x8e >> 1, I2C_CLIENT_END }; + +module_param_named(analog_debug, au8522_analog_debug, int, 0644); + +MODULE_PARM_DESC(analog_debug, + "Analog debugging messages [0=Off (default) 1=On]"); + +I2C_CLIENT_INSMOD; + +struct au8522_register_config { + u16 reg_name; + u8 reg_val[8]; +}; + + +/* Video Decoder Filter Coefficients + The values are as follows from left to right + 0="ATV RF" 1="ATV RF13" 2="CVBS" 3="S-Video" 4="PAL" 5=CVBS13" 6="SVideo13" +*/ +struct au8522_register_config filter_coef[] = { + {AU8522_FILTER_COEF_R410, {0x25, 0x00, 0x25, 0x25, 0x00, 0x00, 0x00}}, + {AU8522_FILTER_COEF_R411, {0x20, 0x00, 0x20, 0x20, 0x00, 0x00, 0x00}}, + {AU8522_FILTER_COEF_R412, {0x03, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00}}, + {AU8522_FILTER_COEF_R413, {0xe6, 0x00, 0xe6, 0xe6, 0x00, 0x00, 0x00}}, + {AU8522_FILTER_COEF_R414, {0x40, 0x00, 0x40, 0x40, 0x00, 0x00, 0x00}}, + {AU8522_FILTER_COEF_R415, {0x1b, 0x00, 0x1b, 0x1b, 0x00, 0x00, 0x00}}, + {AU8522_FILTER_COEF_R416, {0xc0, 0x00, 0xc0, 0x04, 0x00, 0x00, 0x00}}, + {AU8522_FILTER_COEF_R417, {0x04, 0x00, 0x04, 0x04, 0x00, 0x00, 0x00}}, + {AU8522_FILTER_COEF_R418, {0x8c, 0x00, 0x8c, 0x8c, 0x00, 0x00, 0x00}}, + {AU8522_FILTER_COEF_R419, {0xa0, 0x40, 0xa0, 0xa0, 0x40, 0x40, 0x40}}, + {AU8522_FILTER_COEF_R41A, {0x21, 0x09, 0x21, 0x21, 0x09, 0x09, 0x09}}, + {AU8522_FILTER_COEF_R41B, {0x6c, 0x38, 0x6c, 0x6c, 0x38, 0x38, 0x38}}, + {AU8522_FILTER_COEF_R41C, {0x03, 0xff, 0x03, 0x03, 0xff, 0xff, 0xff}}, + {AU8522_FILTER_COEF_R41D, {0xbf, 0xc7, 0xbf, 0xbf, 0xc7, 0xc7, 0xc7}}, + {AU8522_FILTER_COEF_R41E, {0xa0, 0xdf, 0xa0, 0xa0, 0xdf, 0xdf, 0xdf}}, + {AU8522_FILTER_COEF_R41F, {0x10, 0x06, 0x10, 0x10, 0x06, 0x06, 0x06}}, + {AU8522_FILTER_COEF_R420, {0xae, 0x30, 0xae, 0xae, 0x30, 0x30, 0x30}}, + {AU8522_FILTER_COEF_R421, {0xc4, 0x01, 0xc4, 0xc4, 0x01, 0x01, 0x01}}, + {AU8522_FILTER_COEF_R422, {0x54, 0xdd, 0x54, 0x54, 0xdd, 0xdd, 0xdd}}, + {AU8522_FILTER_COEF_R423, {0xd0, 0xaf, 0xd0, 0xd0, 0xaf, 0xaf, 0xaf}}, + {AU8522_FILTER_COEF_R424, {0x1c, 0xf7, 0x1c, 0x1c, 0xf7, 0xf7, 0xf7}}, + {AU8522_FILTER_COEF_R425, {0x76, 0xdb, 0x76, 0x76, 0xdb, 0xdb, 0xdb}}, + {AU8522_FILTER_COEF_R426, {0x61, 0xc0, 0x61, 0x61, 0xc0, 0xc0, 0xc0}}, + {AU8522_FILTER_COEF_R427, {0xd1, 0x2f, 0xd1, 0xd1, 0x2f, 0x2f, 0x2f}}, + {AU8522_FILTER_COEF_R428, {0x84, 0xd8, 0x84, 0x84, 0xd8, 0xd8, 0xd8}}, + {AU8522_FILTER_COEF_R429, {0x06, 0xfb, 0x06, 0x06, 0xfb, 0xfb, 0xfb}}, + {AU8522_FILTER_COEF_R42A, {0x21, 0xd5, 0x21, 0x21, 0xd5, 0xd5, 0xd5}}, + {AU8522_FILTER_COEF_R42B, {0x0a, 0x3e, 0x0a, 0x0a, 0x3e, 0x3e, 0x3e}}, + {AU8522_FILTER_COEF_R42C, {0xe6, 0x15, 0xe6, 0xe6, 0x15, 0x15, 0x15}}, + {AU8522_FILTER_COEF_R42D, {0x01, 0x34, 0x01, 0x01, 0x34, 0x34, 0x34}}, + +}; +#define NUM_FILTER_COEF (sizeof (filter_coef) / sizeof(struct au8522_register_config)) + + +/* Registers 0x060b through 0x0652 are the LP Filter coefficients + The values are as follows from left to right + 0="SIF" 1="ATVRF/ATVRF13" + Note: the 
"ATVRF/ATVRF13" mode has never been tested +*/ +struct au8522_register_config lpfilter_coef[] = { + {0x060b, {0x21, 0x0b}}, + {0x060c, {0xad, 0xad}}, + {0x060d, {0x70, 0xf0}}, + {0x060e, {0xea, 0xe9}}, + {0x060f, {0xdd, 0xdd}}, + {0x0610, {0x08, 0x64}}, + {0x0611, {0x60, 0x60}}, + {0x0612, {0xf8, 0xb2}}, + {0x0613, {0x01, 0x02}}, + {0x0614, {0xe4, 0xb4}}, + {0x0615, {0x19, 0x02}}, + {0x0616, {0xae, 0x2e}}, + {0x0617, {0xee, 0xc5}}, + {0x0618, {0x56, 0x56}}, + {0x0619, {0x30, 0x58}}, + {0x061a, {0xf9, 0xf8}}, + {0x061b, {0x24, 0x64}}, + {0x061c, {0x07, 0x07}}, + {0x061d, {0x30, 0x30}}, + {0x061e, {0xa9, 0xed}}, + {0x061f, {0x09, 0x0b}}, + {0x0620, {0x42, 0xc2}}, + {0x0621, {0x1d, 0x2a}}, + {0x0622, {0xd6, 0x56}}, + {0x0623, {0x95, 0x8b}}, + {0x0624, {0x2b, 0x2b}}, + {0x0625, {0x30, 0x24}}, + {0x0626, {0x3e, 0x3e}}, + {0x0627, {0x62, 0xe2}}, + {0x0628, {0xe9, 0xf5}}, + {0x0629, {0x99, 0x19}}, + {0x062a, {0xd4, 0x11}}, + {0x062b, {0x03, 0x04}}, + {0x062c, {0xb5, 0x85}}, + {0x062d, {0x1e, 0x20}}, + {0x062e, {0x2a, 0xea}}, + {0x062f, {0xd7, 0xd2}}, + {0x0630, {0x15, 0x15}}, + {0x0631, {0xa3, 0xa9}}, + {0x0632, {0x1f, 0x1f}}, + {0x0633, {0xf9, 0xd1}}, + {0x0634, {0xc0, 0xc3}}, + {0x0635, {0x4d, 0x8d}}, + {0x0636, {0x21, 0x31}}, + {0x0637, {0x83, 0x83}}, + {0x0638, {0x08, 0x8c}}, + {0x0639, {0x19, 0x19}}, + {0x063a, {0x45, 0xa5}}, + {0x063b, {0xef, 0xec}}, + {0x063c, {0x8a, 0x8a}}, + {0x063d, {0xf4, 0xf6}}, + {0x063e, {0x8f, 0x8f}}, + {0x063f, {0x44, 0x0c}}, + {0x0640, {0xef, 0xf0}}, + {0x0641, {0x66, 0x66}}, + {0x0642, {0xcc, 0xd2}}, + {0x0643, {0x41, 0x41}}, + {0x0644, {0x63, 0x93}}, + {0x0645, {0x8e, 0x8e}}, + {0x0646, {0xa2, 0x42}}, + {0x0647, {0x7b, 0x7b}}, + {0x0648, {0x04, 0x04}}, + {0x0649, {0x00, 0x00}}, + {0x064a, {0x40, 0x40}}, + {0x064b, {0x8c, 0x98}}, + {0x064c, {0x00, 0x00}}, + {0x064d, {0x63, 0xc3}}, + {0x064e, {0x04, 0x04}}, + {0x064f, {0x20, 0x20}}, + {0x0650, {0x00, 0x00}}, + {0x0651, {0x40 ,0x40}}, + {0x0652, {0x01, 0x01}}, +}; +#define NUM_LPFILTER_COEF (sizeof (lpfilter_coef) / sizeof(struct au8522_register_config)) + +static inline struct au8522_state *to_state(struct v4l2_subdev *sd) +{ + return container_of(sd, struct au8522_state, sd); +} + +static void setup_vbi(struct au8522_state *state, int aud_input) +{ + int i; + + /* These are set to zero regardless of what mode we're in */ + au8522_writereg(state, AU8522_TVDEC_VBI_CTRL_H_REG017H, 0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_CTRL_L_REG018H, 0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_TOTAL_BITS_REG019H, 0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_TUNIT_H_REG01AH, 0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_TUNIT_L_REG01BH, 0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_THRESH1_REG01CH, 0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_FRAME_PAT2_REG01EH, 0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_FRAME_PAT1_REG01FH, 0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_FRAME_PAT0_REG020H, 0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_FRAME_MASK2_REG021H,0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_FRAME_MASK1_REG022H,0x00); + au8522_writereg(state, AU8522_TVDEC_VBI_USER_FRAME_MASK0_REG023H,0x00); + + /* Setup the VBI registers */ + for (i = 0x30; i < 0x60; i++) { + au8522_writereg(state, i, 0x40); + } + /* For some reason, every register is 0x40 except register 0x44 + (confirmed via the HVR-950q USB capture) */ + au8522_writereg(state, 0x44, 0x60); + + /* Enable VBI (we always do this regardless of whether the user is + viewing closed caption 
info) */ + au8522_writereg(state, AU8522_TVDEC_VBI_CTRL_H_REG017H, + AU8522_TVDEC_VBI_CTRL_H_REG017H_CCON); + +} + +static void setup_decoder_defaults(struct au8522_state *state, u8 input_mode) +{ + int i; + int filter_coef_type; + + /* Provide reasonable defaults for picture tuning values */ + au8522_writereg(state, AU8522_TVDEC_SHARPNESSREG009H, 0x07); + au8522_writereg(state, AU8522_TVDEC_BRIGHTNESS_REG00AH, 0xed); + state->brightness = 0xed - 128; + au8522_writereg(state, AU8522_TVDEC_CONTRAST_REG00BH, 0x79); + state->contrast = 0x79; + au8522_writereg(state, AU8522_TVDEC_SATURATION_CB_REG00CH, 0x80); + au8522_writereg(state, AU8522_TVDEC_SATURATION_CR_REG00DH, 0x80); + au8522_writereg(state, AU8522_TVDEC_HUE_H_REG00EH, 0x00); + au8522_writereg(state, AU8522_TVDEC_HUE_L_REG00FH, 0x00); + + /* Other decoder registers */ + au8522_writereg(state, AU8522_TVDEC_INT_MASK_REG010H, 0x00); + + if (input_mode == 0x23) { + /* S-Video input mapping */ + au8522_writereg(state, AU8522_VIDEO_MODE_REG011H, 0x04); + } else { + /* All other modes (CVBS/ATVRF etc.) */ + au8522_writereg(state, AU8522_VIDEO_MODE_REG011H, 0x00); + } + + au8522_writereg(state, AU8522_TVDEC_PGA_REG012H, + AU8522_TVDEC_PGA_REG012H_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_MODE_REG015H, + AU8522_TVDEC_COMB_MODE_REG015H_CVBS); + au8522_writereg(state, AU8522_TVDED_DBG_MODE_REG060H, + AU8522_TVDED_DBG_MODE_REG060H_CVBS); + au8522_writereg(state, AU8522_TVDEC_FORMAT_CTRL1_REG061H, + AU8522_TVDEC_FORMAT_CTRL1_REG061H_CVBS13); + au8522_writereg(state, AU8522_TVDEC_FORMAT_CTRL2_REG062H, + AU8522_TVDEC_FORMAT_CTRL2_REG062H_CVBS13); + au8522_writereg(state, AU8522_TVDEC_VCR_DET_LLIM_REG063H, + AU8522_TVDEC_VCR_DET_LLIM_REG063H_CVBS); + au8522_writereg(state, AU8522_TVDEC_VCR_DET_HLIM_REG064H, + AU8522_TVDEC_VCR_DET_HLIM_REG064H_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_VDIF_THR1_REG065H, + AU8522_TVDEC_COMB_VDIF_THR1_REG065H_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_VDIF_THR2_REG066H, + AU8522_TVDEC_COMB_VDIF_THR2_REG066H_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_VDIF_THR3_REG067H, + AU8522_TVDEC_COMB_VDIF_THR3_REG067H_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_NOTCH_THR_REG068H, + AU8522_TVDEC_COMB_NOTCH_THR_REG068H_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_HDIF_THR1_REG069H, + AU8522_TVDEC_COMB_HDIF_THR1_REG069H_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_HDIF_THR2_REG06AH, + AU8522_TVDEC_COMB_HDIF_THR2_REG06AH_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_HDIF_THR3_REG06BH, + AU8522_TVDEC_COMB_HDIF_THR3_REG06BH_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_DCDIF_THR1_REG06CH, + AU8522_TVDEC_COMB_DCDIF_THR1_REG06CH_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_DCDIF_THR2_REG06DH, + AU8522_TVDEC_COMB_DCDIF_THR2_REG06DH_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_DCDIF_THR3_REG06EH, + AU8522_TVDEC_COMB_DCDIF_THR3_REG06EH_CVBS); + au8522_writereg(state, AU8522_TVDEC_UV_SEP_THR_REG06FH, + AU8522_TVDEC_UV_SEP_THR_REG06FH_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_DC_THR1_NTSC_REG070H, + AU8522_TVDEC_COMB_DC_THR1_NTSC_REG070H_CVBS); + au8522_writereg(state, AU8522_REG071H, AU8522_REG071H_CVBS); + au8522_writereg(state, AU8522_REG072H, AU8522_REG072H_CVBS); + au8522_writereg(state, AU8522_TVDEC_COMB_DC_THR2_NTSC_REG073H, + AU8522_TVDEC_COMB_DC_THR2_NTSC_REG073H_CVBS); + au8522_writereg(state, AU8522_REG074H, AU8522_REG074H_CVBS); + au8522_writereg(state, AU8522_REG075H, AU8522_REG075H_CVBS); + au8522_writereg(state, AU8522_TVDEC_DCAGC_CTRL_REG077H, + 
AU8522_TVDEC_DCAGC_CTRL_REG077H_CVBS); + au8522_writereg(state, AU8522_TVDEC_PIC_START_ADJ_REG078H, + AU8522_TVDEC_PIC_START_ADJ_REG078H_CVBS); + au8522_writereg(state, AU8522_TVDEC_AGC_HIGH_LIMIT_REG079H, + AU8522_TVDEC_AGC_HIGH_LIMIT_REG079H_CVBS); + au8522_writereg(state, AU8522_TVDEC_MACROVISION_SYNC_THR_REG07AH, + AU8522_TVDEC_MACROVISION_SYNC_THR_REG07AH_CVBS); + au8522_writereg(state, AU8522_TVDEC_INTRP_CTRL_REG07BH, + AU8522_TVDEC_INTRP_CTRL_REG07BH_CVBS); + au8522_writereg(state, AU8522_TVDEC_AGC_LOW_LIMIT_REG0E4H, + AU8522_TVDEC_AGC_LOW_LIMIT_REG0E4H_CVBS); + au8522_writereg(state, AU8522_TOREGAAGC_REG0E5H, + AU8522_TOREGAAGC_REG0E5H_CVBS); + au8522_writereg(state, AU8522_REG016H, AU8522_REG016H_CVBS); + + setup_vbi(state, 0); + + if (input_mode == AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH13 || + input_mode == AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH24) { + /* Despite what the table says, for the HVR-950q we still need + to be in CVBS mode for the S-Video input (reason uknown). */ + /* filter_coef_type = 3; */ + filter_coef_type = 5; + } else { + filter_coef_type = 5; + } + + /* Load the Video Decoder Filter Coefficients */ + for (i = 0; i < NUM_FILTER_COEF; i++) { + au8522_writereg(state, filter_coef[i].reg_name, + filter_coef[i].reg_val[filter_coef_type]); + } + + /* It's not clear what these registers are for, but they are always + set to the same value regardless of what mode we're in */ + au8522_writereg(state, AU8522_REG42EH, 0x87); + au8522_writereg(state, AU8522_REG42FH, 0xa2); + au8522_writereg(state, AU8522_REG430H, 0xbf); + au8522_writereg(state, AU8522_REG431H, 0xcb); + au8522_writereg(state, AU8522_REG432H, 0xa1); + au8522_writereg(state, AU8522_REG433H, 0x41); + au8522_writereg(state, AU8522_REG434H, 0x88); + au8522_writereg(state, AU8522_REG435H, 0xc2); + au8522_writereg(state, AU8522_REG436H, 0x3c); +} + +static void au8522_setup_cvbs_mode(struct au8522_state *state) +{ + /* here we're going to try the pre-programmed route */ + au8522_writereg(state, AU8522_MODULE_CLOCK_CONTROL_REG0A3H, + AU8522_MODULE_CLOCK_CONTROL_REG0A3H_CVBS); + + au8522_writereg(state, AU8522_PGA_CONTROL_REG082H, 0x00); + au8522_writereg(state, AU8522_CLAMPING_CONTROL_REG083H, 0x0e); + au8522_writereg(state, AU8522_PGA_CONTROL_REG082H, 0x10); + + au8522_writereg(state, AU8522_INPUT_CONTROL_REG081H, + AU8522_INPUT_CONTROL_REG081H_CVBS_CH1); + + setup_decoder_defaults(state, AU8522_INPUT_CONTROL_REG081H_CVBS_CH1); + + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H, + AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CVBS); +} + +static void au8522_setup_cvbs_tuner_mode(struct au8522_state *state) +{ + /* here we're going to try the pre-programmed route */ + au8522_writereg(state, AU8522_MODULE_CLOCK_CONTROL_REG0A3H, + AU8522_MODULE_CLOCK_CONTROL_REG0A3H_CVBS); + + /* It's not clear why they turn off the PGA before enabling the clamp + control, but the Windows trace does it so we will too... 
*/ + au8522_writereg(state, AU8522_PGA_CONTROL_REG082H, 0x00); + + /* Enable clamping control */ + au8522_writereg(state, AU8522_CLAMPING_CONTROL_REG083H, 0x0e); + + /* Turn on the PGA */ + au8522_writereg(state, AU8522_PGA_CONTROL_REG082H, 0x10); + + /* Set input mode to CVBS on channel 4 with SIF audio input enabled */ + au8522_writereg(state, AU8522_INPUT_CONTROL_REG081H, + AU8522_INPUT_CONTROL_REG081H_CVBS_CH4_SIF); + + setup_decoder_defaults(state, + AU8522_INPUT_CONTROL_REG081H_CVBS_CH4_SIF); + + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H, + AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CVBS); +} + +static void au8522_setup_svideo_mode(struct au8522_state *state) +{ + au8522_writereg(state, AU8522_MODULE_CLOCK_CONTROL_REG0A3H, + AU8522_MODULE_CLOCK_CONTROL_REG0A3H_SVIDEO); + + /* Set input to Y on Channe1, C on Channel 3 */ + au8522_writereg(state, AU8522_INPUT_CONTROL_REG081H, + AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH13); + + /* Disable clamping control (required for S-video) */ + au8522_writereg(state, AU8522_CLAMPING_CONTROL_REG083H, 0x00); + + setup_decoder_defaults(state, + AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH13); + + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H, + AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CVBS); +} + +/* ----------------------------------------------------------------------- */ + +static void disable_audio_input(struct au8522_state *state) +{ + /* This can probably be optimized */ + au8522_writereg(state, AU8522_AUDIO_VOLUME_L_REG0F2H, 0x00); + au8522_writereg(state, AU8522_AUDIO_VOLUME_R_REG0F3H, 0x00); + au8522_writereg(state, AU8522_AUDIO_VOLUME_REG0F4H, 0x00); + au8522_writereg(state, AU8522_I2C_CONTROL_REG1_REG091H, 0x80); + au8522_writereg(state, AU8522_I2C_CONTROL_REG0_REG090H, 0x84); + + au8522_writereg(state, AU8522_ENA_USB_REG101H, 0x00); + au8522_writereg(state, AU8522_AUDIO_VOLUME_L_REG0F2H, 0x7F); + au8522_writereg(state, AU8522_AUDIO_VOLUME_R_REG0F3H, 0x7F); + au8522_writereg(state, AU8522_REG0F9H, AU8522_REG0F9H_AUDIO); + au8522_writereg(state, AU8522_AUDIO_MODE_REG0F1H, 0x40); + + au8522_writereg(state, AU8522_GPIO_DATA_REG0E2H, 0x11); + msleep(5); + au8522_writereg(state, AU8522_GPIO_DATA_REG0E2H, 0x00); + + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H, 0x04); + au8522_writereg(state, AU8522_AUDIOFREQ_REG606H, 0x03); + au8522_writereg(state, AU8522_I2S_CTRL_2_REG112H, 0x02); + + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H, + AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CVBS); +} + +/* 0=disable, 1=SIF */ +static void set_audio_input(struct au8522_state *state, int aud_input) +{ + int i; + + /* Note that this function needs to be used in conjunction with setting + the input routing via register 0x81 */ + + if (aud_input == AU8522_AUDIO_NONE) { + disable_audio_input(state); + return; + } + + if (aud_input != AU8522_AUDIO_SIF) { + /* The caller asked for a mode we don't currently support */ + printk("Unsupported audio mode requested! 
mode=%d\n", + aud_input); + return; + } + + /* Load the Audio Decoder Filter Coefficients */ + for (i = 0; i < NUM_LPFILTER_COEF; i++) { + au8522_writereg(state, lpfilter_coef[i].reg_name, + lpfilter_coef[i].reg_val[0]); + } + + /* Setup audio */ + au8522_writereg(state, AU8522_AUDIO_VOLUME_L_REG0F2H, 0x00); + au8522_writereg(state, AU8522_AUDIO_VOLUME_R_REG0F3H, 0x00); + au8522_writereg(state, AU8522_AUDIO_VOLUME_REG0F4H, 0x00); + au8522_writereg(state, AU8522_I2C_CONTROL_REG1_REG091H, 0x80); + au8522_writereg(state, AU8522_I2C_CONTROL_REG0_REG090H, 0x84); + msleep(150); + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H, 0x00); + msleep(1); + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H, 0x9d); + msleep(50); + au8522_writereg(state, AU8522_AUDIO_VOLUME_L_REG0F2H, 0x7F); + au8522_writereg(state, AU8522_AUDIO_VOLUME_R_REG0F3H, 0x7F); + au8522_writereg(state, AU8522_AUDIO_VOLUME_REG0F4H, 0xff); + msleep(80); + au8522_writereg(state, AU8522_AUDIO_VOLUME_L_REG0F2H, 0x7F); + au8522_writereg(state, AU8522_AUDIO_VOLUME_R_REG0F3H, 0x7F); + au8522_writereg(state, AU8522_REG0F9H, AU8522_REG0F9H_AUDIO); + au8522_writereg(state, AU8522_AUDIO_MODE_REG0F1H, 0x82); + msleep(70); + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H, 0x09); + au8522_writereg(state, AU8522_AUDIOFREQ_REG606H, 0x03); + au8522_writereg(state, AU8522_I2S_CTRL_2_REG112H, 0xc2); +} + +/* ----------------------------------------------------------------------- */ + +static int au8522_s_ctrl(struct v4l2_subdev *sd, struct v4l2_control *ctrl) +{ + struct au8522_state *state = to_state(sd); + + switch (ctrl->id) { + case V4L2_CID_BRIGHTNESS: + state->brightness = ctrl->value; + au8522_writereg(state, AU8522_TVDEC_BRIGHTNESS_REG00AH, + ctrl->value - 128); + break; + case V4L2_CID_CONTRAST: + state->contrast = ctrl->value; + au8522_writereg(state, AU8522_TVDEC_CONTRAST_REG00BH, + ctrl->value); + break; + case V4L2_CID_SATURATION: + case V4L2_CID_HUE: + case V4L2_CID_AUDIO_VOLUME: + case V4L2_CID_AUDIO_BASS: + case V4L2_CID_AUDIO_TREBLE: + case V4L2_CID_AUDIO_BALANCE: + case V4L2_CID_AUDIO_MUTE: + /* Not yet implemented */ + default: + return -EINVAL; + } + + return 0; +} + +static int au8522_g_ctrl(struct v4l2_subdev *sd, struct v4l2_control *ctrl) +{ + struct au8522_state *state = to_state(sd); + + /* Note that we are using values cached in the state structure instead + of reading the registers due to issues with i2c reads not working + properly/consistently yet on the HVR-950q */ + + switch (ctrl->id) { + case V4L2_CID_BRIGHTNESS: + ctrl->value = state->brightness; + break; + case V4L2_CID_CONTRAST: + ctrl->value = state->contrast; + break; + case V4L2_CID_SATURATION: + case V4L2_CID_HUE: + case V4L2_CID_AUDIO_VOLUME: + case V4L2_CID_AUDIO_BASS: + case V4L2_CID_AUDIO_TREBLE: + case V4L2_CID_AUDIO_BALANCE: + case V4L2_CID_AUDIO_MUTE: + /* Not yet supported */ + default: + return -EINVAL; + } + + return 0; +} + +/* ----------------------------------------------------------------------- */ + +static int au8522_g_fmt(struct v4l2_subdev *sd, struct v4l2_format *fmt) +{ + switch (fmt->type) { + default: + return -EINVAL; + } + return 0; +} + +static int au8522_s_fmt(struct v4l2_subdev *sd, struct v4l2_format *fmt) +{ + switch (fmt->type) { + case V4L2_BUF_TYPE_VIDEO_CAPTURE: + /* Not yet implemented */ + break; + default: + return -EINVAL; + } + + return 0; +} + +/* ----------------------------------------------------------------------- */ + +#ifdef CONFIG_VIDEO_ADV_DEBUG +static int 
au8522_g_register(struct v4l2_subdev *sd, + struct v4l2_dbg_register *reg) +{ + struct i2c_client *client = v4l2_get_subdevdata(sd); + struct au8522_state *state = to_state(sd); + + if (!v4l2_chip_match_i2c_client(client, ®->match)) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + reg->val = au8522_readreg(state, reg->reg & 0xffff); + return 0; +} + +static int au8522_s_register(struct v4l2_subdev *sd, + struct v4l2_dbg_register *reg) +{ + struct i2c_client *client = v4l2_get_subdevdata(sd); + struct au8522_state *state = to_state(sd); + + if (!v4l2_chip_match_i2c_client(client, ®->match)) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + au8522_writereg(state, reg->reg, reg->val & 0xff); + return 0; +} +#endif + +static int au8522_s_stream(struct v4l2_subdev *sd, int enable) +{ + struct au8522_state *state = to_state(sd); + + if (enable) { + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H, + 0x01); + msleep(1); + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H, + AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CVBS); + } else { + /* This does not completely power down the device + (it only reduces it from around 140ma to 80ma) */ + au8522_writereg(state, AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H, + 1 << 5); + } + return 0; +} + +static int au8522_queryctrl(struct v4l2_subdev *sd, struct v4l2_queryctrl *qc) +{ + switch (qc->id) { + case V4L2_CID_CONTRAST: + return v4l2_ctrl_query_fill(qc, 0, 255, 1, + AU8522_TVDEC_CONTRAST_REG00BH_CVBS); + case V4L2_CID_BRIGHTNESS: + return v4l2_ctrl_query_fill(qc, 0, 255, 1, 128); + case V4L2_CID_SATURATION: + case V4L2_CID_HUE: + /* Not yet implemented */ + default: + break; + } + + return -EINVAL; +} + +static int au8522_reset(struct v4l2_subdev *sd, u32 val) +{ + struct au8522_state *state = to_state(sd); + + au8522_writereg(state, 0xa4, 1 << 5); + + return 0; +} + +static int au8522_s_video_routing(struct v4l2_subdev *sd, + const struct v4l2_routing *route) +{ + struct au8522_state *state = to_state(sd); + + au8522_reset(sd, 0); + + /* Jam open the i2c gate to the tuner. 
We do this here to handle the + case where the user went into digital mode (causing the gate to be + closed), and then came back to analog mode */ + au8522_writereg(state, 0x106, 1); + + if (route->input == AU8522_COMPOSITE_CH1) { + au8522_setup_cvbs_mode(state); + } else if (route->input == AU8522_SVIDEO_CH13) { + au8522_setup_svideo_mode(state); + } else if (route->input == AU8522_COMPOSITE_CH4_SIF) { + au8522_setup_cvbs_tuner_mode(state); + } else { + printk("au8522 mode not currently supported\n"); + return -EINVAL; + } + return 0; +} + +static int au8522_s_audio_routing(struct v4l2_subdev *sd, + const struct v4l2_routing *route) +{ + struct au8522_state *state = to_state(sd); + set_audio_input(state, route->input); + return 0; +} + +static int au8522_g_tuner(struct v4l2_subdev *sd, struct v4l2_tuner *vt) +{ + int val = 0; + struct au8522_state *state = to_state(sd); + u8 lock_status; + + /* Interrogate the decoder to see if we are getting a real signal */ + lock_status = au8522_readreg(state, 0x00); + if (lock_status == 0xa2) + vt->signal = 0x01; + else + vt->signal = 0x00; + + vt->capability |= + V4L2_TUNER_CAP_STEREO | V4L2_TUNER_CAP_LANG1 | + V4L2_TUNER_CAP_LANG2 | V4L2_TUNER_CAP_SAP; + + val = V4L2_TUNER_SUB_MONO; + vt->rxsubchans = val; + vt->audmode = V4L2_TUNER_MODE_STEREO; + return 0; +} + +static int au8522_g_chip_ident(struct v4l2_subdev *sd, + struct v4l2_dbg_chip_ident *chip) +{ + struct au8522_state *state = to_state(sd); + struct i2c_client *client = v4l2_get_subdevdata(sd); + + return v4l2_chip_ident_i2c_client(client, chip, state->id, state->rev); +} + +static int au8522_log_status(struct v4l2_subdev *sd) +{ + /* FIXME: Add some status info here */ + return 0; +} + +static int au8522_command(struct i2c_client *client, unsigned cmd, void *arg) +{ + return v4l2_subdev_command(i2c_get_clientdata(client), cmd, arg); +} + +/* ----------------------------------------------------------------------- */ + +static const struct v4l2_subdev_core_ops au8522_core_ops = { + .log_status = au8522_log_status, + .g_chip_ident = au8522_g_chip_ident, + .g_ctrl = au8522_g_ctrl, + .s_ctrl = au8522_s_ctrl, + .queryctrl = au8522_queryctrl, + .reset = au8522_reset, +#ifdef CONFIG_VIDEO_ADV_DEBUG + .g_register = au8522_g_register, + .s_register = au8522_s_register, +#endif +}; + +static const struct v4l2_subdev_tuner_ops au8522_tuner_ops = { + .g_tuner = au8522_g_tuner, +}; + +static const struct v4l2_subdev_audio_ops au8522_audio_ops = { + .s_routing = au8522_s_audio_routing, +}; + +static const struct v4l2_subdev_video_ops au8522_video_ops = { + .s_routing = au8522_s_video_routing, + .g_fmt = au8522_g_fmt, + .s_fmt = au8522_s_fmt, + .s_stream = au8522_s_stream, +}; + +static const struct v4l2_subdev_ops au8522_ops = { + .core = &au8522_core_ops, + .tuner = &au8522_tuner_ops, + .audio = &au8522_audio_ops, + .video = &au8522_video_ops, +}; + +/* ----------------------------------------------------------------------- */ + +static int au8522_probe(struct i2c_client *client, + const struct i2c_device_id *did) +{ + struct au8522_state *state; + struct v4l2_subdev *sd; + int instance; + struct au8522_config *demod_config; + + /* Check if the adapter supports the needed features */ + if (!i2c_check_functionality(client->adapter, + I2C_FUNC_SMBUS_BYTE_DATA)) { + return -EIO; + } + + /* allocate memory for the internal state */ + instance = au8522_get_state(&state, client->adapter, client->addr); + switch (instance) { + case 0: + printk("au8522_decoder allocation failed\n"); + return -EIO; + case 1: + 
/* new demod instance */ + printk("au8522_decoder creating new instance...\n"); + break; + default: + /* existing demod instance */ + printk("au8522_decoder attaching to existing instance...\n"); + break; + } + + demod_config = kzalloc(sizeof(struct au8522_config), GFP_KERNEL); + demod_config->demod_address = 0x8e >> 1; + + state->config = demod_config; + state->i2c = client->adapter; + + sd = &state->sd; + v4l2_i2c_subdev_init(sd, client, &au8522_ops); + + state->c = client; + state->vid_input = AU8522_COMPOSITE_CH1; + state->aud_input = AU8522_AUDIO_NONE; + state->id = 8522; + state->rev = 0; + + /* Jam open the i2c gate to the tuner */ + au8522_writereg(state, 0x106, 1); + + return 0; +} + +static int au8522_remove(struct i2c_client *client) +{ + struct v4l2_subdev *sd = i2c_get_clientdata(client); + v4l2_device_unregister_subdev(sd); + au8522_release_state(to_state(sd)); + return 0; +} + +static const struct i2c_device_id au8522_id[] = { + {"au8522", 0}, + {} +}; + +MODULE_DEVICE_TABLE(i2c, au8522_id); + +static struct v4l2_i2c_driver_data v4l2_i2c_data = { + .name = "au8522", + .driverid = I2C_DRIVERID_AU8522, + .command = au8522_command, + .probe = au8522_probe, + .remove = au8522_remove, + .id_table = au8522_id, +}; diff --git a/drivers/media/dvb/frontends/au8522_priv.h b/drivers/media/dvb/frontends/au8522_priv.h index 98b09caa2123..f328f2b3ad3d 100644 --- a/drivers/media/dvb/frontends/au8522_priv.h +++ b/drivers/media/dvb/frontends/au8522_priv.h @@ -35,6 +35,7 @@ #include "tuner-i2c.h" struct au8522_state { + struct i2c_client *c; struct i2c_adapter *i2c; /* Used for sharing of the state between analog and digital mode */ @@ -51,6 +52,16 @@ struct au8522_state { u32 fe_status; unsigned int led_state; + + /* Analog settings */ + struct v4l2_subdev sd; + v4l2_std_id std; + int vid_input; + int aud_input; + u32 id; + u32 rev; + u8 brightness; + u8 contrast; }; /* These are routines shared by both the VSB/QAM demodulator and the analog @@ -63,3 +74,339 @@ int au8522_sleep(struct dvb_frontend *fe); int au8522_get_state(struct au8522_state **state, struct i2c_adapter *i2c, u8 client_address); void au8522_release_state(struct au8522_state *state); + +/* REGISTERS */ +#define AU8522_INPUT_CONTROL_REG081H 0x081 +#define AU8522_PGA_CONTROL_REG082H 0x082 +#define AU8522_CLAMPING_CONTROL_REG083H 0x083 + +#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H 0x0A3 +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H 0x0A4 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H 0x0A5 +#define AU8522_AGC_CONTROL_RANGE_REG0A6H 0x0A6 +#define AU8522_SYSTEM_GAIN_CONTROL_REG0A7H 0x0A7 +#define AU8522_TUNER_AGC_RF_STOP_REG0A8H 0x0A8 +#define AU8522_TUNER_AGC_RF_START_REG0A9H 0x0A9 +#define AU8522_TUNER_RF_AGC_DEFAULT_REG0AAH 0x0AA +#define AU8522_TUNER_AGC_IF_STOP_REG0ABH 0x0AB +#define AU8522_TUNER_AGC_IF_START_REG0ACH 0x0AC +#define AU8522_TUNER_AGC_IF_DEFAULT_REG0ADH 0x0AD +#define AU8522_TUNER_AGC_STEP_REG0AEH 0x0AE +#define AU8522_TUNER_GAIN_STEP_REG0AFH 0x0AF + +/* Receiver registers */ +#define AU8522_FRMREGTHRD1_REG0B0H 0x0B0 +#define AU8522_FRMREGAGC1H_REG0B1H 0x0B1 +#define AU8522_FRMREGSHIFT1_REG0B2H 0x0B2 +#define AU8522_TOREGAGC1_REG0B3H 0x0B3 +#define AU8522_TOREGASHIFT1_REG0B4H 0x0B4 +#define AU8522_FRMREGBBH_REG0B5H 0x0B5 +#define AU8522_FRMREGBBM_REG0B6H 0x0B6 +#define AU8522_FRMREGBBL_REG0B7H 0x0B7 +/* 0xB8 TO 0xD7 are the filter coefficients */ +#define AU8522_FRMREGTHRD2_REG0D8H 0x0D8 +#define AU8522_FRMREGAGC2H_REG0D9H 0x0D9 +#define AU8522_TOREGAGC2_REG0DAH 0x0DA +#define AU8522_TOREGSHIFT2_REG0DBH 
0x0DB +#define AU8522_FRMREGPILOTH_REG0DCH 0x0DC +#define AU8522_FRMREGPILOTM_REG0DDH 0x0DD +#define AU8522_FRMREGPILOTL_REG0DEH 0x0DE +#define AU8522_TOREGFREQ_REG0DFH 0x0DF + +#define AU8522_RX_PGA_RFOUT_REG0EBH 0x0EB +#define AU8522_RX_PGA_IFOUT_REG0ECH 0x0EC +#define AU8522_RX_PGA_PGAOUT_REG0EDH 0x0ED + +#define AU8522_CHIP_MODE_REG0FEH 0x0FE + +/* I2C bus control registers */ +#define AU8522_I2C_CONTROL_REG0_REG090H 0x090 +#define AU8522_I2C_CONTROL_REG1_REG091H 0x091 +#define AU8522_I2C_STATUS_REG092H 0x092 +#define AU8522_I2C_WR_DATA0_REG093H 0x093 +#define AU8522_I2C_WR_DATA1_REG094H 0x094 +#define AU8522_I2C_WR_DATA2_REG095H 0x095 +#define AU8522_I2C_WR_DATA3_REG096H 0x096 +#define AU8522_I2C_WR_DATA4_REG097H 0x097 +#define AU8522_I2C_WR_DATA5_REG098H 0x098 +#define AU8522_I2C_WR_DATA6_REG099H 0x099 +#define AU8522_I2C_WR_DATA7_REG09AH 0x09A +#define AU8522_I2C_RD_DATA0_REG09BH 0x09B +#define AU8522_I2C_RD_DATA1_REG09CH 0x09C +#define AU8522_I2C_RD_DATA2_REG09DH 0x09D +#define AU8522_I2C_RD_DATA3_REG09EH 0x09E +#define AU8522_I2C_RD_DATA4_REG09FH 0x09F +#define AU8522_I2C_RD_DATA5_REG0A0H 0x0A0 +#define AU8522_I2C_RD_DATA6_REG0A1H 0x0A1 +#define AU8522_I2C_RD_DATA7_REG0A2H 0x0A2 + +#define AU8522_ENA_USB_REG101H 0x101 + +#define AU8522_I2S_CTRL_0_REG110H 0x110 +#define AU8522_I2S_CTRL_1_REG111H 0x111 +#define AU8522_I2S_CTRL_2_REG112H 0x112 + +#define AU8522_FRMREGFFECONTROL_REG121H 0x121 +#define AU8522_FRMREGDFECONTROL_REG122H 0x122 + +#define AU8522_CARRFREQOFFSET0_REG201H 0x201 +#define AU8522_CARRFREQOFFSET1_REG202H 0x202 + +#define AU8522_DECIMATION_GAIN_REG21AH 0x21A +#define AU8522_FRMREGIFSLP_REG21BH 0x21B +#define AU8522_FRMREGTHRDL2_REG21CH 0x21C +#define AU8522_FRMREGSTEP3DB_REG21DH 0x21D +#define AU8522_DAGC_GAIN_ADJUSTMENT_REG21EH 0x21E +#define AU8522_FRMREGPLLMODE_REG21FH 0x21F +#define AU8522_FRMREGCSTHRD_REG220H 0x220 +#define AU8522_FRMREGCRLOCKDMAX_REG221H 0x221 +#define AU8522_FRMREGCRPERIODMASK_REG222H 0x222 +#define AU8522_FRMREGCRLOCK0THH_REG223H 0x223 +#define AU8522_FRMREGCRLOCK1THH_REG224H 0x224 +#define AU8522_FRMREGCRLOCK0THL_REG225H 0x225 +#define AU8522_FRMREGCRLOCK1THL_REG226H 0x226 +#define AU_FRMREGPLLACQPHASESCL_REG227H 0x227 +#define AU8522_FRMREGFREQFBCTRL_REG228H 0x228 + +/* Analog TV Decoder */ +#define AU8522_TVDEC_STATUS_REG000H 0x000 +#define AU8522_TVDEC_INT_STATUS_REG001H 0x001 +#define AU8522_TVDEC_MACROVISION_STATUS_REG002H 0x002 +#define AU8522_TVDEC_SHARPNESSREG009H 0x009 +#define AU8522_TVDEC_BRIGHTNESS_REG00AH 0x00A +#define AU8522_TVDEC_CONTRAST_REG00BH 0x00B +#define AU8522_TVDEC_SATURATION_CB_REG00CH 0x00C +#define AU8522_TVDEC_SATURATION_CR_REG00DH 0x00D +#define AU8522_TVDEC_HUE_H_REG00EH 0x00E +#define AU8522_TVDEC_HUE_L_REG00FH 0x00F +#define AU8522_TVDEC_INT_MASK_REG010H 0x010 +#define AU8522_VIDEO_MODE_REG011H 0x011 +#define AU8522_TVDEC_PGA_REG012H 0x012 +#define AU8522_TVDEC_COMB_MODE_REG015H 0x015 +#define AU8522_REG016H 0x016 +#define AU8522_TVDED_DBG_MODE_REG060H 0x060 +#define AU8522_TVDEC_FORMAT_CTRL1_REG061H 0x061 +#define AU8522_TVDEC_FORMAT_CTRL2_REG062H 0x062 +#define AU8522_TVDEC_VCR_DET_LLIM_REG063H 0x063 +#define AU8522_TVDEC_VCR_DET_HLIM_REG064H 0x064 +#define AU8522_TVDEC_COMB_VDIF_THR1_REG065H 0x065 +#define AU8522_TVDEC_COMB_VDIF_THR2_REG066H 0x066 +#define AU8522_TVDEC_COMB_VDIF_THR3_REG067H 0x067 +#define AU8522_TVDEC_COMB_NOTCH_THR_REG068H 0x068 +#define AU8522_TVDEC_COMB_HDIF_THR1_REG069H 0x069 +#define AU8522_TVDEC_COMB_HDIF_THR2_REG06AH 0x06A +#define AU8522_TVDEC_COMB_HDIF_THR3_REG06BH 0x06B 
+#define AU8522_TVDEC_COMB_DCDIF_THR1_REG06CH 0x06C +#define AU8522_TVDEC_COMB_DCDIF_THR2_REG06DH 0x06D +#define AU8522_TVDEC_COMB_DCDIF_THR3_REG06EH 0x06E +#define AU8522_TVDEC_UV_SEP_THR_REG06FH 0x06F +#define AU8522_TVDEC_COMB_DC_THR1_NTSC_REG070H 0x070 +#define AU8522_TVDEC_COMB_DC_THR2_NTSC_REG073H 0x073 +#define AU8522_TVDEC_DCAGC_CTRL_REG077H 0x077 +#define AU8522_TVDEC_PIC_START_ADJ_REG078H 0x078 +#define AU8522_TVDEC_AGC_HIGH_LIMIT_REG079H 0x079 +#define AU8522_TVDEC_MACROVISION_SYNC_THR_REG07AH 0x07A +#define AU8522_TVDEC_INTRP_CTRL_REG07BH 0x07B +#define AU8522_TVDEC_PLL_STATUS_REG07EH 0x07E +#define AU8522_TVDEC_FSC_FREQ_REG07FH 0x07F + +#define AU8522_TVDEC_AGC_LOW_LIMIT_REG0E4H 0x0E4 +#define AU8522_TOREGAAGC_REG0E5H 0x0E5 + +#define AU8522_TVDEC_CHROMA_AGC_REG401H 0x401 +#define AU8522_TVDEC_CHROMA_SFT_REG402H 0x402 +#define AU8522_FILTER_COEF_R410 0x410 +#define AU8522_FILTER_COEF_R411 0x411 +#define AU8522_FILTER_COEF_R412 0x412 +#define AU8522_FILTER_COEF_R413 0x413 +#define AU8522_FILTER_COEF_R414 0x414 +#define AU8522_FILTER_COEF_R415 0x415 +#define AU8522_FILTER_COEF_R416 0x416 +#define AU8522_FILTER_COEF_R417 0x417 +#define AU8522_FILTER_COEF_R418 0x418 +#define AU8522_FILTER_COEF_R419 0x419 +#define AU8522_FILTER_COEF_R41A 0x41A +#define AU8522_FILTER_COEF_R41B 0x41B +#define AU8522_FILTER_COEF_R41C 0x41C +#define AU8522_FILTER_COEF_R41D 0x41D +#define AU8522_FILTER_COEF_R41E 0x41E +#define AU8522_FILTER_COEF_R41F 0x41F +#define AU8522_FILTER_COEF_R420 0x420 +#define AU8522_FILTER_COEF_R421 0x421 +#define AU8522_FILTER_COEF_R422 0x422 +#define AU8522_FILTER_COEF_R423 0x423 +#define AU8522_FILTER_COEF_R424 0x424 +#define AU8522_FILTER_COEF_R425 0x425 +#define AU8522_FILTER_COEF_R426 0x426 +#define AU8522_FILTER_COEF_R427 0x427 +#define AU8522_FILTER_COEF_R428 0x428 +#define AU8522_FILTER_COEF_R429 0x429 +#define AU8522_FILTER_COEF_R42A 0x42A +#define AU8522_FILTER_COEF_R42B 0x42B +#define AU8522_FILTER_COEF_R42C 0x42C +#define AU8522_FILTER_COEF_R42D 0x42D + +/* VBI Control Registers */ +#define AU8522_TVDEC_VBI_RX_FIFO_CONTAIN_REG004H 0x004 +#define AU8522_TVDEC_VBI_TX_FIFO_CONTAIN_REG005H 0x005 +#define AU8522_TVDEC_VBI_RX_FIFO_READ_REG006H 0x006 +#define AU8522_TVDEC_VBI_FIFO_STATUS_REG007H 0x007 +#define AU8522_TVDEC_VBI_CTRL_H_REG017H 0x017 +#define AU8522_TVDEC_VBI_CTRL_L_REG018H 0x018 +#define AU8522_TVDEC_VBI_USER_TOTAL_BITS_REG019H 0x019 +#define AU8522_TVDEC_VBI_USER_TUNIT_H_REG01AH 0x01A +#define AU8522_TVDEC_VBI_USER_TUNIT_L_REG01BH 0x01B +#define AU8522_TVDEC_VBI_USER_THRESH1_REG01CH 0x01C +#define AU8522_TVDEC_VBI_USER_FRAME_PAT2_REG01EH 0x01E +#define AU8522_TVDEC_VBI_USER_FRAME_PAT1_REG01FH 0x01F +#define AU8522_TVDEC_VBI_USER_FRAME_PAT0_REG020H 0x020 +#define AU8522_TVDEC_VBI_USER_FRAME_MASK2_REG021H 0x021 +#define AU8522_TVDEC_VBI_USER_FRAME_MASK1_REG022H 0x022 +#define AU8522_TVDEC_VBI_USER_FRAME_MASK0_REG023H 0x023 + +#define AU8522_REG071H 0x071 +#define AU8522_REG072H 0x072 +#define AU8522_REG074H 0x074 +#define AU8522_REG075H 0x075 + +/* Digital Demodulator Registers */ +#define AU8522_FRAME_COUNT0_REG084H 0x084 +#define AU8522_RS_STATUS_G0_REG085H 0x085 +#define AU8522_RS_STATUS_B0_REG086H 0x086 +#define AU8522_RS_STATUS_E_REG087H 0x087 +#define AU8522_DEMODULATION_STATUS_REG088H 0x088 +#define AU8522_TOREGTRESTATUS_REG0E6H 0x0E6 +#define AU8522_TSPORT_CONTROL_REG10BH 0x10B +#define AU8522_TSTHES_REG10CH 0x10C +#define AU8522_FRMREGDFEKEEP_REG301H 0x301 +#define AU8522_DFE_AVERAGE_REG302H 0x302 +#define AU8522_FRMREGEQLERRWIN_REG303H 0x303 
+#define AU8522_FRMREGFFEKEEP_REG304H 0x304 +#define AU8522_FRMREGDFECONTROL1_REG305H 0x305 +#define AU8522_FRMREGEQLERRLOW_REG306H 0x306 + +#define AU8522_REG42EH 0x42E +#define AU8522_REG42FH 0x42F +#define AU8522_REG430H 0x430 +#define AU8522_REG431H 0x431 +#define AU8522_REG432H 0x432 +#define AU8522_REG433H 0x433 +#define AU8522_REG434H 0x434 +#define AU8522_REG435H 0x435 +#define AU8522_REG436H 0x436 + +/* GPIO Registers */ +#define AU8522_GPIO_CONTROL_REG0E0H 0x0E0 +#define AU8522_GPIO_STATUS_REG0E1H 0x0E1 +#define AU8522_GPIO_DATA_REG0E2H 0x0E2 + +/* Audio Control Registers */ +#define AU8522_AUDIOAGC_REG0EEH 0x0EE +#define AU8522_AUDIO_STATUS_REG0F0H 0x0F0 +#define AU8522_AUDIO_MODE_REG0F1H 0x0F1 +#define AU8522_AUDIO_VOLUME_L_REG0F2H 0x0F2 +#define AU8522_AUDIO_VOLUME_R_REG0F3H 0x0F3 +#define AU8522_AUDIO_VOLUME_REG0F4H 0x0F4 +#define AU8522_FRMREGAUPHASE_REG0F7H 0x0F7 +#define AU8522_REG0F9H 0x0F9 + +#define AU8522_AUDIOAGC2_REG605H 0x605 +#define AU8522_AUDIOFREQ_REG606H 0x606 + + +/**************************************************************/ + +#define AU8522_INPUT_CONTROL_REG081H_ATSC 0xC4 +#define AU8522_INPUT_CONTROL_REG081H_ATVRF 0xC4 +#define AU8522_INPUT_CONTROL_REG081H_ATVRF13 0xC4 +#define AU8522_INPUT_CONTROL_REG081H_J83B64 0xC4 +#define AU8522_INPUT_CONTROL_REG081H_J83B256 0xC4 +#define AU8522_INPUT_CONTROL_REG081H_CVBS 0x20 +#define AU8522_INPUT_CONTROL_REG081H_CVBS_CH1 0xA2 +#define AU8522_INPUT_CONTROL_REG081H_CVBS_CH2 0xA0 +#define AU8522_INPUT_CONTROL_REG081H_CVBS_CH3 0x69 +#define AU8522_INPUT_CONTROL_REG081H_CVBS_CH4 0x68 +#define AU8522_INPUT_CONTROL_REG081H_CVBS_CH4_SIF 0x28 +/* CH1 AS Y,CH3 AS C */ +#define AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH13 0x23 +/* CH2 AS Y,CH4 AS C */ +#define AU8522_INPUT_CONTROL_REG081H_SVIDEO_CH24 0x20 +#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_ATSC 0x0C +#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_J83B64 0x09 +#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_J83B256 0x09 +#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_CVBS 0x12 +#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_ATVRF 0x1A +#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_ATVRF13 0x1A +#define AU8522_MODULE_CLOCK_CONTROL_REG0A3H_SVIDEO 0x02 + +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CLEAR 0x00 +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_SVIDEO 0x9C +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_CVBS 0x9D +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_ATSC 0xE8 +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_J83B256 0xCA +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_J83B64 0xCA +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_ATVRF 0xDD +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_ATVRF13 0xDD +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_PAL 0xDD +#define AU8522_SYSTEM_MODULE_CONTROL_0_REG0A4H_FM 0xDD + +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_ATSC 0x80 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_J83B256 0x80 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_J83B64 0x80 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_DONGLE_ATSC 0x40 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_DONGLE_J83B256 0x40 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_DONGLE_J83B64 0x40 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_DONGLE_CLEAR 0x00 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_ATVRF 0x01 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_ATVRF13 0x01 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_SVIDEO 0x04 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_CVBS 0x01 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_PWM 0x03 +#define 
AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_IIS 0x09 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_PAL 0x01 +#define AU8522_SYSTEM_MODULE_CONTROL_1_REG0A5H_FM 0x01 + +/* STILL NEED TO BE REFACTORED @@@@@@@@@@@@@@ */ +#define AU8522_TVDEC_CONTRAST_REG00BH_CVBS 0x79 +#define AU8522_TVDEC_SATURATION_CB_REG00CH_CVBS 0x80 +#define AU8522_TVDEC_SATURATION_CR_REG00DH_CVBS 0x80 +#define AU8522_TVDEC_HUE_H_REG00EH_CVBS 0x00 +#define AU8522_TVDEC_HUE_L_REG00FH_CVBS 0x00 +#define AU8522_TVDEC_PGA_REG012H_CVBS 0x0F +#define AU8522_TVDEC_COMB_MODE_REG015H_CVBS 0x00 +#define AU8522_REG016H_CVBS 0x00 +#define AU8522_TVDED_DBG_MODE_REG060H_CVBS 0x00 +#define AU8522_TVDEC_FORMAT_CTRL1_REG061H_CVBS 0x0B +#define AU8522_TVDEC_FORMAT_CTRL1_REG061H_CVBS13 0x03 +#define AU8522_TVDEC_FORMAT_CTRL2_REG062H_CVBS13 0x00 +#define AU8522_TVDEC_VCR_DET_LLIM_REG063H_CVBS 0x19 +#define AU8522_REG0F9H_AUDIO 0x20 +#define AU8522_TVDEC_VCR_DET_HLIM_REG064H_CVBS 0xA7 +#define AU8522_TVDEC_COMB_VDIF_THR1_REG065H_CVBS 0x0A +#define AU8522_TVDEC_COMB_VDIF_THR2_REG066H_CVBS 0x32 +#define AU8522_TVDEC_COMB_VDIF_THR3_REG067H_CVBS 0x19 +#define AU8522_TVDEC_COMB_NOTCH_THR_REG068H_CVBS 0x23 +#define AU8522_TVDEC_COMB_HDIF_THR1_REG069H_CVBS 0x41 +#define AU8522_TVDEC_COMB_HDIF_THR2_REG06AH_CVBS 0x0A +#define AU8522_TVDEC_COMB_HDIF_THR3_REG06BH_CVBS 0x32 +#define AU8522_TVDEC_COMB_DCDIF_THR1_REG06CH_CVBS 0x34 +#define AU8522_TVDEC_COMB_DCDIF_THR2_REG06DH_CVBS 0x05 +#define AU8522_TVDEC_COMB_DCDIF_THR3_REG06EH_CVBS 0x6E +#define AU8522_TVDEC_UV_SEP_THR_REG06FH_CVBS 0x0F +#define AU8522_TVDEC_COMB_DC_THR1_NTSC_REG070H_CVBS 0x80 +#define AU8522_REG071H_CVBS 0x18 +#define AU8522_REG072H_CVBS 0x30 +#define AU8522_TVDEC_COMB_DC_THR2_NTSC_REG073H_CVBS 0xF0 +#define AU8522_REG074H_CVBS 0x80 +#define AU8522_REG075H_CVBS 0xF0 +#define AU8522_TVDEC_DCAGC_CTRL_REG077H_CVBS 0xFB +#define AU8522_TVDEC_PIC_START_ADJ_REG078H_CVBS 0x04 +#define AU8522_TVDEC_AGC_HIGH_LIMIT_REG079H_CVBS 0x00 +#define AU8522_TVDEC_MACROVISION_SYNC_THR_REG07AH_CVBS 0x00 +#define AU8522_TVDEC_INTRP_CTRL_REG07BH_CVBS 0xEE +#define AU8522_TVDEC_AGC_LOW_LIMIT_REG0E4H_CVBS 0xFE +#define AU8522_TOREGAAGC_REG0E5H_CVBS 0x00 +#define AU8522_TVDEC_VBI6A_REG035H_CVBS 0x40 + +/* Enables Closed captioning */ +#define AU8522_TVDEC_VBI_CTRL_H_REG017H_CCON 0x21 diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index 1ffc23bc5d1e..17d9af070f06 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -71,6 +71,7 @@ #define I2C_DRIVERID_VP27SMPX 93 /* Panasonic VP27s tuner internal MPX */ #define I2C_DRIVERID_M52790 95 /* Mitsubishi M52790SP/FP AV switch */ #define I2C_DRIVERID_CS5345 96 /* cs5345 audio processor */ +#define I2C_DRIVERID_AU8522 97 /* Auvitek au8522 */ #define I2C_DRIVERID_OV7670 1048 /* Omnivision 7670 camera */ -- cgit v1.2.3-71-gd317 From 9aba42efe85bc7a55e3fed0747ce14abc9ee96e7 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 18 Mar 2009 18:10:04 -0300 Subject: V4L/DVB (11096): V4L2 Driver for the Hauppauge HD PVR usb capture device The device encodes component video up to 1080i to a MPEG-TS stream with H.264 video and stereo AAC audio. Newer firmwares accept also AC3 (up to 5.1) audio over optical SPDIF without reencoding. Firmware upgrade is unimplemeted but rather unimportant since the firmware sits on a flash chip. The I2C adapter to drive the integrated infrared receiver/sender is currently disabled due to a conflict with cx18-based devices. 
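
The encoded stream is delivered through the plain read() interface of the
registered V4L2 node (the driver advertises V4L2_CAP_VIDEO_CAPTURE |
V4L2_CAP_AUDIO | V4L2_CAP_READWRITE, and the first read() starts streaming).
The sketch below is an illustrative userspace capture loop only, not part of
this patch; the device node path /dev/video0 and the output file name
capture.ts are assumptions.

/*
 * Minimal userspace sketch: record the MPEG-TS stream the hdpvr driver
 * hands out via read() on its V4L2 device node.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/videodev2.h>

int main(void)
{
	struct v4l2_capability cap;
	char buf[8192];		/* matches the driver's 8192 byte bulk buffers */
	ssize_t n;
	int fd, out;

	fd = open("/dev/video0", O_RDONLY);	/* assumed device node */
	if (fd < 0) {
		perror("open video node");
		return 1;
	}

	/* optional: show which card answered */
	if (ioctl(fd, VIDIOC_QUERYCAP, &cap) == 0)
		printf("capturing from: %s\n", cap.card);

	out = open("capture.ts", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (out < 0) {
		perror("open output file");
		close(fd);
		return 1;
	}

	/* first read() kicks off streaming; data is an MPEG-TS bitstream */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		if (write(out, buf, n) != n)
			break;

	close(out);
	close(fd);
	return 0;
}
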
Tested-by: Jarod Wilson Signed-off-by: Janne Grunau Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/Kconfig | 2 + drivers/media/video/Makefile | 2 + drivers/media/video/hdpvr/Kconfig | 10 + drivers/media/video/hdpvr/Makefile | 7 + drivers/media/video/hdpvr/hdpvr-control.c | 201 +++++ drivers/media/video/hdpvr/hdpvr-core.c | 439 +++++++++++ drivers/media/video/hdpvr/hdpvr-i2c.c | 145 ++++ drivers/media/video/hdpvr/hdpvr-video.c | 1228 +++++++++++++++++++++++++++++ drivers/media/video/hdpvr/hdpvr.h | 298 +++++++ include/linux/i2c-id.h | 1 + 10 files changed, 2333 insertions(+) create mode 100644 drivers/media/video/hdpvr/Kconfig create mode 100644 drivers/media/video/hdpvr/Makefile create mode 100644 drivers/media/video/hdpvr/hdpvr-control.c create mode 100644 drivers/media/video/hdpvr/hdpvr-core.c create mode 100644 drivers/media/video/hdpvr/hdpvr-i2c.c create mode 100644 drivers/media/video/hdpvr/hdpvr-video.c create mode 100644 drivers/media/video/hdpvr/hdpvr.h (limited to 'include/linux') diff --git a/drivers/media/video/Kconfig b/drivers/media/video/Kconfig index d5ddb819a961..3f85b9e63754 100644 --- a/drivers/media/video/Kconfig +++ b/drivers/media/video/Kconfig @@ -789,6 +789,8 @@ source "drivers/media/video/gspca/Kconfig" source "drivers/media/video/pvrusb2/Kconfig" +source "drivers/media/video/hdpvr/Kconfig" + source "drivers/media/video/em28xx/Kconfig" source "drivers/media/video/usbvision/Kconfig" diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile index 08a0675fea34..b9046744463b 100644 --- a/drivers/media/video/Makefile +++ b/drivers/media/video/Makefile @@ -119,6 +119,8 @@ obj-$(CONFIG_USB_PWC) += pwc/ obj-$(CONFIG_USB_ZC0301) += zc0301/ obj-$(CONFIG_USB_GSPCA) += gspca/ +obj-$(CONFIG_VIDEO_HDPVR) += hdpvr/ + obj-$(CONFIG_USB_IBMCAM) += usbvideo/ obj-$(CONFIG_USB_KONICAWC) += usbvideo/ obj-$(CONFIG_USB_VICAM) += usbvideo/ diff --git a/drivers/media/video/hdpvr/Kconfig b/drivers/media/video/hdpvr/Kconfig new file mode 100644 index 000000000000..de247f3c7d05 --- /dev/null +++ b/drivers/media/video/hdpvr/Kconfig @@ -0,0 +1,10 @@ + +config VIDEO_HDPVR + tristate "Hauppauge HD PVR support" + depends on VIDEO_DEV + ---help--- + This is a video4linux driver for Hauppauge's HD PVR USB device. + + To compile this driver as a module, choose M here: the + module will be called hdpvr + diff --git a/drivers/media/video/hdpvr/Makefile b/drivers/media/video/hdpvr/Makefile new file mode 100644 index 000000000000..79ad2e16cb8f --- /dev/null +++ b/drivers/media/video/hdpvr/Makefile @@ -0,0 +1,7 @@ +hdpvr-objs := hdpvr-control.o hdpvr-core.o hdpvr-i2c.o hdpvr-video.o + +obj-$(CONFIG_VIDEO_HDPVR) += hdpvr.o + +EXTRA_CFLAGS += -Idrivers/media/video + +EXTRA_CFLAGS += $(extra-cflags-y) $(extra-cflags-m) diff --git a/drivers/media/video/hdpvr/hdpvr-control.c b/drivers/media/video/hdpvr/hdpvr-control.c new file mode 100644 index 000000000000..ecf02c621f13 --- /dev/null +++ b/drivers/media/video/hdpvr/hdpvr-control.c @@ -0,0 +1,201 @@ +/* + * Hauppage HD PVR USB driver - video 4 linux 2 interface + * + * Copyright (C) 2008 Janne Grunau (j@jannau.net) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "hdpvr.h" + + +int hdpvr_config_call(struct hdpvr_device *dev, uint value, u8 valbuf) +{ + int ret; + char request_type = 0x38, snd_request = 0x01; + + msleep(10); + + mutex_lock(&dev->usbc_mutex); + dev->usbc_buf[0] = valbuf; + ret = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + snd_request, 0x00 | request_type, + value, CTRL_DEFAULT_INDEX, + dev->usbc_buf, 1, 10000); + + mutex_unlock(&dev->usbc_mutex); + dev_dbg(&dev->udev->dev, + "config call request for value 0x%x returned %d\n", value, + ret); + + return ret < 0 ? ret : 0; +} + +struct hdpvr_video_info *get_video_info(struct hdpvr_device *dev) +{ + struct hdpvr_video_info *vidinf = NULL; +#ifdef HDPVR_DEBUG + char print_buf[15]; +#endif + int ret; + + vidinf = kzalloc(sizeof(struct hdpvr_video_info), GFP_KERNEL); + if (!vidinf) { + dev_err(&dev->udev->dev, "out of memory"); + goto err; + } + + mutex_lock(&dev->usbc_mutex); + ret = usb_control_msg(dev->udev, + usb_rcvctrlpipe(dev->udev, 0), + 0x81, 0x80 | 0x38, + 0x1400, 0x0003, + dev->usbc_buf, 5, + 1000); + if (ret == 5) { + vidinf->width = dev->usbc_buf[1] << 8 | dev->usbc_buf[0]; + vidinf->height = dev->usbc_buf[3] << 8 | dev->usbc_buf[2]; + vidinf->fps = dev->usbc_buf[4]; + } + +#ifdef HDPVR_DEBUG + if (hdpvr_debug & MSG_INFO) { + hex_dump_to_buffer(dev->usbc_buf, 5, 16, 1, print_buf, + sizeof(print_buf), 0); + dev_dbg(&dev->udev->dev, "get video info returned: %d, %s\n", + ret, print_buf); + } +#endif + mutex_unlock(&dev->usbc_mutex); + + if (!vidinf->width || !vidinf->height || !vidinf->fps) { + kfree(vidinf); + vidinf = NULL; + } +err: + return vidinf; +} + +int get_input_lines_info(struct hdpvr_device *dev) +{ +#ifdef HDPVR_DEBUG + char print_buf[9]; +#endif + int ret, lines; + + mutex_lock(&dev->usbc_mutex); + ret = usb_control_msg(dev->udev, + usb_rcvctrlpipe(dev->udev, 0), + 0x81, 0x80 | 0x38, + 0x1800, 0x0003, + dev->usbc_buf, 3, + 1000); + +#ifdef HDPVR_DEBUG + if (hdpvr_debug & MSG_INFO) { + hex_dump_to_buffer(dev->usbc_buf, 3, 16, 1, print_buf, + sizeof(print_buf), 0); + dev_dbg(&dev->udev->dev, + "get input lines info returned: %d, %s\n", ret, + print_buf); + } +#endif + lines = dev->usbc_buf[1] << 8 | dev->usbc_buf[0]; + mutex_unlock(&dev->usbc_mutex); + return lines; +} + + +int hdpvr_set_bitrate(struct hdpvr_device *dev) +{ + int ret; + + mutex_lock(&dev->usbc_mutex); + memset(dev->usbc_buf, 0, 4); + dev->usbc_buf[0] = dev->options.bitrate; + dev->usbc_buf[2] = dev->options.peak_bitrate; + + ret = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + 0x01, 0x38, CTRL_BITRATE_VALUE, + CTRL_DEFAULT_INDEX, dev->usbc_buf, 4, 1000); + mutex_unlock(&dev->usbc_mutex); + + return ret; +} + +int hdpvr_set_audio(struct hdpvr_device *dev, u8 input, + enum v4l2_mpeg_audio_encoding codec) +{ + int ret = 0; + + if (dev->flags & HDPVR_FLAG_AC3_CAP) { + mutex_lock(&dev->usbc_mutex); + memset(dev->usbc_buf, 0, 2); + dev->usbc_buf[0] = input; + if (codec == V4L2_MPEG_AUDIO_ENCODING_AAC) + dev->usbc_buf[1] = 0; + else if (codec == V4L2_MPEG_AUDIO_ENCODING_AC3) + dev->usbc_buf[1] = 1; + else { + mutex_unlock(&dev->usbc_mutex); + dev_err(&dev->udev->dev, "invalid audio codec %d\n", + codec); + ret = -EINVAL; + goto error; + } + + ret = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + 0x01, 0x38, CTRL_AUDIO_INPUT_VALUE, + CTRL_DEFAULT_INDEX, dev->usbc_buf, 2, + 1000); + mutex_unlock(&dev->usbc_mutex); + if (ret == 2) + ret = 0; + } else 
+ ret = hdpvr_config_call(dev, CTRL_AUDIO_INPUT_VALUE, + dev->options.audio_input+1); +error: + return ret; +} + +int hdpvr_set_options(struct hdpvr_device *dev) +{ + hdpvr_config_call(dev, CTRL_VIDEO_STD_TYPE, dev->options.video_std); + + hdpvr_config_call(dev, CTRL_VIDEO_INPUT_VALUE, + dev->options.video_input+1); + + hdpvr_set_audio(dev, dev->options.audio_input+1, + dev->options.audio_codec); + + hdpvr_set_bitrate(dev); + hdpvr_config_call(dev, CTRL_BITRATE_MODE_VALUE, + dev->options.bitrate_mode); + hdpvr_config_call(dev, CTRL_GOP_MODE_VALUE, dev->options.gop_mode); + + hdpvr_config_call(dev, CTRL_BRIGHTNESS, dev->options.brightness); + hdpvr_config_call(dev, CTRL_CONTRAST, dev->options.contrast); + hdpvr_config_call(dev, CTRL_HUE, dev->options.hue); + hdpvr_config_call(dev, CTRL_SATURATION, dev->options.saturation); + hdpvr_config_call(dev, CTRL_SHARPNESS, dev->options.sharpness); + + return 0; +} diff --git a/drivers/media/video/hdpvr/hdpvr-core.c b/drivers/media/video/hdpvr/hdpvr-core.c new file mode 100644 index 000000000000..e7300b570bb7 --- /dev/null +++ b/drivers/media/video/hdpvr/hdpvr-core.c @@ -0,0 +1,439 @@ +/* + * Hauppage HD PVR USB driver + * + * Copyright (C) 2001-2004 Greg Kroah-Hartman (greg@kroah.com) + * Copyright (C) 2008 Janne Grunau (j@jannau.net) + * Copyright (C) 2008 John Poet + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "hdpvr.h" + +static int video_nr[HDPVR_MAX] = {[0 ... (HDPVR_MAX - 1)] = UNSET}; +module_param_array(video_nr, int, NULL, 0); +MODULE_PARM_DESC(video_nr, "video device number (-1=Auto)"); + +/* holds the number of currently registered devices */ +static atomic_t dev_nr = ATOMIC_INIT(-1); + +int hdpvr_debug; +module_param(hdpvr_debug, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(hdpvr_debug, "enable debugging output"); + +uint default_video_input = HDPVR_VIDEO_INPUTS; +module_param(default_video_input, uint, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(default_video_input, "default video input: 0=Component / " + "1=S-Video / 2=Composite"); + +uint default_audio_input = HDPVR_AUDIO_INPUTS; +module_param(default_audio_input, uint, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(default_audio_input, "default audio input: 0=RCA back / " + "1=RCA front / 2=S/PDIF"); + +static int boost_audio; +module_param(boost_audio, bool, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(boost_audio, "boost the audio signal"); + + +/* table of devices that work with this driver */ +static struct usb_device_id hdpvr_table[] = { + { USB_DEVICE(HD_PVR_VENDOR_ID, HD_PVR_PRODUCT_ID) }, + { USB_DEVICE(HD_PVR_VENDOR_ID, HD_PVR_PRODUCT_ID1) }, + { USB_DEVICE(HD_PVR_VENDOR_ID, HD_PVR_PRODUCT_ID2) }, + { } /* Terminating entry */ +}; +MODULE_DEVICE_TABLE(usb, hdpvr_table); + + +void hdpvr_delete(struct hdpvr_device *dev) +{ + hdpvr_free_buffers(dev); + + if (dev->video_dev) + video_device_release(dev->video_dev); + + usb_put_dev(dev->udev); +} + +static void challenge(u8 *bytes) +{ + u64 *i64P, tmp64; + uint i, idx; + + for (idx = 0; idx < 32; ++idx) { + + if (idx & 0x3) + bytes[(idx >> 3) + 3] = bytes[(idx >> 2) & 0x3]; + + switch (idx & 0x3) { + case 0x3: + bytes[2] += bytes[3] * 4 + bytes[4] + bytes[5]; + bytes[4] += bytes[(idx & 0x1) * 2] * 9 + 9; + break; + case 0x1: + bytes[0] *= 8; + bytes[0] += 7*idx + 
4; + bytes[6] += bytes[3] * 3; + break; + case 0x0: + bytes[3 - (idx >> 3)] = bytes[idx >> 2]; + bytes[5] += bytes[6] * 3; + for (i = 0; i < 3; i++) + bytes[3] *= bytes[3] + 1; + break; + case 0x2: + for (i = 0; i < 3; i++) + bytes[1] *= bytes[6] + 1; + for (i = 0; i < 3; i++) { + i64P = (u64 *)bytes; + tmp64 = le64_to_cpup(i64P); + tmp64 <<= bytes[7] & 0x0f; + *i64P += cpu_to_le64(tmp64); + } + break; + } + } +} + +/* try to init the device like the windows driver */ +static int device_authorization(struct hdpvr_device *dev) +{ + + int ret, retval = -ENOMEM; + char request_type = 0x38, rcv_request = 0x81; + char *response; +#ifdef HDPVR_DEBUG + size_t buf_size = 46; + char *print_buf = kzalloc(5*buf_size+1, GFP_KERNEL); + if (!print_buf) { + dev_err(&dev->udev->dev, "Out of memory"); + goto error; + } +#endif + + mutex_lock(&dev->usbc_mutex); + ret = usb_control_msg(dev->udev, + usb_rcvctrlpipe(dev->udev, 0), + rcv_request, 0x80 | request_type, + 0x0400, 0x0003, + dev->usbc_buf, 46, + 10000); + if (ret != 46) { + dev_err(&dev->udev->dev, + "unexpected answer of status request, len %d", ret); + goto error; + } +#ifdef HDPVR_DEBUG + else { + hex_dump_to_buffer(dev->usbc_buf, 46, 16, 1, print_buf, + sizeof(print_buf), 0); + dev_dbg(&dev->udev->dev, + "Status request returned, len %d: %s\n", + ret, print_buf); + } +#endif + if (dev->usbc_buf[1] == HDPVR_FIRMWARE_VERSION) { + dev->flags &= ~HDPVR_FLAG_AC3_CAP; + } else if (dev->usbc_buf[1] == HDPVR_FIRMWARE_VERSION_AC3) { + dev->flags |= HDPVR_FLAG_AC3_CAP; + } else if (dev->usbc_buf[1] > HDPVR_FIRMWARE_VERSION_AC3) { + dev_notice(&dev->udev->dev, "untested firmware version 0x%x, " + "the driver might not work\n", dev->usbc_buf[1]); + dev->flags |= HDPVR_FLAG_AC3_CAP; + } else { + dev_err(&dev->udev->dev, "unknown firmware version 0x%x\n", + dev->usbc_buf[1]); + ret = -EINVAL; + goto error; + } + + response = dev->usbc_buf+38; +#ifdef HDPVR_DEBUG + hex_dump_to_buffer(response, 8, 16, 1, print_buf, sizeof(print_buf), 0); + dev_dbg(&dev->udev->dev, "challenge: %s\n", print_buf); +#endif + challenge(response); +#ifdef HDPVR_DEBUG + hex_dump_to_buffer(response, 8, 16, 1, print_buf, sizeof(print_buf), 0); + dev_dbg(&dev->udev->dev, " response: %s\n", print_buf); +#endif + + msleep(100); + ret = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + 0xd1, 0x00 | request_type, + 0x0000, 0x0000, + response, 8, + 10000); + dev_dbg(&dev->udev->dev, "magic request returned %d\n", ret); + mutex_unlock(&dev->usbc_mutex); + + retval = ret != 8; +error: + return retval; +} + +static int hdpvr_device_init(struct hdpvr_device *dev) +{ + int ret; + u8 *buf; + struct hdpvr_video_info *vidinf; + + if (device_authorization(dev)) + return -EACCES; + + /* default options for init */ + hdpvr_set_options(dev); + + /* set filter options */ + mutex_lock(&dev->usbc_mutex); + buf = dev->usbc_buf; + buf[0] = 0x03; buf[1] = 0x03; buf[2] = 0x00; buf[3] = 0x00; + ret = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + 0x01, 0x38, + CTRL_LOW_PASS_FILTER_VALUE, CTRL_DEFAULT_INDEX, + buf, 4, + 1000); + dev_dbg(&dev->udev->dev, "control request returned %d\n", ret); + mutex_unlock(&dev->usbc_mutex); + + vidinf = get_video_info(dev); + if (!vidinf) + dev_dbg(&dev->udev->dev, + "no valid video signal or device init failed\n"); + else + kfree(vidinf); + + /* enable fan and bling leds */ + mutex_lock(&dev->usbc_mutex); + buf[0] = 0x1; + ret = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + 0xd4, 0x38, 0, 0, buf, 1, + 1000); + dev_dbg(&dev->udev->dev, 
"control request returned %d\n", ret); + + /* boost analog audio */ + buf[0] = boost_audio; + ret = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + 0xd5, 0x38, 0, 0, buf, 1, + 1000); + dev_dbg(&dev->udev->dev, "control request returned %d\n", ret); + mutex_unlock(&dev->usbc_mutex); + + dev->status = STATUS_IDLE; + return 0; +} + +static const struct hdpvr_options hdpvr_default_options = { + .video_std = HDPVR_60HZ, + .video_input = HDPVR_COMPONENT, + .audio_input = HDPVR_RCA_BACK, + .bitrate = 65, /* 6 mbps */ + .peak_bitrate = 90, /* 9 mbps */ + .bitrate_mode = HDPVR_CONSTANT, + .gop_mode = HDPVR_SIMPLE_IDR_GOP, + .audio_codec = V4L2_MPEG_AUDIO_ENCODING_AAC, + .brightness = 0x86, + .contrast = 0x80, + .hue = 0x80, + .saturation = 0x80, + .sharpness = 0x80, +}; + +static int hdpvr_probe(struct usb_interface *interface, + const struct usb_device_id *id) +{ + struct hdpvr_device *dev; + struct usb_host_interface *iface_desc; + struct usb_endpoint_descriptor *endpoint; + size_t buffer_size; + int i; + int retval = -ENOMEM; + + /* allocate memory for our device state and initialize it */ + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + err("Out of memory"); + goto error; + } + mutex_init(&dev->io_mutex); + mutex_init(&dev->i2c_mutex); + mutex_init(&dev->usbc_mutex); + dev->usbc_buf = kmalloc(64, GFP_KERNEL); + if (!dev->usbc_buf) { + dev_err(&dev->udev->dev, "Out of memory"); + goto error; + } + + init_waitqueue_head(&dev->wait_buffer); + init_waitqueue_head(&dev->wait_data); + + dev->workqueue = create_singlethread_workqueue("hdpvr_buffer"); + if (!dev->workqueue) + goto error; + + /* init video transfer queues */ + INIT_LIST_HEAD(&dev->free_buff_list); + INIT_LIST_HEAD(&dev->rec_buff_list); + + dev->options = hdpvr_default_options; + + if (default_video_input < HDPVR_VIDEO_INPUTS) + dev->options.video_input = default_video_input; + + if (default_audio_input < HDPVR_AUDIO_INPUTS) + dev->options.audio_input = default_audio_input; + + dev->udev = usb_get_dev(interface_to_usbdev(interface)); + + /* set up the endpoint information */ + /* use only the first bulk-in and bulk-out endpoints */ + iface_desc = interface->cur_altsetting; + for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { + endpoint = &iface_desc->endpoint[i].desc; + + if (!dev->bulk_in_endpointAddr && + usb_endpoint_is_bulk_in(endpoint)) { + /* USB interface description is buggy, reported max + * packet size is 512 bytes, windows driver uses 8192 */ + buffer_size = 8192; + dev->bulk_in_size = buffer_size; + dev->bulk_in_endpointAddr = endpoint->bEndpointAddress; + } + + } + if (!dev->bulk_in_endpointAddr) { + err("Could not find bulk-in endpoint"); + goto error; + } + + /* init the device */ + if (hdpvr_device_init(dev)) { + err("device init failed"); + goto error; + } + + mutex_lock(&dev->io_mutex); + if (hdpvr_alloc_buffers(dev, NUM_BUFFERS)) { + err("allocating transfer buffers failed"); + goto error; + } + mutex_unlock(&dev->io_mutex); + + if (hdpvr_register_videodev(dev, + video_nr[atomic_inc_return(&dev_nr)])) { + err("registering videodev failed"); + goto error; + } + + + /* save our data pointer in this interface device */ + usb_set_intfdata(interface, dev); + + /* let the user know what node this device is now attached to */ + v4l2_info(dev->video_dev, "device now attached to /dev/video%d\n", + dev->video_dev->minor); + return 0; + +error: + if (dev) { + mutex_unlock(&dev->io_mutex); + /* this frees allocated memory */ + hdpvr_delete(dev); + } + return retval; +} + +static void 
hdpvr_disconnect(struct usb_interface *interface) +{ + struct hdpvr_device *dev; + int minor; + + dev = usb_get_intfdata(interface); + usb_set_intfdata(interface, NULL); + + minor = dev->video_dev->minor; + + /* prevent more I/O from starting and stop any ongoing */ + mutex_lock(&dev->io_mutex); + dev->status = STATUS_DISCONNECTED; + video_unregister_device(dev->video_dev); + wake_up_interruptible(&dev->wait_data); + wake_up_interruptible(&dev->wait_buffer); + msleep(100); + flush_workqueue(dev->workqueue); + hdpvr_cancel_queue(dev); + destroy_workqueue(dev->workqueue); + mutex_unlock(&dev->io_mutex); + + /* deregister I2C adapter */ + mutex_lock(&dev->i2c_mutex); + if (dev->i2c_adapter) + i2c_del_adapter(dev->i2c_adapter); + kfree(dev->i2c_adapter); + dev->i2c_adapter = NULL; + mutex_unlock(&dev->i2c_mutex); + + atomic_dec(&dev_nr); + + printk(KERN_INFO "Hauppauge HD PVR: device /dev/video%d disconnected\n", + minor); + + kfree(dev->usbc_buf); + kfree(dev); +} + + +static struct usb_driver hdpvr_usb_driver = { + .name = "hdpvr", + .probe = hdpvr_probe, + .disconnect = hdpvr_disconnect, + .id_table = hdpvr_table, +}; + +static int __init hdpvr_init(void) +{ + int result; + + /* register this driver with the USB subsystem */ + result = usb_register(&hdpvr_usb_driver); + if (result) + err("usb_register failed. Error number %d", result); + + return result; +} + +static void __exit hdpvr_exit(void) +{ + /* deregister this driver with the USB subsystem */ + usb_deregister(&hdpvr_usb_driver); +} + +module_init(hdpvr_init); +module_exit(hdpvr_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Janne Grunau"); +MODULE_DESCRIPTION("Hauppauge HD PVR driver"); diff --git a/drivers/media/video/hdpvr/hdpvr-i2c.c b/drivers/media/video/hdpvr/hdpvr-i2c.c new file mode 100644 index 000000000000..35096dec2411 --- /dev/null +++ b/drivers/media/video/hdpvr/hdpvr-i2c.c @@ -0,0 +1,145 @@ + +/* + * Hauppage HD PVR USB driver + * + * Copyright (C) 2008 Janne Grunau (j@jannau.net) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2. 
+ * + */ + +#include + +#include "hdpvr.h" + +#define CTRL_READ_REQUEST 0xb8 +#define CTRL_WRITE_REQUEST 0x38 + +#define REQTYPE_I2C_READ 0xb1 +#define REQTYPE_I2C_WRITE 0xb0 +#define REQTYPE_I2C_WRITE_STATT 0xd0 + +static int hdpvr_i2c_read(struct hdpvr_device *dev, unsigned char addr, + char *data, int len) +{ + int ret; + char *buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = usb_control_msg(dev->udev, + usb_rcvctrlpipe(dev->udev, 0), + REQTYPE_I2C_READ, CTRL_READ_REQUEST, + 0x100|addr, 0, buf, len, 1000); + + if (ret == len) { + memcpy(data, buf, len); + ret = 0; + } else if (ret >= 0) + ret = -EIO; + + kfree(buf); + + return ret; +} + +static int hdpvr_i2c_write(struct hdpvr_device *dev, unsigned char addr, + char *data, int len) +{ + int ret; + char *buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memcpy(buf, data, len); + ret = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + REQTYPE_I2C_WRITE, CTRL_WRITE_REQUEST, + 0x100|addr, 0, buf, len, 1000); + + if (ret < 0) + goto error; + + ret = usb_control_msg(dev->udev, + usb_rcvctrlpipe(dev->udev, 0), + REQTYPE_I2C_WRITE_STATT, CTRL_READ_REQUEST, + 0, 0, buf, 2, 1000); + + if (ret == 2) + ret = 0; + else if (ret >= 0) + ret = -EIO; + +error: + kfree(buf); + return ret; +} + +static int hdpvr_transfer(struct i2c_adapter *i2c_adapter, struct i2c_msg *msgs, + int num) +{ + struct hdpvr_device *dev = i2c_get_adapdata(i2c_adapter); + int retval = 0, i, addr; + + if (num <= 0) + return 0; + + mutex_lock(&dev->i2c_mutex); + + for (i = 0; i < num && !retval; i++) { + addr = msgs[i].addr << 1; + + if (msgs[i].flags & I2C_M_RD) + retval = hdpvr_i2c_read(dev, addr, msgs[i].buf, + msgs[i].len); + else + retval = hdpvr_i2c_write(dev, addr, msgs[i].buf, + msgs[i].len); + } + + mutex_unlock(&dev->i2c_mutex); + + return retval ? retval : num; +} + +static u32 hdpvr_functionality(struct i2c_adapter *adapter) +{ + return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; +} + +static struct i2c_algorithm hdpvr_algo = { + .master_xfer = hdpvr_transfer, + .functionality = hdpvr_functionality, +}; + +int hdpvr_register_i2c_adapter(struct hdpvr_device *dev) +{ + struct i2c_adapter *i2c_adap; + int retval = -ENOMEM; + + i2c_adap = kzalloc(sizeof(struct i2c_adapter), GFP_KERNEL); + if (i2c_adap == NULL) + goto error; + + strlcpy(i2c_adap->name, "Hauppauge HD PVR I2C", + sizeof(i2c_adap->name)); + i2c_adap->algo = &hdpvr_algo; + i2c_adap->class = I2C_CLASS_TV_ANALOG; + i2c_adap->id = I2C_HW_B_HDPVR; + i2c_adap->owner = THIS_MODULE; + i2c_adap->dev.parent = &dev->udev->dev; + + i2c_set_adapdata(i2c_adap, dev); + + retval = i2c_add_adapter(i2c_adap); + + if (!retval) + dev->i2c_adapter = i2c_adap; + else + kfree(i2c_adap); + +error: + return retval; +} diff --git a/drivers/media/video/hdpvr/hdpvr-video.c b/drivers/media/video/hdpvr/hdpvr-video.c new file mode 100644 index 000000000000..ee481495e4fc --- /dev/null +++ b/drivers/media/video/hdpvr/hdpvr-video.c @@ -0,0 +1,1228 @@ +/* + * Hauppage HD PVR USB driver - video 4 linux 2 interface + * + * Copyright (C) 2008 Janne Grunau (j@jannau.net) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "hdpvr.h" + +#define BULK_URB_TIMEOUT 1250 /* 1.25 seconds */ + +struct hdpvr_fh { + struct hdpvr_device *dev; +}; + +static uint list_size(struct list_head *list) +{ + struct list_head *tmp; + uint count = 0; + + list_for_each(tmp, list) { + count++; + } + + return count; +} + +/*=========================================================================*/ +/* urb callback */ +static void hdpvr_read_bulk_callback(struct urb *urb) +{ + struct hdpvr_buffer *buf = (struct hdpvr_buffer *)urb->context; + struct hdpvr_device *dev = buf->dev; + + /* marking buffer as received and wake waiting */ + buf->status = BUFSTAT_READY; + wake_up_interruptible(&dev->wait_data); +} + +/*=========================================================================*/ +/* bufffer bits */ + +/* function expects dev->io_mutex to be hold by caller */ +int hdpvr_cancel_queue(struct hdpvr_device *dev) +{ + struct hdpvr_buffer *buf; + + list_for_each_entry(buf, &dev->rec_buff_list, buff_list) { + usb_kill_urb(buf->urb); + buf->status = BUFSTAT_AVAILABLE; + } + + list_splice_init(&dev->rec_buff_list, dev->free_buff_list.prev); + + return 0; +} + +static int hdpvr_free_queue(struct list_head *q) +{ + struct list_head *tmp; + struct list_head *p; + struct hdpvr_buffer *buf; + struct urb *urb; + + for (p = q->next; p != q;) { + buf = list_entry(p, struct hdpvr_buffer, buff_list); + + urb = buf->urb; + usb_buffer_free(urb->dev, urb->transfer_buffer_length, + urb->transfer_buffer, urb->transfer_dma); + usb_free_urb(urb); + tmp = p->next; + list_del(p); + kfree(buf); + p = tmp; + } + + return 0; +} + +/* function expects dev->io_mutex to be hold by caller */ +int hdpvr_free_buffers(struct hdpvr_device *dev) +{ + hdpvr_cancel_queue(dev); + + hdpvr_free_queue(&dev->free_buff_list); + hdpvr_free_queue(&dev->rec_buff_list); + + return 0; +} + +/* function expects dev->io_mutex to be hold by caller */ +int hdpvr_alloc_buffers(struct hdpvr_device *dev, uint count) +{ + uint i; + int retval = -ENOMEM; + u8 *mem; + struct hdpvr_buffer *buf; + struct urb *urb; + + v4l2_dbg(MSG_INFO, hdpvr_debug, dev->video_dev, + "allocating %u buffers\n", count); + + for (i = 0; i < count; i++) { + + buf = kzalloc(sizeof(struct hdpvr_buffer), GFP_KERNEL); + if (!buf) { + err("cannot allocate buffer"); + goto exit; + } + buf->dev = dev; + + urb = usb_alloc_urb(0, GFP_KERNEL); + if (!urb) { + err("cannot allocate urb"); + goto exit; + } + buf->urb = urb; + + mem = usb_buffer_alloc(dev->udev, dev->bulk_in_size, GFP_KERNEL, + &urb->transfer_dma); + if (!mem) { + err("cannot allocate usb transfer buffer"); + goto exit; + } + + usb_fill_bulk_urb(buf->urb, dev->udev, + usb_rcvbulkpipe(dev->udev, + dev->bulk_in_endpointAddr), + mem, dev->bulk_in_size, + hdpvr_read_bulk_callback, buf); + + buf->status = BUFSTAT_AVAILABLE; + list_add_tail(&buf->buff_list, &dev->free_buff_list); + } + return 0; +exit: + hdpvr_free_buffers(dev); + return retval; +} + +static int hdpvr_submit_buffers(struct hdpvr_device *dev) +{ + struct hdpvr_buffer *buf; + struct urb *urb; + int ret = 0, err_count = 0; + + mutex_lock(&dev->io_mutex); + + while (dev->status == STATUS_STREAMING && + !list_empty(&dev->free_buff_list)) { + + buf = list_entry(dev->free_buff_list.next, struct hdpvr_buffer, + buff_list); + if (buf->status != BUFSTAT_AVAILABLE) { + err("buffer not marked as availbale"); + ret = -EFAULT; + goto err; + } + + urb = 
buf->urb; + urb->status = 0; + urb->actual_length = 0; + ret = usb_submit_urb(urb, GFP_KERNEL); + if (ret) { + err("usb_submit_urb in %s returned %d", __func__, ret); + if (++err_count > 2) + break; + continue; + } + buf->status = BUFSTAT_INPROGRESS; + list_move_tail(&buf->buff_list, &dev->rec_buff_list); + } +err: + v4l2_dbg(MSG_BUFFER, hdpvr_debug, dev->video_dev, + "buffer queue stat: %d free, %d proc\n", + list_size(&dev->free_buff_list), + list_size(&dev->rec_buff_list)); + mutex_unlock(&dev->io_mutex); + return ret; +} + +static struct hdpvr_buffer *hdpvr_get_next_buffer(struct hdpvr_device *dev) +{ + struct hdpvr_buffer *buf; + + mutex_lock(&dev->io_mutex); + + if (list_empty(&dev->rec_buff_list)) { + mutex_unlock(&dev->io_mutex); + return NULL; + } + + buf = list_entry(dev->rec_buff_list.next, struct hdpvr_buffer, + buff_list); + mutex_unlock(&dev->io_mutex); + + return buf; +} + +static void hdpvr_transmit_buffers(struct work_struct *work) +{ + struct hdpvr_device *dev = container_of(work, struct hdpvr_device, + worker); + + while (dev->status == STATUS_STREAMING) { + + if (hdpvr_submit_buffers(dev)) { + v4l2_err(dev->video_dev, "couldn't submit buffers\n"); + goto error; + } + if (wait_event_interruptible(dev->wait_buffer, + !list_empty(&dev->free_buff_list) || + dev->status != STATUS_STREAMING)) + goto error; + } + + v4l2_dbg(MSG_INFO, hdpvr_debug, dev->video_dev, + "transmit worker exited\n"); + return; +error: + v4l2_dbg(MSG_INFO, hdpvr_debug, dev->video_dev, + "transmit buffers errored\n"); + dev->status = STATUS_ERROR; +} + +/* function expects dev->io_mutex to be hold by caller */ +static int hdpvr_start_streaming(struct hdpvr_device *dev) +{ + int ret; + struct hdpvr_video_info *vidinf; + + if (dev->status == STATUS_STREAMING) + return 0; + else if (dev->status != STATUS_IDLE) + return -EAGAIN; + + vidinf = get_video_info(dev); + + if (vidinf) { + v4l2_dbg(MSG_BUFFER, hdpvr_debug, dev->video_dev, + "video signal: %dx%d@%dhz\n", vidinf->width, + vidinf->height, vidinf->fps); + kfree(vidinf); + + /* start streaming 2 request */ + ret = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + 0xb8, 0x38, 0x1, 0, NULL, 0, 8000); + v4l2_dbg(MSG_BUFFER, hdpvr_debug, dev->video_dev, + "encoder start control request returned %d\n", ret); + + hdpvr_config_call(dev, CTRL_START_STREAMING_VALUE, 0x00); + + INIT_WORK(&dev->worker, hdpvr_transmit_buffers); + queue_work(dev->workqueue, &dev->worker); + + v4l2_dbg(MSG_BUFFER, hdpvr_debug, dev->video_dev, + "streaming started\n"); + dev->status = STATUS_STREAMING; + + return 0; + } + msleep(250); + v4l2_dbg(MSG_INFO, hdpvr_debug, dev->video_dev, + "no video signal at input %d\n", dev->options.video_input); + return -EAGAIN; +} + + +/* function expects dev->io_mutex to be hold by caller */ +static int hdpvr_stop_streaming(struct hdpvr_device *dev) +{ + if (dev->status == STATUS_IDLE) + return 0; + else if (dev->status != STATUS_STREAMING) + return -EAGAIN; + + dev->status = STATUS_SHUTTING_DOWN; + hdpvr_config_call(dev, CTRL_STOP_STREAMING_VALUE, 0x00); + + wake_up_interruptible(&dev->wait_buffer); + msleep(50); + + flush_workqueue(dev->workqueue); + + /* kill the still outstanding urbs */ + hdpvr_cancel_queue(dev); + + dev->status = STATUS_IDLE; + + return 0; +} + + +/*=======================================================================*/ +/* + * video 4 linux 2 file operations + */ + +static int hdpvr_open(struct file *file) +{ + struct hdpvr_device *dev; + struct hdpvr_fh *fh; + int retval = -ENOMEM; + + dev = (struct 
hdpvr_device *)video_get_drvdata(video_devdata(file)); + if (!dev) { + err("open failing with with ENODEV"); + retval = -ENODEV; + goto err; + } + + fh = kzalloc(sizeof(struct hdpvr_fh), GFP_KERNEL); + if (!fh) { + err("Out of memory?"); + goto err; + } + /* lock the device to allow correctly handling errors + * in resumption */ + mutex_lock(&dev->io_mutex); + dev->open_count++; + + fh->dev = dev; + + /* save our object in the file's private structure */ + file->private_data = fh; + + retval = 0; +err: + mutex_unlock(&dev->io_mutex); + return retval; +} + +static int hdpvr_release(struct file *file) +{ + struct hdpvr_fh *fh = (struct hdpvr_fh *)file->private_data; + struct hdpvr_device *dev = fh->dev; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->io_mutex); + if (!(--dev->open_count) && dev->status == STATUS_STREAMING) + hdpvr_stop_streaming(dev); + + mutex_unlock(&dev->io_mutex); + + return 0; +} + +/* + * hdpvr_v4l2_read() + * will allocate buffers when called for the first time + */ +static ssize_t hdpvr_read(struct file *file, char __user *buffer, size_t count, + loff_t *pos) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + struct hdpvr_buffer *buf = NULL; + struct urb *urb; + unsigned int ret = 0; + int rem, cnt; + + if (*pos) + return -ESPIPE; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->io_mutex); + if (dev->status == STATUS_IDLE) { + if (hdpvr_start_streaming(dev)) { + v4l2_dbg(MSG_INFO, hdpvr_debug, dev->video_dev, + "start_streaming failed"); + ret = -EIO; + msleep(200); + dev->status = STATUS_IDLE; + mutex_unlock(&dev->io_mutex); + goto err; + } + + v4l2_dbg(MSG_BUFFER, hdpvr_debug, dev->video_dev, + "buffer queue stat: %d free, %d proc\n", + list_size(&dev->free_buff_list), + list_size(&dev->rec_buff_list)); + } + mutex_unlock(&dev->io_mutex); + + /* wait for the first buffer */ + if (!(file->f_flags & O_NONBLOCK)) { + if (wait_event_interruptible(dev->wait_data, + hdpvr_get_next_buffer(dev))) + return -ERESTARTSYS; + } + + buf = hdpvr_get_next_buffer(dev); + + while (count > 0 && buf) { + + if (buf->status != BUFSTAT_READY && + dev->status != STATUS_DISCONNECTED) { + /* return nonblocking */ + if (file->f_flags & O_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + goto err; + } + + if (wait_event_interruptible(dev->wait_data, + buf->status == BUFSTAT_READY)) { + ret = -ERESTARTSYS; + goto err; + } + } + + if (buf->status != BUFSTAT_READY) + break; + + /* set remaining bytes to copy */ + urb = buf->urb; + rem = urb->actual_length - buf->pos; + cnt = rem > count ? 
count : rem; + + if (copy_to_user(buffer, urb->transfer_buffer + buf->pos, + cnt)) { + err("read: copy_to_user failed"); + if (!ret) + ret = -EFAULT; + goto err; + } + + buf->pos += cnt; + count -= cnt; + buffer += cnt; + ret += cnt; + + /* finished, take next buffer */ + if (buf->pos == urb->actual_length) { + mutex_lock(&dev->io_mutex); + buf->pos = 0; + buf->status = BUFSTAT_AVAILABLE; + + list_move_tail(&buf->buff_list, &dev->free_buff_list); + + v4l2_dbg(MSG_BUFFER, hdpvr_debug, dev->video_dev, + "buffer queue stat: %d free, %d proc\n", + list_size(&dev->free_buff_list), + list_size(&dev->rec_buff_list)); + + mutex_unlock(&dev->io_mutex); + + wake_up_interruptible(&dev->wait_buffer); + + buf = hdpvr_get_next_buffer(dev); + } + } +err: + if (!ret && !buf) + ret = -EAGAIN; + return ret; +} + +static unsigned int hdpvr_poll(struct file *filp, poll_table *wait) +{ + struct hdpvr_fh *fh = (struct hdpvr_fh *)filp->private_data; + struct hdpvr_device *dev = fh->dev; + unsigned int mask = 0; + + mutex_lock(&dev->io_mutex); + + if (video_is_unregistered(dev->video_dev)) + return -EIO; + + if (dev->status == STATUS_IDLE) { + if (hdpvr_start_streaming(dev)) { + v4l2_dbg(MSG_BUFFER, hdpvr_debug, dev->video_dev, + "start_streaming failed"); + dev->status = STATUS_IDLE; + } + + v4l2_dbg(MSG_BUFFER, hdpvr_debug, dev->video_dev, + "buffer queue stat: %d free, %d proc\n", + list_size(&dev->free_buff_list), + list_size(&dev->rec_buff_list)); + } + mutex_unlock(&dev->io_mutex); + + poll_wait(filp, &dev->wait_data, wait); + + mutex_lock(&dev->io_mutex); + if (!list_empty(&dev->rec_buff_list)) { + + struct hdpvr_buffer *buf = list_entry(dev->rec_buff_list.next, + struct hdpvr_buffer, + buff_list); + + if (buf->status == BUFSTAT_READY) + mask |= POLLIN | POLLRDNORM; + } + mutex_unlock(&dev->io_mutex); + + return mask; +} + + +static long hdpvr_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct hdpvr_fh *fh = (struct hdpvr_fh *)filp->private_data; + struct hdpvr_device *dev = fh->dev; + int res; + + if (video_is_unregistered(dev->video_dev)) + return -EIO; + + mutex_lock(&dev->io_mutex); + switch (cmd) { + case VIDIOC_TRY_ENCODER_CMD: + case VIDIOC_ENCODER_CMD: { + struct v4l2_encoder_cmd *enc = (struct v4l2_encoder_cmd *)arg; + int try = cmd == VIDIOC_TRY_ENCODER_CMD; + + memset(&enc->raw, 0, sizeof(enc->raw)); + switch (enc->cmd) { + case V4L2_ENC_CMD_START: + enc->flags = 0; + if (try) + return 0; + res = hdpvr_start_streaming(dev); + break; + case V4L2_ENC_CMD_STOP: + if (try) + return 0; + res = hdpvr_stop_streaming(dev); + break; + default: + v4l2_dbg(MSG_INFO, hdpvr_debug, dev->video_dev, + "Unsupported encoder cmd %d\n", enc->cmd); + return -EINVAL; + } + break; + } + default: + res = video_ioctl2(filp, cmd, arg); + } + mutex_unlock(&dev->io_mutex); + return res; +} + +static const struct v4l2_file_operations hdpvr_fops = { + .owner = THIS_MODULE, + .open = hdpvr_open, + .release = hdpvr_release, + .read = hdpvr_read, + .poll = hdpvr_poll, + .unlocked_ioctl = hdpvr_ioctl, +}; + +/*=======================================================================*/ +/* + * V4L2 ioctl handling + */ + +static int vidioc_querycap(struct file *file, void *priv, + struct v4l2_capability *cap) +{ + struct hdpvr_device *dev = video_drvdata(file); + + strcpy(cap->driver, "hdpvr"); + strcpy(cap->card, "Haupauge HD PVR"); + usb_make_path(dev->udev, cap->bus_info, sizeof(cap->bus_info)); + cap->version = HDPVR_VERSION; + cap->capabilities = V4L2_CAP_VIDEO_CAPTURE | + V4L2_CAP_AUDIO | + 
V4L2_CAP_READWRITE; + return 0; +} + +static int vidioc_s_std(struct file *file, void *private_data, + v4l2_std_id *std) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + u8 std_type = 1; + + if (*std & (V4L2_STD_NTSC | V4L2_STD_PAL_60)) + std_type = 0; + + return hdpvr_config_call(dev, CTRL_VIDEO_STD_TYPE, std_type); +} + +static const char *iname[] = { + [HDPVR_COMPONENT] = "Component", + [HDPVR_SVIDEO] = "S-Video", + [HDPVR_COMPOSITE] = "Composite", +}; + +static int vidioc_enum_input(struct file *file, void *priv, + struct v4l2_input *i) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + unsigned int n; + + n = i->index; + if (n >= HDPVR_VIDEO_INPUTS) + return -EINVAL; + + i->type = V4L2_INPUT_TYPE_CAMERA; + + strncpy(i->name, iname[n], sizeof(i->name) - 1); + i->name[sizeof(i->name) - 1] = '\0'; + + i->audioset = 1<std = dev->video_dev->tvnorms; + + return 0; +} + +static int vidioc_s_input(struct file *file, void *private_data, + unsigned int index) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + int retval; + + if (index >= HDPVR_VIDEO_INPUTS) + return -EINVAL; + + if (dev->status != STATUS_IDLE) + return -EAGAIN; + + retval = hdpvr_config_call(dev, CTRL_VIDEO_INPUT_VALUE, index+1); + if (!retval) + dev->options.video_input = index; + + return retval; +} + +static int vidioc_g_input(struct file *file, void *private_data, + unsigned int *index) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + + *index = dev->options.video_input; + return 0; +} + + +static const char *audio_iname[] = { + [HDPVR_RCA_FRONT] = "RCA front", + [HDPVR_RCA_BACK] = "RCA back", + [HDPVR_SPDIF] = "SPDIF", +}; + +static int vidioc_enumaudio(struct file *file, void *priv, + struct v4l2_audio *audio) +{ + unsigned int n; + + n = audio->index; + if (n >= HDPVR_AUDIO_INPUTS) + return -EINVAL; + + audio->capability = V4L2_AUDCAP_STEREO; + + strncpy(audio->name, audio_iname[n], sizeof(audio->name) - 1); + audio->name[sizeof(audio->name) - 1] = '\0'; + + return 0; +} + +static int vidioc_s_audio(struct file *file, void *private_data, + struct v4l2_audio *audio) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + int retval; + + if (audio->index >= HDPVR_AUDIO_INPUTS) + return -EINVAL; + + if (dev->status != STATUS_IDLE) + return -EAGAIN; + + retval = hdpvr_set_audio(dev, audio->index+1, dev->options.audio_codec); + if (!retval) + dev->options.audio_input = audio->index; + + return retval; +} + +static int vidioc_g_audio(struct file *file, void *private_data, + struct v4l2_audio *audio) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + + audio->index = dev->options.audio_input; + audio->capability = V4L2_AUDCAP_STEREO; + strncpy(audio->name, audio_iname[audio->index], sizeof(audio->name)); + audio->name[sizeof(audio->name) - 1] = '\0'; + return 0; +} + +static const s32 supported_v4l2_ctrls[] = { + V4L2_CID_BRIGHTNESS, + V4L2_CID_CONTRAST, + V4L2_CID_SATURATION, + V4L2_CID_HUE, + V4L2_CID_SHARPNESS, + V4L2_CID_MPEG_AUDIO_ENCODING, + V4L2_CID_MPEG_VIDEO_ENCODING, + V4L2_CID_MPEG_VIDEO_BITRATE_MODE, + V4L2_CID_MPEG_VIDEO_BITRATE, + V4L2_CID_MPEG_VIDEO_BITRATE_PEAK, +}; + +static int fill_queryctrl(struct hdpvr_options *opt, struct v4l2_queryctrl *qc, + int ac3) +{ + int err; + + switch (qc->id) { + case V4L2_CID_BRIGHTNESS: + return v4l2_ctrl_query_fill(qc, 0x0, 0xff, 1, 0x86); + case V4L2_CID_CONTRAST: 
+ return v4l2_ctrl_query_fill(qc, 0x0, 0xff, 1, 0x80); + case V4L2_CID_SATURATION: + return v4l2_ctrl_query_fill(qc, 0x0, 0xff, 1, 0x80); + case V4L2_CID_HUE: + return v4l2_ctrl_query_fill(qc, 0x0, 0xff, 1, 0x80); + case V4L2_CID_SHARPNESS: + return v4l2_ctrl_query_fill(qc, 0x0, 0xff, 1, 0x80); + case V4L2_CID_MPEG_AUDIO_ENCODING: + return v4l2_ctrl_query_fill( + qc, V4L2_MPEG_AUDIO_ENCODING_AAC, + ac3 ? V4L2_MPEG_AUDIO_ENCODING_AC3 + : V4L2_MPEG_AUDIO_ENCODING_AAC, + 1, V4L2_MPEG_AUDIO_ENCODING_AAC); + case V4L2_CID_MPEG_VIDEO_ENCODING: + return v4l2_ctrl_query_fill( + qc, V4L2_MPEG_VIDEO_ENCODING_MPEG_4_AVC, + V4L2_MPEG_VIDEO_ENCODING_MPEG_4_AVC, 1, + V4L2_MPEG_VIDEO_ENCODING_MPEG_4_AVC); + +/* case V4L2_CID_MPEG_VIDEO_? maybe keyframe interval: */ +/* return v4l2_ctrl_query_fill(qc, 0, 128, 128, 0); */ + case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: + return v4l2_ctrl_query_fill( + qc, V4L2_MPEG_VIDEO_BITRATE_MODE_VBR, + V4L2_MPEG_VIDEO_BITRATE_MODE_CBR, 1, + V4L2_MPEG_VIDEO_BITRATE_MODE_CBR); + + case V4L2_CID_MPEG_VIDEO_BITRATE: + return v4l2_ctrl_query_fill(qc, 1000000, 13500000, 100000, + 6500000); + case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: + err = v4l2_ctrl_query_fill(qc, 1100000, 20200000, 100000, + 9000000); + if (!err && opt->bitrate_mode == HDPVR_CONSTANT) + qc->flags |= V4L2_CTRL_FLAG_INACTIVE; + return err; + default: + return -EINVAL; + } +} + +static int vidioc_queryctrl(struct file *file, void *private_data, + struct v4l2_queryctrl *qc) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + int i, next; + u32 id = qc->id; + + memset(qc, 0, sizeof(*qc)); + + next = !!(id & V4L2_CTRL_FLAG_NEXT_CTRL); + qc->id = id & ~V4L2_CTRL_FLAG_NEXT_CTRL; + + for (i = 0; i < ARRAY_SIZE(supported_v4l2_ctrls); i++) { + if (next) { + if (qc->id < supported_v4l2_ctrls[i]) + qc->id = supported_v4l2_ctrls[i]; + else + continue; + } + + if (qc->id == supported_v4l2_ctrls[i]) + return fill_queryctrl(&dev->options, qc, + dev->flags & HDPVR_FLAG_AC3_CAP); + + if (qc->id < supported_v4l2_ctrls[i]) + break; + } + + return -EINVAL; +} + +static int vidioc_g_ctrl(struct file *file, void *private_data, + struct v4l2_control *ctrl) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + + switch (ctrl->id) { + case V4L2_CID_BRIGHTNESS: + ctrl->value = dev->options.brightness; + break; + case V4L2_CID_CONTRAST: + ctrl->value = dev->options.contrast; + break; + case V4L2_CID_SATURATION: + ctrl->value = dev->options.saturation; + break; + case V4L2_CID_HUE: + ctrl->value = dev->options.hue; + break; + case V4L2_CID_SHARPNESS: + ctrl->value = dev->options.sharpness; + break; + default: + return -EINVAL; + } + return 0; +} + +static int vidioc_s_ctrl(struct file *file, void *private_data, + struct v4l2_control *ctrl) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + int retval; + + switch (ctrl->id) { + case V4L2_CID_BRIGHTNESS: + retval = hdpvr_config_call(dev, CTRL_BRIGHTNESS, ctrl->value); + if (!retval) + dev->options.brightness = ctrl->value; + break; + case V4L2_CID_CONTRAST: + retval = hdpvr_config_call(dev, CTRL_CONTRAST, ctrl->value); + if (!retval) + dev->options.contrast = ctrl->value; + break; + case V4L2_CID_SATURATION: + retval = hdpvr_config_call(dev, CTRL_SATURATION, ctrl->value); + if (!retval) + dev->options.saturation = ctrl->value; + break; + case V4L2_CID_HUE: + retval = hdpvr_config_call(dev, CTRL_HUE, ctrl->value); + if (!retval) + dev->options.hue = ctrl->value; + break; + case 
V4L2_CID_SHARPNESS: + retval = hdpvr_config_call(dev, CTRL_SHARPNESS, ctrl->value); + if (!retval) + dev->options.sharpness = ctrl->value; + break; + default: + return -EINVAL; + } + + return retval; +} + + +static int hdpvr_get_ctrl(struct hdpvr_options *opt, + struct v4l2_ext_control *ctrl) +{ + switch (ctrl->id) { + case V4L2_CID_MPEG_AUDIO_ENCODING: + ctrl->value = opt->audio_codec; + break; + case V4L2_CID_MPEG_VIDEO_ENCODING: + ctrl->value = V4L2_MPEG_VIDEO_ENCODING_MPEG_4_AVC; + break; +/* case V4L2_CID_MPEG_VIDEO_B_FRAMES: */ +/* ctrl->value = (opt->gop_mode & 0x2) ? 0 : 128; */ +/* break; */ + case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: + ctrl->value = opt->bitrate_mode == HDPVR_CONSTANT + ? V4L2_MPEG_VIDEO_BITRATE_MODE_CBR + : V4L2_MPEG_VIDEO_BITRATE_MODE_VBR; + break; + case V4L2_CID_MPEG_VIDEO_BITRATE: + ctrl->value = opt->bitrate * 100000; + break; + case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: + ctrl->value = opt->peak_bitrate * 100000; + break; + case V4L2_CID_MPEG_STREAM_TYPE: + ctrl->value = V4L2_MPEG_STREAM_TYPE_MPEG2_TS; + break; + default: + return -EINVAL; + } + return 0; +} + +static int vidioc_g_ext_ctrls(struct file *file, void *priv, + struct v4l2_ext_controls *ctrls) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + int i, err = 0; + + if (ctrls->ctrl_class == V4L2_CTRL_CLASS_MPEG) { + for (i = 0; i < ctrls->count; i++) { + struct v4l2_ext_control *ctrl = ctrls->controls + i; + + err = hdpvr_get_ctrl(&dev->options, ctrl); + if (err) { + ctrls->error_idx = i; + break; + } + } + return err; + + } + + return -EINVAL; +} + + +static int hdpvr_try_ctrl(struct v4l2_ext_control *ctrl, int ac3) +{ + int ret = -EINVAL; + + switch (ctrl->id) { + case V4L2_CID_MPEG_AUDIO_ENCODING: + if (ctrl->value == V4L2_MPEG_AUDIO_ENCODING_AAC || + (ac3 && ctrl->value == V4L2_MPEG_AUDIO_ENCODING_AC3)) + ret = 0; + break; + case V4L2_CID_MPEG_VIDEO_ENCODING: + if (ctrl->value == V4L2_MPEG_VIDEO_ENCODING_MPEG_4_AVC) + ret = 0; + break; +/* case V4L2_CID_MPEG_VIDEO_B_FRAMES: */ +/* if (ctrl->value == 0 || ctrl->value == 128) */ +/* ret = 0; */ +/* break; */ + case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: + if (ctrl->value == V4L2_MPEG_VIDEO_BITRATE_MODE_CBR || + ctrl->value == V4L2_MPEG_VIDEO_BITRATE_MODE_VBR) + ret = 0; + break; + case V4L2_CID_MPEG_VIDEO_BITRATE: + { + uint bitrate = ctrl->value / 100000; + if (bitrate >= 10 && bitrate <= 135) + ret = 0; + break; + } + case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: + { + uint peak_bitrate = ctrl->value / 100000; + if (peak_bitrate >= 10 && peak_bitrate <= 202) + ret = 0; + break; + } + case V4L2_CID_MPEG_STREAM_TYPE: + if (ctrl->value == V4L2_MPEG_STREAM_TYPE_MPEG2_TS) + ret = 0; + break; + default: + return -EINVAL; + } + return 0; +} + +static int vidioc_try_ext_ctrls(struct file *file, void *priv, + struct v4l2_ext_controls *ctrls) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + int i, err = 0; + + if (ctrls->ctrl_class == V4L2_CTRL_CLASS_MPEG) { + for (i = 0; i < ctrls->count; i++) { + struct v4l2_ext_control *ctrl = ctrls->controls + i; + + err = hdpvr_try_ctrl(ctrl, + dev->flags & HDPVR_FLAG_AC3_CAP); + if (err) { + ctrls->error_idx = i; + break; + } + } + return err; + } + + return -EINVAL; +} + + +static int hdpvr_set_ctrl(struct hdpvr_device *dev, + struct v4l2_ext_control *ctrl) +{ + struct hdpvr_options *opt = &dev->options; + int ret = 0; + + switch (ctrl->id) { + case V4L2_CID_MPEG_AUDIO_ENCODING: + if (dev->flags & HDPVR_FLAG_AC3_CAP) { + opt->audio_codec = ctrl->value; + ret 
= hdpvr_set_audio(dev, opt->audio_input, + opt->audio_codec); + } + break; + case V4L2_CID_MPEG_VIDEO_ENCODING: + break; +/* case V4L2_CID_MPEG_VIDEO_B_FRAMES: */ +/* if (ctrl->value == 0 && !(opt->gop_mode & 0x2)) { */ +/* opt->gop_mode |= 0x2; */ +/* hdpvr_config_call(dev, CTRL_GOP_MODE_VALUE, */ +/* opt->gop_mode); */ +/* } */ +/* if (ctrl->value == 128 && opt->gop_mode & 0x2) { */ +/* opt->gop_mode &= ~0x2; */ +/* hdpvr_config_call(dev, CTRL_GOP_MODE_VALUE, */ +/* opt->gop_mode); */ +/* } */ +/* break; */ + case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: + if (ctrl->value == V4L2_MPEG_VIDEO_BITRATE_MODE_CBR && + opt->bitrate_mode != HDPVR_CONSTANT) { + opt->bitrate_mode = HDPVR_CONSTANT; + hdpvr_config_call(dev, CTRL_BITRATE_MODE_VALUE, + opt->bitrate_mode); + } + if (ctrl->value == V4L2_MPEG_VIDEO_BITRATE_MODE_VBR && + opt->bitrate_mode == HDPVR_CONSTANT) { + opt->bitrate_mode = HDPVR_VARIABLE_AVERAGE; + hdpvr_config_call(dev, CTRL_BITRATE_MODE_VALUE, + opt->bitrate_mode); + } + break; + case V4L2_CID_MPEG_VIDEO_BITRATE: { + uint bitrate = ctrl->value / 100000; + + opt->bitrate = bitrate; + if (bitrate >= opt->peak_bitrate) + opt->peak_bitrate = bitrate+1; + + hdpvr_set_bitrate(dev); + break; + } + case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: { + uint peak_bitrate = ctrl->value / 100000; + + if (opt->bitrate_mode == HDPVR_CONSTANT) + break; + + if (opt->bitrate < peak_bitrate) { + opt->peak_bitrate = peak_bitrate; + hdpvr_set_bitrate(dev); + } else + ret = -EINVAL; + break; + } + case V4L2_CID_MPEG_STREAM_TYPE: + break; + default: + return -EINVAL; + } + return ret; +} + +static int vidioc_s_ext_ctrls(struct file *file, void *priv, + struct v4l2_ext_controls *ctrls) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + int i, err = 0; + + if (ctrls->ctrl_class == V4L2_CTRL_CLASS_MPEG) { + for (i = 0; i < ctrls->count; i++) { + struct v4l2_ext_control *ctrl = ctrls->controls + i; + + err = hdpvr_try_ctrl(ctrl, + dev->flags & HDPVR_FLAG_AC3_CAP); + if (err) { + ctrls->error_idx = i; + break; + } + err = hdpvr_set_ctrl(dev, ctrl); + if (err) { + ctrls->error_idx = i; + break; + } + } + return err; + + } + + return -EINVAL; +} + +static int vidioc_enum_fmt_vid_cap(struct file *file, void *private_data, + struct v4l2_fmtdesc *f) +{ + + if (f->index != 0 || f->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) + return -EINVAL; + + f->flags = V4L2_FMT_FLAG_COMPRESSED; + strncpy(f->description, "MPEG2-TS with AVC/AAC streams", 32); + f->pixelformat = V4L2_PIX_FMT_MPEG; + + return 0; +} + +static int vidioc_g_fmt_vid_cap(struct file *file, void *private_data, + struct v4l2_format *f) +{ + struct hdpvr_fh *fh = file->private_data; + struct hdpvr_device *dev = fh->dev; + struct hdpvr_video_info *vid_info; + + if (!dev) + return -ENODEV; + + vid_info = get_video_info(dev); + if (!vid_info) + return -EFAULT; + + f->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + f->fmt.pix.pixelformat = V4L2_PIX_FMT_MPEG; + f->fmt.pix.width = vid_info->width; + f->fmt.pix.height = vid_info->height; + f->fmt.pix.sizeimage = dev->bulk_in_size; + f->fmt.pix.colorspace = 0; + f->fmt.pix.bytesperline = 0; + f->fmt.pix.field = V4L2_FIELD_ANY; + + kfree(vid_info); + return 0; +} + + +static const struct v4l2_ioctl_ops hdpvr_ioctl_ops = { + .vidioc_querycap = vidioc_querycap, + .vidioc_s_std = vidioc_s_std, + .vidioc_enum_input = vidioc_enum_input, + .vidioc_g_input = vidioc_g_input, + .vidioc_s_input = vidioc_s_input, + .vidioc_enumaudio = vidioc_enumaudio, + .vidioc_g_audio = vidioc_g_audio, + .vidioc_s_audio = 
vidioc_s_audio, + .vidioc_queryctrl = vidioc_queryctrl, + .vidioc_g_ctrl = vidioc_g_ctrl, + .vidioc_s_ctrl = vidioc_s_ctrl, + .vidioc_g_ext_ctrls = vidioc_g_ext_ctrls, + .vidioc_s_ext_ctrls = vidioc_s_ext_ctrls, + .vidioc_try_ext_ctrls = vidioc_try_ext_ctrls, + .vidioc_enum_fmt_vid_cap = vidioc_enum_fmt_vid_cap, + .vidioc_g_fmt_vid_cap = vidioc_g_fmt_vid_cap, +}; + +static void hdpvr_device_release(struct video_device *vdev) +{ + struct hdpvr_device *dev = video_get_drvdata(vdev); + + hdpvr_delete(dev); +} + +static const struct video_device hdpvr_video_template = { +/* .type = VFL_TYPE_GRABBER, */ +/* .type2 = VID_TYPE_CAPTURE | VID_TYPE_MPEG_ENCODER, */ + .fops = &hdpvr_fops, + .release = hdpvr_device_release, + .ioctl_ops = &hdpvr_ioctl_ops, + .tvnorms = + V4L2_STD_NTSC | V4L2_STD_SECAM | V4L2_STD_PAL_B | + V4L2_STD_PAL_G | V4L2_STD_PAL_H | V4L2_STD_PAL_I | + V4L2_STD_PAL_D | V4L2_STD_PAL_M | V4L2_STD_PAL_N | + V4L2_STD_PAL_60, +}; + +int hdpvr_register_videodev(struct hdpvr_device *dev, int devnum) +{ + /* setup and register video device */ + dev->video_dev = video_device_alloc(); + if (!dev->video_dev) { + err("video_device_alloc() failed"); + goto error; + } + + *(dev->video_dev) = hdpvr_video_template; + strcpy(dev->video_dev->name, "Hauppauge HD PVR"); + dev->video_dev->parent = &dev->udev->dev; + video_set_drvdata(dev->video_dev, dev); + + if (video_register_device(dev->video_dev, VFL_TYPE_GRABBER, devnum)) { + err("V4L2 device registration failed"); + goto error; + } + + return 0; +error: + return -ENOMEM; +} diff --git a/drivers/media/video/hdpvr/hdpvr.h b/drivers/media/video/hdpvr/hdpvr.h new file mode 100644 index 000000000000..17db74feb884 --- /dev/null +++ b/drivers/media/video/hdpvr/hdpvr.h @@ -0,0 +1,298 @@ +/* + * Hauppage HD PVR USB driver + * + * Copyright (C) 2008 Janne Grunau (j@jannau.net) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2. 
+ * + */ + +#include +#include +#include +#include +#include + +#define HDPVR_MAJOR_VERSION 0 +#define HDPVR_MINOR_VERSION 2 +#define HDPVR_RELEASE 0 +#define HDPVR_VERSION \ + KERNEL_VERSION(HDPVR_MAJOR_VERSION, HDPVR_MINOR_VERSION, HDPVR_RELEASE) + +#define HDPVR_MAX 8 + +/* Define these values to match your devices */ +#define HD_PVR_VENDOR_ID 0x2040 +#define HD_PVR_PRODUCT_ID 0x4900 +#define HD_PVR_PRODUCT_ID1 0x4901 +#define HD_PVR_PRODUCT_ID2 0x4902 + +#define UNSET (-1U) + +#define NUM_BUFFERS 64 + +#define HDPVR_FIRMWARE_VERSION 0x8 +#define HDPVR_FIRMWARE_VERSION_AC3 0xd + +/* #define HDPVR_DEBUG */ + +extern int hdpvr_debug; + +#define MSG_INFO 1 +#define MSG_BUFFER 2 + +struct hdpvr_options { + u8 video_std; + u8 video_input; + u8 audio_input; + u8 bitrate; /* in 100kbps */ + u8 peak_bitrate; /* in 100kbps */ + u8 bitrate_mode; + u8 gop_mode; + enum v4l2_mpeg_audio_encoding audio_codec; + u8 brightness; + u8 contrast; + u8 hue; + u8 saturation; + u8 sharpness; +}; + +/* Structure to hold all of our device specific stuff */ +struct hdpvr_device { + /* the v4l device for this device */ + struct video_device *video_dev; + /* the usb device for this device */ + struct usb_device *udev; + + /* the max packet size of the bulk endpoint */ + size_t bulk_in_size; + /* the address of the bulk in endpoint */ + __u8 bulk_in_endpointAddr; + + /* holds the current device status */ + __u8 status; + /* count the number of openers */ + uint open_count; + + /* holds the cureent set options */ + struct hdpvr_options options; + + uint flags; + + /* synchronize I/O */ + struct mutex io_mutex; + /* available buffers */ + struct list_head free_buff_list; + /* in progress buffers */ + struct list_head rec_buff_list; + /* waitqueue for buffers */ + wait_queue_head_t wait_buffer; + /* waitqueue for data */ + wait_queue_head_t wait_data; + /**/ + struct workqueue_struct *workqueue; + /**/ + struct work_struct worker; + + /* I2C adapter */ + struct i2c_adapter *i2c_adapter; + /* I2C lock */ + struct mutex i2c_mutex; + + /* usb control transfer buffer and lock */ + struct mutex usbc_mutex; + u8 *usbc_buf; +}; + + +/* buffer one bulk urb of data */ +struct hdpvr_buffer { + struct list_head buff_list; + + struct urb *urb; + + struct hdpvr_device *dev; + + uint pos; + + __u8 status; +}; + +/* */ + +struct hdpvr_video_info { + u16 width; + u16 height; + u8 fps; +}; + +enum { + STATUS_UNINITIALIZED = 0, + STATUS_IDLE, + STATUS_STARTING, + STATUS_SHUTTING_DOWN, + STATUS_STREAMING, + STATUS_ERROR, + STATUS_DISCONNECTED, +}; + +enum { + HDPVR_FLAG_AC3_CAP = 1, +}; + +enum { + BUFSTAT_UNINITIALIZED = 0, + BUFSTAT_AVAILABLE, + BUFSTAT_INPROGRESS, + BUFSTAT_READY, +}; + +#define CTRL_START_STREAMING_VALUE 0x0700 +#define CTRL_STOP_STREAMING_VALUE 0x0800 +#define CTRL_BITRATE_VALUE 0x1000 +#define CTRL_BITRATE_MODE_VALUE 0x1200 +#define CTRL_GOP_MODE_VALUE 0x1300 +#define CTRL_VIDEO_INPUT_VALUE 0x1500 +#define CTRL_VIDEO_STD_TYPE 0x1700 +#define CTRL_AUDIO_INPUT_VALUE 0x2500 +#define CTRL_BRIGHTNESS 0x2900 +#define CTRL_CONTRAST 0x2a00 +#define CTRL_HUE 0x2b00 +#define CTRL_SATURATION 0x2c00 +#define CTRL_SHARPNESS 0x2d00 +#define CTRL_LOW_PASS_FILTER_VALUE 0x3100 + +#define CTRL_DEFAULT_INDEX 0x0003 + + + /* :0 s 38 01 1000 0003 0004 4 = 0a00ca00 + * BITRATE SETTING + * 1st and 2nd byte (little endian): average bitrate in 100 000 bit/s + * min: 1 mbit/s, max: 13.5 mbit/s + * 3rd and 4th byte (little endian): peak bitrate in 100 000 bit/s + * min: average + 100kbit/s, + * max: 20.2 mbit/s + */ + + /* :0 s 38 01 1200 
0003 0001 1 = 02 + * BIT RATE MODE + * constant = 1, variable (peak) = 2, variable (average) = 3 + */ + + /* :0 s 38 01 1300 0003 0001 1 = 03 + * GOP MODE (2 bit) + * low bit 0/1: advanced/simple GOP + * high bit 0/1: IDR(4/32/128) / no IDR (4/32/0) + */ + + /* :0 s 38 01 1700 0003 0001 1 = 00 + * VIDEO STANDARD or FREQUNCY 0 = 60hz, 1 = 50hz + */ + + /* :0 s 38 01 3100 0003 0004 4 = 03030000 + * FILTER CONTROL + * 1st byte luma low pass filter strength, + * 2nd byte chroma low pass filter strength, + * 3rd byte MF enable chroma, min=0, max=1 + * 4th byte n + */ + + + /* :0 s 38 b9 0001 0000 0000 0 */ + + + +/* :0 s 38 d3 0000 0000 0001 1 = 00 */ +/* ret = usb_control_msg(dev->udev, */ +/* usb_sndctrlpipe(dev->udev, 0), */ +/* 0xd3, 0x38, */ +/* 0, 0, */ +/* "\0", 1, */ +/* 1000); */ + +/* info("control request returned %d", ret); */ +/* msleep(5000); */ + + + /* :0 s b8 81 1400 0003 0005 5 < + * :0 0 5 = d0024002 19 + * QUERY FRAME SIZE AND RATE + * 1st and 2nd byte (little endian): horizontal resolution + * 3rd and 4th byte (little endian): vertical resolution + * 5th byte: frame rate + */ + + /* :0 s b8 81 1800 0003 0003 3 < + * :0 0 3 = 030104 + * QUERY SIGNAL AND DETECTED LINES, maybe INPUT + */ + +enum hdpvr_video_std { + HDPVR_60HZ = 0, + HDPVR_50HZ, +}; + +enum hdpvr_video_input { + HDPVR_COMPONENT = 0, + HDPVR_SVIDEO, + HDPVR_COMPOSITE, + HDPVR_VIDEO_INPUTS +}; + +enum hdpvr_audio_inputs { + HDPVR_RCA_BACK = 0, + HDPVR_RCA_FRONT, + HDPVR_SPDIF, + HDPVR_AUDIO_INPUTS +}; + +enum hdpvr_bitrate_mode { + HDPVR_CONSTANT = 1, + HDPVR_VARIABLE_PEAK, + HDPVR_VARIABLE_AVERAGE, +}; + +enum hdpvr_gop_mode { + HDPVR_ADVANCED_IDR_GOP = 0, + HDPVR_SIMPLE_IDR_GOP, + HDPVR_ADVANCED_NOIDR_GOP, + HDPVR_SIMPLE_NOIDR_GOP, +}; + +void hdpvr_delete(struct hdpvr_device *dev); + +/*========================================================================*/ +/* hardware control functions */ +int hdpvr_set_options(struct hdpvr_device *dev); + +int hdpvr_set_bitrate(struct hdpvr_device *dev); + +int hdpvr_set_audio(struct hdpvr_device *dev, u8 input, + enum v4l2_mpeg_audio_encoding codec); + +int hdpvr_config_call(struct hdpvr_device *dev, uint value, + unsigned char valbuf); + +struct hdpvr_video_info *get_video_info(struct hdpvr_device *dev); + +/* :0 s b8 81 1800 0003 0003 3 < */ +/* :0 0 3 = 0301ff */ +int get_input_lines_info(struct hdpvr_device *dev); + + +/*========================================================================*/ +/* v4l2 registration */ +int hdpvr_register_videodev(struct hdpvr_device *dev, int devnumber); + +int hdpvr_cancel_queue(struct hdpvr_device *dev); + +/*========================================================================*/ +/* i2c adapter registration */ +int hdpvr_register_i2c_adapter(struct hdpvr_device *dev); + +/*========================================================================*/ +/* buffer management */ +int hdpvr_free_buffers(struct hdpvr_device *dev); +int hdpvr_alloc_buffers(struct hdpvr_device *dev, uint count); diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index 17d9af070f06..f27604af8378 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -88,6 +88,7 @@ #define I2C_HW_B_CX2341X 0x010020 /* Conexant CX2341X MPEG encoder cards */ #define I2C_HW_B_CX23885 0x010022 /* conexant 23885 based tv cards (bus1) */ #define I2C_HW_B_AU0828 0x010023 /* auvitek au0828 usb bridge */ +#define I2C_HW_B_HDPVR 0x010025 /* Hauppauge HD PVR */ /* --- SGI adapters */ #define I2C_HW_SGI_VINO 0x160000 -- cgit v1.2.3-71-gd317 From 
14a19c0a2254ba58ed7559e072456ab94c9a2d3c Mon Sep 17 00:00:00 2001 From: Theodore Kilgore Date: Wed, 25 Mar 2009 07:13:13 -0300 Subject: V4L/DVB (11213): gspca - sq905c: New subdriver. The code in the new sq905c.c is based upon the structure of the code in gspca/sq905.c, and upon the code in libgphoto2/camlibs/digigr8, which supports the same set of cameras in stillcam mode. I am a co-author of gspca/sq905.c and I am the sole author of libgphoto2/camlibs/digigr8, which is licensed under the LGPL. I hereby give myself permission to use my own code from libgphoto2 in gspca/sq905c.c. Signed-off-by: Theodore Kilgore Signed-off-by: Jean-Francois Moine Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/gspca/Kconfig | 9 + drivers/media/video/gspca/Makefile | 2 + drivers/media/video/gspca/sq905c.c | 328 +++++++++++++++++++++++++++++++++++++ include/linux/videodev2.h | 1 + 4 files changed, 340 insertions(+) create mode 100644 drivers/media/video/gspca/sq905c.c (limited to 'include/linux') diff --git a/drivers/media/video/gspca/Kconfig b/drivers/media/video/gspca/Kconfig index a0f05ef5ca70..578dc4ffc965 100644 --- a/drivers/media/video/gspca/Kconfig +++ b/drivers/media/video/gspca/Kconfig @@ -185,6 +185,15 @@ config USB_GSPCA_SQ905 To compile this driver as a module, choose M here: the module will be called gspca_sq905. +config USB_GSPCA_SQ905C + tristate "SQ Technologies SQ905C based USB Camera Driver" + depends on VIDEO_V4L2 && USB_GSPCA + help + Say Y here if you want support for cameras based on the SQ905C chip. + + To compile this driver as a module, choose M here: the + module will be called gspca_sq905c. + config USB_GSPCA_STK014 tristate "Syntek DV4000 (STK014) USB Camera Driver" depends on VIDEO_V4L2 && USB_GSPCA diff --git a/drivers/media/video/gspca/Makefile b/drivers/media/video/gspca/Makefile index b6ec61185736..8a6643e8eb96 100644 --- a/drivers/media/video/gspca/Makefile +++ b/drivers/media/video/gspca/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_USB_GSPCA_SPCA506) += gspca_spca506.o obj-$(CONFIG_USB_GSPCA_SPCA508) += gspca_spca508.o obj-$(CONFIG_USB_GSPCA_SPCA561) += gspca_spca561.o obj-$(CONFIG_USB_GSPCA_SQ905) += gspca_sq905.o +obj-$(CONFIG_USB_GSPCA_SQ905C) += gspca_sq905c.o obj-$(CONFIG_USB_GSPCA_SUNPLUS) += gspca_sunplus.o obj-$(CONFIG_USB_GSPCA_STK014) += gspca_stk014.o obj-$(CONFIG_USB_GSPCA_T613) += gspca_t613.o @@ -43,6 +44,7 @@ gspca_spca506-objs := spca506.o gspca_spca508-objs := spca508.o gspca_spca561-objs := spca561.o gspca_sq905-objs := sq905.o +gspca_sq905c-objs := sq905c.o gspca_stk014-objs := stk014.o gspca_sunplus-objs := sunplus.o gspca_t613-objs := t613.o diff --git a/drivers/media/video/gspca/sq905c.c b/drivers/media/video/gspca/sq905c.c new file mode 100644 index 000000000000..0bcb74a1b143 --- /dev/null +++ b/drivers/media/video/gspca/sq905c.c @@ -0,0 +1,328 @@ +/* + * SQ905C subdriver + * + * Copyright (C) 2009 Theodore Kilgore + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * + * This driver uses work done in + * libgphoto2/camlibs/digigr8, Copyright (C) Theodore Kilgore. + * + * This driver has also used as a base the sq905c driver + * and may contain code fragments from it. + */ + +#define MODULE_NAME "sq905c" + +#include +#include "gspca.h" + +MODULE_AUTHOR("Theodore Kilgore "); +MODULE_DESCRIPTION("GSPCA/SQ905C USB Camera Driver"); +MODULE_LICENSE("GPL"); + +/* Default timeouts, in ms */ +#define SQ905C_CMD_TIMEOUT 500 +#define SQ905C_DATA_TIMEOUT 1000 + +/* Maximum transfer size to use. */ +#define SQ905C_MAX_TRANSFER 0x8000 + +#define FRAME_HEADER_LEN 0x50 + +/* Commands. These go in the "value" slot. */ +#define SQ905C_CLEAR 0xa0 /* clear everything */ +#define SQ905C_CAPTURE_LOW 0xa040 /* Starts capture at 160x120 */ +#define SQ905C_CAPTURE_MED 0x1440 /* Starts capture at 320x240 */ +#define SQ905C_CAPTURE_HI 0x2840 /* Starts capture at 320x240 */ + +/* For capture, this must go in the "index" slot. */ +#define SQ905C_CAPTURE_INDEX 0x110f + +/* Structure to hold all of our device specific stuff */ +struct sd { + struct gspca_dev gspca_dev; /* !! must be the first item */ + const struct v4l2_pix_format *cap_mode; + /* Driver stuff */ + struct work_struct work_struct; + struct workqueue_struct *work_thread; +}; + +/* + * Most of these cameras will do 640x480 and 320x240. 160x120 works + * in theory but gives very poor output. Therefore, not supported. + * The 0x2770:0x9050 cameras have max resolution of 320x240. + */ +static struct v4l2_pix_format sq905c_mode[] = { + { 320, 240, V4L2_PIX_FMT_SQ905C, V4L2_FIELD_NONE, + .bytesperline = 320, + .sizeimage = 320 * 240, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 0}, + { 640, 480, V4L2_PIX_FMT_SQ905C, V4L2_FIELD_NONE, + .bytesperline = 640, + .sizeimage = 640 * 480, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 0} +}; + +/* Send a command to the camera. */ +static int sq905c_command(struct gspca_dev *gspca_dev, u16 command, u16 index) +{ + int ret; + + ret = usb_control_msg(gspca_dev->dev, + usb_sndctrlpipe(gspca_dev->dev, 0), + USB_REQ_SYNCH_FRAME, /* request */ + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, + command, index, NULL, 0, + SQ905C_CMD_TIMEOUT); + if (ret < 0) { + PDEBUG(D_ERR, "%s: usb_control_msg failed (%d)", + __func__, ret); + return ret; + } + + return 0; +} + +/* This function is called as a workqueue function and runs whenever the camera + * is streaming data. Because it is a workqueue function it is allowed to sleep + * so we can use synchronous USB calls. To avoid possible collisions with other + * threads attempting to use the camera's USB interface the gspca usb_lock is + * used when performing the one USB control operation inside the workqueue, + * which tells the camera to close the stream. In practice the only thing + * which needs to be protected against is the usb_set_interface call that + * gspca makes during stream_off. Otherwise the camera doesn't provide any + * controls that the user could try to change. + */ +static void sq905c_dostream(struct work_struct *work) +{ + struct sd *dev = container_of(work, struct sd, work_struct); + struct gspca_dev *gspca_dev = &dev->gspca_dev; + struct gspca_frame *frame; + int bytes_left; /* bytes remaining in current frame. */ + int data_len; /* size to use for the next read. 
*/ + int act_len; + int discarding = 0; /* true if we failed to get space for frame. */ + int packet_type; + int ret; + u8 *buffer; + + buffer = kmalloc(SQ905C_MAX_TRANSFER, GFP_KERNEL | GFP_DMA); + if (!buffer) { + PDEBUG(D_ERR, "Couldn't allocate USB buffer"); + goto quit_stream; + } + + while (gspca_dev->present && gspca_dev->streaming) { + if (!gspca_dev->present) + goto quit_stream; + /* Request the header, which tells the size to download */ + ret = usb_bulk_msg(gspca_dev->dev, + usb_rcvbulkpipe(gspca_dev->dev, 0x81), + buffer, FRAME_HEADER_LEN, &act_len, + SQ905C_DATA_TIMEOUT); + PDEBUG(D_STREAM, + "Got %d bytes out of %d for header", + act_len, FRAME_HEADER_LEN); + if (ret < 0 || act_len < FRAME_HEADER_LEN) + goto quit_stream; + /* size is read from 4 bytes starting 0x40, little endian */ + bytes_left = buffer[0x40]|(buffer[0x41]<<8)|(buffer[0x42]<<16) + |(buffer[0x43]<<24); + PDEBUG(D_STREAM, "bytes_left = 0x%x", bytes_left); + /* We keep the header. It has other information, too. */ + packet_type = FIRST_PACKET; + frame = gspca_get_i_frame(gspca_dev); + if (frame && !discarding) { + gspca_frame_add(gspca_dev, packet_type, + frame, buffer, FRAME_HEADER_LEN); + } else + discarding = 1; + while (bytes_left > 0) { + data_len = bytes_left > SQ905C_MAX_TRANSFER ? + SQ905C_MAX_TRANSFER : bytes_left; + if (!gspca_dev->present) + goto quit_stream; + ret = usb_bulk_msg(gspca_dev->dev, + usb_rcvbulkpipe(gspca_dev->dev, 0x81), + buffer, data_len, &act_len, + SQ905C_DATA_TIMEOUT); + if (ret < 0 || act_len < data_len) + goto quit_stream; + PDEBUG(D_STREAM, + "Got %d bytes out of %d for frame", + data_len, bytes_left); + bytes_left -= data_len; + if (bytes_left == 0) + packet_type = LAST_PACKET; + else + packet_type = INTER_PACKET; + frame = gspca_get_i_frame(gspca_dev); + if (frame && !discarding) + gspca_frame_add(gspca_dev, packet_type, + frame, buffer, data_len); + else + discarding = 1; + } + } +quit_stream: + mutex_lock(&gspca_dev->usb_lock); + if (gspca_dev->present) + sq905c_command(gspca_dev, SQ905C_CLEAR, 0); + mutex_unlock(&gspca_dev->usb_lock); + kfree(buffer); +} + +/* This function is called at probe time just before sd_init */ +static int sd_config(struct gspca_dev *gspca_dev, + const struct usb_device_id *id) +{ + struct cam *cam = &gspca_dev->cam; + struct sd *dev = (struct sd *) gspca_dev; + + PDEBUG(D_PROBE, + "SQ9050 camera detected" + " (vid/pid 0x%04X:0x%04X)", id->idVendor, id->idProduct); + cam->cam_mode = sq905c_mode; + cam->nmodes = 2; + if (id->idProduct == 0x9050) + cam->nmodes = 1; + /* We don't use the buffer gspca allocates so make it small. */ + cam->bulk_size = 32; + INIT_WORK(&dev->work_struct, sq905c_dostream); + return 0; +} + +/* called on streamoff with alt==0 and on disconnect */ +/* the usb_lock is held at entry - restore on exit */ +static void sd_stop0(struct gspca_dev *gspca_dev) +{ + struct sd *dev = (struct sd *) gspca_dev; + + /* wait for the work queue to terminate */ + mutex_unlock(&gspca_dev->usb_lock); + /* This waits for sq905c_dostream to finish */ + destroy_workqueue(dev->work_thread); + dev->work_thread = NULL; + mutex_lock(&gspca_dev->usb_lock); +} + +/* this function is called at probe and resume time */ +static int sd_init(struct gspca_dev *gspca_dev) +{ + int ret; + + /* connect to the camera and reset it. */ + ret = sq905c_command(gspca_dev, SQ905C_CLEAR, 0); + return ret; +} + +/* Set up for getting frames. 
*/ +static int sd_start(struct gspca_dev *gspca_dev) +{ + struct sd *dev = (struct sd *) gspca_dev; + int ret; + + dev->cap_mode = gspca_dev->cam.cam_mode; + /* "Open the shutter" and set size, to start capture */ + switch (gspca_dev->width) { + case 640: + PDEBUG(D_STREAM, "Start streaming at high resolution"); + dev->cap_mode++; + ret = sq905c_command(gspca_dev, SQ905C_CAPTURE_HI, + SQ905C_CAPTURE_INDEX); + break; + default: /* 320 */ + PDEBUG(D_STREAM, "Start streaming at medium resolution"); + ret = sq905c_command(gspca_dev, SQ905C_CAPTURE_MED, + SQ905C_CAPTURE_INDEX); + } + + if (ret < 0) { + PDEBUG(D_ERR, "Start streaming command failed"); + return ret; + } + /* Start the workqueue function to do the streaming */ + dev->work_thread = create_singlethread_workqueue(MODULE_NAME); + queue_work(dev->work_thread, &dev->work_struct); + + return 0; +} + +/* Table of supported USB devices */ +static const __devinitdata struct usb_device_id device_table[] = { + {USB_DEVICE(0x2770, 0x905c)}, + {USB_DEVICE(0x2770, 0x9050)}, + {USB_DEVICE(0x2770, 0x913d)}, + {} +}; + +MODULE_DEVICE_TABLE(usb, device_table); + +/* sub-driver description */ +static const struct sd_desc sd_desc = { + .name = MODULE_NAME, + .config = sd_config, + .init = sd_init, + .start = sd_start, + .stop0 = sd_stop0, +}; + +/* -- device connect -- */ +static int sd_probe(struct usb_interface *intf, + const struct usb_device_id *id) +{ + return gspca_dev_probe(intf, id, + &sd_desc, + sizeof(struct sd), + THIS_MODULE); +} + +static struct usb_driver sd_driver = { + .name = MODULE_NAME, + .id_table = device_table, + .probe = sd_probe, + .disconnect = gspca_disconnect, +#ifdef CONFIG_PM + .suspend = gspca_suspend, + .resume = gspca_resume, +#endif +}; + +/* -- module insert / remove -- */ +static int __init sd_mod_init(void) +{ + int ret; + + ret = usb_register(&sd_driver); + if (ret < 0) + return ret; + PDEBUG(D_PROBE, "registered"); + return 0; +} + +static void __exit sd_mod_exit(void) +{ + usb_deregister(&sd_driver); + PDEBUG(D_PROBE, "deregistered"); +} + +module_init(sd_mod_init); +module_exit(sd_mod_exit); diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 61f1a4921afd..139d234923cd 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -345,6 +345,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_SPCA561 v4l2_fourcc('S', '5', '6', '1') /* compressed GBRG bayer */ #define V4L2_PIX_FMT_PAC207 v4l2_fourcc('P', '2', '0', '7') /* compressed BGGR bayer */ #define V4L2_PIX_FMT_MR97310A v4l2_fourcc('M', '3', '1', '0') /* compressed BGGR bayer */ +#define V4L2_PIX_FMT_SQ905C v4l2_fourcc('9', '0', '5', 'C') /* compressed RGGB bayer */ #define V4L2_PIX_FMT_PJPG v4l2_fourcc('P', 'J', 'P', 'G') /* Pixart 73xx JPEG */ #define V4L2_PIX_FMT_YVYU v4l2_fourcc('Y', 'V', 'Y', 'U') /* 16 YVU 4:2:2 */ -- cgit v1.2.3-71-gd317 From 702d21c6f6c790b12c4820cd2f29bc8472aed633 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:16 -0400 Subject: reiserfs: add support for mount count incrementing The following patch adds the fields for tracking mount counts and last fsck timestamps to the superblock. It also increments the mount count on every read-write mount. Reiserfsprogs 3.6.21 added support for these fields. 
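To make the intended use of the new fields concrete, the sketch below (not part of this patch) shows how a checking tool might consume them; the struct layout mirrors the on-disk additions, while the helper name fsck_is_due and its policy are illustrative assumptions only.

#include <stdint.h>
#include <time.h>

/* Illustrative mirror of the new superblock fields, already
 * converted from their little-endian on-disk representation. */
struct sb_check_info {
	uint16_t mnt_count;      /* mounts since last fsck      */
	uint16_t max_mnt_count;  /* maximum mounts before check */
	uint32_t lastcheck;      /* timestamp of last fsck      */
	uint32_t check_interval; /* seconds between checks      */
};

/* Hypothetical policy helper: nonzero when a check is due. */
static int fsck_is_due(const struct sb_check_info *sb, time_t now)
{
	if (sb->max_mnt_count && sb->mnt_count > sb->max_mnt_count)
		return 1;
	if (sb->check_interval &&
	    (uint32_t)now - sb->lastcheck > sb->check_interval)
		return 1;
	return 0;
}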
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 6 +++++- include/linux/reiserfs_fs.h | 6 +++++- include/linux/reiserfs_fs_sb.h | 3 +++ 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f3c820b75829..4ad40afe54e1 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1280,6 +1280,8 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); s->s_flags &= ~MS_RDONLY; set_sb_umount_state(rs, REISERFS_ERROR_FS); + if (!old_format_only(s)) + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; @@ -1819,7 +1821,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) } else if (!silent) { reiserfs_info(s, "using 3.5.x disk format\n"); } - } + } else + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); errval = journal_end(&th, s, 1); diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index bc5114d35e99..ab748a03fe97 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -171,7 +171,11 @@ struct reiserfs_super_block { __le32 s_flags; /* Right now used only by inode-attributes, if enabled */ unsigned char s_uuid[16]; /* filesystem unique identifier */ unsigned char s_label[16]; /* filesystem volume label */ - char s_unused[88]; /* zero filled by mkreiserfs and + __le16 s_mnt_count; /* Count of mounts since last fsck */ + __le16 s_max_mnt_count; /* Maximum mounts before check */ + __le32 s_lastcheck; /* Timestamp of last fsck */ + __le32 s_check_interval; /* Interval between checks */ + char s_unused[76]; /* zero filled by mkreiserfs and * reiserfs_convert_objectid_map_v1() * so any additions must be updated * there as well. */ diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index bda6b562a1e0..ccd38f351530 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -73,6 +73,9 @@ typedef enum { #define sb_version(sbp) (le16_to_cpu((sbp)->s_v1.s_version)) #define set_sb_version(sbp,v) ((sbp)->s_v1.s_version = cpu_to_le16(v)) +#define sb_mnt_count(sbp) (le16_to_cpu((sbp)->s_mnt_count)) +#define set_sb_mnt_count(sbp, v) ((sbp)->s_mnt_count = cpu_to_le16(v)) + #define sb_reserved_for_journal(sbp) \ (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal)) #define set_sb_reserved_for_journal(sbp,v) \ -- cgit v1.2.3-71-gd317 From 600ed41675d8c384519d8f0b3c76afed39ef2f4b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:17 -0400 Subject: reiserfs: audit transaction ids to always be unsigned ints This patch fixes up the reiserfs code such that transaction ids are always unsigned ints. In places they can currently be signed ints or unsigned longs. The former just causes an annoying clm-2200 warning and may join a transaction when it should wait. The latter is just for correctness since the disk format uses a 32-bit transaction id. There aren't any runtime problems that result from it not wrapping at the correct location since the value is truncated correctly even on big endian systems. The 0 value might make it to disk, but the mount-time checks will bump it to 10 itself. 
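The wrap behaviour described above can be seen in a standalone sketch (not code from this patch): a consecutive-id sanity check of the clm-2200 kind only stays true across the 32-bit wrap when the in-core counter has the same width as the on-disk id.

#include <stdio.h>

int main(void)
{
	unsigned int last32 = 0xffffffffu;  /* last committed id, at the wrap  */
	unsigned int next32 = last32 + 1;   /* wraps to 0, like the disk format */
	unsigned long last64 = 0xffffffffUL;
	unsigned long next64 = 0;           /* truncated value read back from disk */

	/* unsigned int: the delta is exactly 1 across the wrap */
	printf("u32 delta:  %u\n", next32 - last32);

	/* unsigned long on a 64-bit machine: the delta is
	 * 0xffffffff00000001, so a "!= 1" check would warn even
	 * though the transactions really are consecutive */
	printf("long delta: %lu\n", next64 - last64);
	return 0;
}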
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 46 +++++++++++++++++++++--------------------- fs/reiserfs/procfs.c | 4 ++-- include/linux/reiserfs_fs.h | 2 +- include/linux/reiserfs_fs_i.h | 2 +- include/linux/reiserfs_fs_sb.h | 8 ++++---- 5 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9643c3bbeb3b..677bb926e7d6 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -574,7 +574,7 @@ static inline void put_journal_list(struct super_block *s, struct reiserfs_journal_list *jl) { if (jl->j_refcount < 1) { - reiserfs_panic(s, "trans id %lu, refcount at %d", + reiserfs_panic(s, "trans id %u, refcount at %d", jl->j_trans_id, jl->j_refcount); } if (--jl->j_refcount == 0) @@ -599,7 +599,7 @@ static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, } static int journal_list_still_alive(struct super_block *s, - unsigned long trans_id) + unsigned int trans_id) { struct reiserfs_journal *journal = SB_JOURNAL(s); struct list_head *entry = &journal->j_journal_list; @@ -933,9 +933,9 @@ static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *other_jl; struct reiserfs_journal_list *first_jl; struct list_head *entry; - unsigned long trans_id = jl->j_trans_id; - unsigned long other_trans_id; - unsigned long first_trans_id; + unsigned int trans_id = jl->j_trans_id; + unsigned int other_trans_id; + unsigned int first_trans_id; find_first: /* @@ -1014,7 +1014,7 @@ static int flush_commit_list(struct super_block *s, int i; b_blocknr_t bn; struct buffer_head *tbh = NULL; - unsigned long trans_id = jl->j_trans_id; + unsigned int trans_id = jl->j_trans_id; struct reiserfs_journal *journal = SB_JOURNAL(s); int barrier = 0; int retval = 0; @@ -1275,7 +1275,7 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb, */ static int _update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, - unsigned long trans_id) + unsigned int trans_id) { struct reiserfs_journal_header *jh; struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); @@ -1329,7 +1329,7 @@ static int _update_journal_header_block(struct super_block *p_s_sb, static int update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, - unsigned long trans_id) + unsigned int trans_id) { return _update_journal_header_block(p_s_sb, offset, trans_id); } @@ -1344,7 +1344,7 @@ static int flush_older_journal_lists(struct super_block *p_s_sb, struct list_head *entry; struct reiserfs_journal_list *other_jl; struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - unsigned long trans_id = jl->j_trans_id; + unsigned int trans_id = jl->j_trans_id; /* we know we are the only ones flushing things, no extra race * protection is required. 
@@ -1758,13 +1758,13 @@ static int dirty_one_transaction(struct super_block *s, static int kupdate_transactions(struct super_block *s, struct reiserfs_journal_list *jl, struct reiserfs_journal_list **next_jl, - unsigned long *next_trans_id, + unsigned int *next_trans_id, int num_blocks, int num_trans) { int ret = 0; int written = 0; int transactions_flushed = 0; - unsigned long orig_trans_id = jl->j_trans_id; + unsigned int orig_trans_id = jl->j_trans_id; struct buffer_chunk chunk; struct list_head *entry; struct reiserfs_journal *journal = SB_JOURNAL(s); @@ -1833,7 +1833,7 @@ static int flush_used_journal_lists(struct super_block *s, int limit = 256; struct reiserfs_journal_list *tjl; struct reiserfs_journal_list *flush_jl; - unsigned long trans_id; + unsigned int trans_id; struct reiserfs_journal *journal = SB_JOURNAL(s); flush_jl = tjl = jl; @@ -2023,7 +2023,7 @@ static int journal_compare_desc_commit(struct super_block *p_s_sb, */ static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffer_head *d_bh, - unsigned long *oldest_invalid_trans_id, + unsigned int *oldest_invalid_trans_id, unsigned long *newest_mount_id) { struct reiserfs_journal_desc *desc; @@ -2124,18 +2124,18 @@ static void brelse_array(struct buffer_head **heads, int num) static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cur_dblock, unsigned long oldest_start, - unsigned long oldest_trans_id, + unsigned int oldest_trans_id, unsigned long newest_mount_id) { struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); struct reiserfs_journal_desc *desc; struct reiserfs_journal_commit *commit; - unsigned long trans_id = 0; + unsigned int trans_id = 0; struct buffer_head *c_bh; struct buffer_head *d_bh; struct buffer_head **log_blocks = NULL; struct buffer_head **real_blocks = NULL; - unsigned long trans_offset; + unsigned int trans_offset; int i; int trans_half; @@ -2356,8 +2356,8 @@ static int journal_read(struct super_block *p_s_sb) { struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); struct reiserfs_journal_desc *desc; - unsigned long oldest_trans_id = 0; - unsigned long oldest_invalid_trans_id = 0; + unsigned int oldest_trans_id = 0; + unsigned int oldest_invalid_trans_id = 0; time_t start; unsigned long oldest_start = 0; unsigned long cur_dblock = 0; @@ -2970,7 +2970,7 @@ static void wake_queued_writers(struct super_block *s) wake_up(&journal->j_join_wait); } -static void let_transaction_grow(struct super_block *sb, unsigned long trans_id) +static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) { struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned long bcount = journal->j_bcount; @@ -3001,7 +3001,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, int join) { time_t now = get_seconds(); - int old_trans_id; + unsigned int old_trans_id; struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); struct reiserfs_transaction_handle myth; int sched_count = 0; @@ -3824,7 +3824,7 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id, int reiserfs_commit_for_inode(struct inode *inode) { - unsigned long id = REISERFS_I(inode)->i_trans_id; + unsigned int id = REISERFS_I(inode)->i_trans_id; struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; /* for the whole inode, assume unset id means it was @@ -3938,7 +3938,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct reiserfs_journal_list *jl, *temp_jl; struct list_head *entry, *safe; unsigned long jindex; - unsigned long commit_trans_id; + 
unsigned int commit_trans_id; int trans_half; BUG_ON(th->t_refcount > 1); @@ -3946,7 +3946,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, /* protect flush_older_commits from doing mistakes if the transaction ID counter gets overflowed. */ - if (th->t_trans_id == ~0UL) + if (th->t_trans_id == ~0U) flags |= FLUSH_ALL | COMMIT_NOW | WAIT; flush = flags & FLUSH_ALL; wait_on_commit = flags & WAIT; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 37173fa07d15..370988efc8ad 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -321,7 +321,7 @@ static int show_journal(struct seq_file *m, struct super_block *sb) /* incore fields */ "j_1st_reserved_block: \t%i\n" "j_state: \t%li\n" - "j_trans_id: \t%lu\n" + "j_trans_id: \t%u\n" "j_mount_id: \t%lu\n" "j_start: \t%lu\n" "j_len: \t%lu\n" @@ -329,7 +329,7 @@ static int show_journal(struct seq_file *m, struct super_block *sb) "j_wcount: \t%i\n" "j_bcount: \t%lu\n" "j_first_unflushed_offset: \t%lu\n" - "j_last_flush_trans_id: \t%lu\n" + "j_last_flush_trans_id: \t%u\n" "j_trans_start_time: \t%li\n" "j_list_bitmap_index: \t%i\n" "j_must_wait: \t%i\n" diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index ab748a03fe97..bd52b949f8c9 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1676,7 +1676,7 @@ struct reiserfs_transaction_handle { int t_refcount; int t_blocks_logged; /* number of blocks this writer has logged */ int t_blocks_allocated; /* number of blocks this writer allocated */ - unsigned long t_trans_id; /* sanity check, equals the current trans id */ + unsigned int t_trans_id; /* sanity check, equals the current trans id */ void *t_handle_save; /* save existing current->journal_info */ unsigned displace_new_blocks:1; /* if new block allocation occurres, that block should be displaced from others */ diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h index ce3663fb0101..201dd910b042 100644 --- a/include/linux/reiserfs_fs_i.h +++ b/include/linux/reiserfs_fs_i.h @@ -51,7 +51,7 @@ struct reiserfs_inode_info { /* we use these for fsync or O_SYNC to decide which transaction ** needs to be committed in order for this inode to be properly ** flushed */ - unsigned long i_trans_id; + unsigned int i_trans_id; struct reiserfs_journal_list *i_jl; struct mutex i_mmap; #ifdef CONFIG_REISERFS_FS_POSIX_ACL diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index ccd38f351530..12fc2a0d13be 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -156,7 +156,7 @@ struct reiserfs_journal_list { atomic_t j_commit_left; atomic_t j_older_commits_done; /* all commits older than this on disk */ struct mutex j_commit_mutex; - unsigned long j_trans_id; + unsigned int j_trans_id; time_t j_timestamp; struct reiserfs_list_bitmap *j_list_bitmap; struct buffer_head *j_commit_bh; /* commit buffer head */ @@ -185,7 +185,7 @@ struct reiserfs_journal { int j_1st_reserved_block; /* first block on s_dev of reserved area journal */ unsigned long j_state; - unsigned long j_trans_id; + unsigned int j_trans_id; unsigned long j_mount_id; unsigned long j_start; /* start of current waiting commit (index into j_ap_blocks) */ unsigned long j_len; /* length of current waiting commit */ @@ -226,10 +226,10 @@ struct reiserfs_journal { int j_num_work_lists; /* number that need attention from kreiserfsd */ /* debugging to make sure things are flushed in order */ - int j_last_flush_id; + unsigned int j_last_flush_id; /* debugging 
to make sure things are committed in order */ - int j_last_commit_id; + unsigned int j_last_commit_id; struct list_head j_bitmap_nodes; struct list_head j_dirty_buffers; -- cgit v1.2.3-71-gd317 From eba00305591714f1d85ccad1afbf58259c2197b4 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:18 -0400 Subject: reiserfs: use buffer_info for leaf_paste_entries This patch makes leaf_paste_entries more consistent with respect to the other leaf operations. Using buffer_info instead of buffer_head directly allows us to get a superblock pointer for use in error handling. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/do_balan.c | 17 +++++++---------- fs/reiserfs/lbalance.c | 5 +++-- include/linux/reiserfs_fs.h | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 2f87f5b14630..99f80538c4bf 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -449,8 +449,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h /* when we have merge directory item, pos_in_item has been changed too */ /* paste new directory entry. 1 is entry number */ - leaf_paste_entries(bi. - bi_bh, + leaf_paste_entries(&bi, n + item_pos - @@ -699,7 +698,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h n + item_pos - ret_val); if (is_direntry_le_ih(pasted)) - leaf_paste_entries(bi.bi_bh, + leaf_paste_entries(&bi, n + item_pos - ret_val, @@ -894,8 +893,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h tb->insert_size[0], body, zeros_num); /* paste entry */ - leaf_paste_entries(bi. - bi_bh, + leaf_paste_entries(&bi, 0, paste_entry_position, 1, @@ -1096,7 +1094,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h tb->rnum[0]); if (is_direntry_le_ih(pasted) && pos_in_item >= 0) { - leaf_paste_entries(bi.bi_bh, + leaf_paste_entries(&bi, item_pos - n + tb->rnum[0], @@ -1339,8 +1337,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h tb->insert_size[0], body, zeros_num); /* paste new directory entry */ - leaf_paste_entries(bi. - bi_bh, + leaf_paste_entries(&bi, 0, pos_in_item - @@ -1505,7 +1502,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h item_pos - n + snum[i]); if (is_direntry_le_ih(pasted)) { - leaf_paste_entries(bi.bi_bh, + leaf_paste_entries(&bi, item_pos - n + snum[i], pos_in_item, @@ -1606,7 +1603,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h zeros_num); /* paste entry */ - leaf_paste_entries(bi.bi_bh, + leaf_paste_entries(&bi, item_pos, pos_in_item, 1, diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 6de060a6aa7f..41bdd8c75887 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -111,7 +111,7 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, item_num_in_dest = (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0; - leaf_paste_entries(dest_bi->bi_bh, item_num_in_dest, + leaf_paste_entries(dest_bi, item_num_in_dest, (last_first == FIRST_TO_LAST) ? 
I_ENTRY_COUNT(B_N_PITEM_HEAD(dest, item_num_in_dest)) @@ -1191,7 +1191,7 @@ static void leaf_delete_items_entirely(struct buffer_info *bi, } /* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ -void leaf_paste_entries(struct buffer_head *bh, +void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, int new_entry_count, @@ -1203,6 +1203,7 @@ void leaf_paste_entries(struct buffer_head *bh, struct reiserfs_de_head *deh; char *insert_point; int i, old_entry_num; + struct buffer_head *bh = bi->bi_bh; if (new_entry_count == 0) return; diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index bd52b949f8c9..65bb5e3e3abe 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -2026,7 +2026,7 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num, int zeros_number); void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, int pos_in_item, int cut_size); -void leaf_paste_entries(struct buffer_head *bh, int item_num, int before, +void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, int new_entry_count, struct reiserfs_de_head *new_dehs, const char *records, int paste_size); /* ibalance.c */ -- cgit v1.2.3-71-gd317 From 45b03d5e8e674eb6555b767e1c8eb40b671ff892 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:21 -0400 Subject: reiserfs: rework reiserfs_warning ReiserFS warnings can be somewhat inconsistent. In some cases: * a unique identifier may be associated with it * the function name may be included * the device may be printed separately This patch aims to make warnings more consistent. reiserfs_warning() prints the device name, so printing it a second time is not required. The function name for a warning is always helpful in debugging, so it is now automatically inserted into the output. Hans has stated that every warning should have a unique identifier. Some cases lack them, others really shouldn't have them. reiserfs_warning() now expects an id associated with each message. In the rare case where one isn't needed, "" will suffice. 
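The resulting calling convention, reiserfs_warning(sb, "unique-id", fmt, ...), is visible throughout the diff below. As a rough sketch of how the function name can be inserted automatically (the real helper lives in fs/reiserfs/prints.c and is not shown here; demo_warning and __demo_warning are made-up names for illustration), a wrapper macro can capture __func__ at the call site:

#include <stdarg.h>
#include <stdio.h>

/* Hypothetical stand-in for the in-kernel printing helper. */
static void __demo_warning(const char *dev, const char *func,
			   const char *id, const char *fmt, ...)
{
	va_list args;

	/* device name printed once here, so callers never repeat it */
	printf("REISERFS warning (device %s): %s %s: ", dev, id, func);
	va_start(args, fmt);
	vprintf(fmt, args);
	va_end(args);
	putchar('\n');
}

/* The macro inserts __func__ on behalf of every call site. */
#define demo_warning(dev, id, fmt, ...) \
	__demo_warning(dev, __func__, id, fmt, ##__VA_ARGS__)

int main(void)
{
	demo_warning("sda1", "vs-4010",
		     "block number is out of range %lu (%u)", 42UL, 100U);
	return 0;
}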
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/bitmap.c | 52 ++++----- fs/reiserfs/do_balan.c | 40 +++---- fs/reiserfs/file.c | 2 +- fs/reiserfs/fix_node.c | 14 +-- fs/reiserfs/inode.c | 60 +++++----- fs/reiserfs/item_ops.c | 60 +++++----- fs/reiserfs/journal.c | 174 +++++++++++++++------------- fs/reiserfs/lbalance.c | 12 +- fs/reiserfs/namei.c | 45 ++++---- fs/reiserfs/objectid.c | 5 +- fs/reiserfs/prints.c | 11 +- fs/reiserfs/procfs.c | 5 +- fs/reiserfs/stree.c | 107 ++++++++---------- fs/reiserfs/super.c | 257 ++++++++++++++++++++++-------------------- fs/reiserfs/tail_conversion.c | 6 +- fs/reiserfs/xattr.c | 21 ++-- include/linux/reiserfs_fs.h | 9 +- 17 files changed, 454 insertions(+), 426 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 98b92a3da14a..51b116103041 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -64,8 +64,8 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) unsigned int bmap_count = reiserfs_bmap_count(s); if (block == 0 || block >= SB_BLOCK_COUNT(s)) { - reiserfs_warning(s, - "vs-4010: is_reusable: block number is out of range %lu (%u)", + reiserfs_warning(s, "vs-4010", + "block number is out of range %lu (%u)", block, SB_BLOCK_COUNT(s)); return 0; } @@ -79,30 +79,29 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; if (block >= bmap1 && block <= bmap1 + bmap_count) { - reiserfs_warning(s, "vs: 4019: is_reusable: " - "bitmap block %lu(%u) can't be freed or reused", + reiserfs_warning(s, "vs-4019", "bitmap block %lu(%u) " + "can't be freed or reused", block, bmap_count); return 0; } } else { if (offset == 0) { - reiserfs_warning(s, "vs: 4020: is_reusable: " - "bitmap block %lu(%u) can't be freed or reused", + reiserfs_warning(s, "vs-4020", "bitmap block %lu(%u) " + "can't be freed or reused", block, bmap_count); return 0; } } if (bmap >= bmap_count) { - reiserfs_warning(s, - "vs-4030: is_reusable: there is no so many bitmap blocks: " - "block=%lu, bitmap_nr=%u", block, bmap); + reiserfs_warning(s, "vs-4030", "bitmap for requested block " + "is out of range: block=%lu, bitmap_nr=%u", + block, bmap); return 0; } if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) { - reiserfs_warning(s, - "vs-4050: is_reusable: this is root block (%u), " + reiserfs_warning(s, "vs-4050", "this is root block (%u), " "it must be busy", SB_ROOT_BLOCK(s)); return 0; } @@ -154,8 +153,8 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, /* - I mean `a window of zero bits' as in description of this function - Zam. 
*/ if (!bi) { - reiserfs_warning(s, "NULL bitmap info pointer for bitmap %d", - bmap_n); + reiserfs_warning(s, "jdm-4055", "NULL bitmap info pointer " + "for bitmap %d", bmap_n); return 0; } @@ -400,11 +399,8 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, get_bit_address(s, block, &nr, &offset); if (nr >= reiserfs_bmap_count(s)) { - reiserfs_warning(s, "vs-4075: reiserfs_free_block: " - "block %lu is out of range on %s " - "(nr=%u,max=%u)", block, - reiserfs_bdevname(s), nr, - reiserfs_bmap_count(s)); + reiserfs_warning(s, "vs-4075", "block %lu is out of range", + block); return; } @@ -416,9 +412,8 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, /* clear bit for the given block in bit map */ if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) { - reiserfs_warning(s, "vs-4080: reiserfs_free_block: " - "free_block (%s:%lu)[dev:blocknr]: bit already cleared", - reiserfs_bdevname(s), block); + reiserfs_warning(s, "vs-4080", + "block %lu: bit already cleared", block); } apbi[nr].free_count++; journal_mark_dirty(th, s, bmbh); @@ -477,9 +472,8 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); #ifdef CONFIG_REISERFS_CHECK if (ei->i_prealloc_count < 0) - reiserfs_warning(th->t_super, - "zam-4001:%s: inode has negative prealloc blocks count.", - __func__); + reiserfs_warning(th->t_super, "zam-4001", + "inode has negative prealloc blocks count."); #endif while (ei->i_prealloc_count > 0) { reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); @@ -515,9 +509,9 @@ void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) i_prealloc_list); #ifdef CONFIG_REISERFS_CHECK if (!ei->i_prealloc_count) { - reiserfs_warning(th->t_super, - "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.", - __func__); + reiserfs_warning(th->t_super, "zam-4001", + "inode is in prealloc list but has " + "no preallocated blocks."); } #endif __discard_prealloc(th, ei); @@ -631,8 +625,8 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options) continue; } - reiserfs_warning(s, "zam-4001: %s : unknown option - %s", - __func__, this_char); + reiserfs_warning(s, "zam-4001", "unknown option - %s", + this_char); return 1; } diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 99f80538c4bf..f701f37ddf98 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -1752,15 +1752,16 @@ static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) int i; if (buffer_dirty(bh)) - reiserfs_warning(tb->tb_sb, - "store_thrown deals with dirty buffer"); + reiserfs_warning(tb->tb_sb, "reiserfs-12320", + "called with dirty buffer"); for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) if (!tb->thrown[i]) { tb->thrown[i] = bh; get_bh(bh); /* free_thrown puts this */ return; } - reiserfs_warning(tb->tb_sb, "store_thrown: too many thrown buffers"); + reiserfs_warning(tb->tb_sb, "reiserfs-12321", + "too many thrown buffers"); } static void free_thrown(struct tree_balance *tb) @@ -1771,8 +1772,8 @@ static void free_thrown(struct tree_balance *tb) if (tb->thrown[i]) { blocknr = tb->thrown[i]->b_blocknr; if (buffer_dirty(tb->thrown[i])) - reiserfs_warning(tb->tb_sb, - "free_thrown deals with dirty buffer %d", + reiserfs_warning(tb->tb_sb, "reiserfs-12322", + "called with dirty buffer %d", blocknr); brelse(tb->thrown[i]); /* incremented in store_thrown */ reiserfs_free_block(tb->transaction_handle, NULL, @@ -1877,13 +1878,12 @@ static void 
check_internal_node(struct super_block *s, struct buffer_head *bh, } } -static int locked_or_not_in_tree(struct buffer_head *bh, char *which) +static int locked_or_not_in_tree(struct tree_balance *tb, + struct buffer_head *bh, char *which) { if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) || !B_IS_IN_TREE(bh)) { - reiserfs_warning(NULL, - "vs-12339: locked_or_not_in_tree: %s (%b)", - which, bh); + reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh); return 1; } return 0; @@ -1902,18 +1902,19 @@ static int check_before_balancing(struct tree_balance *tb) /* double check that buffers that we will modify are unlocked. (fix_nodes should already have prepped all of these for us). */ if (tb->lnum[0]) { - retval |= locked_or_not_in_tree(tb->L[0], "L[0]"); - retval |= locked_or_not_in_tree(tb->FL[0], "FL[0]"); - retval |= locked_or_not_in_tree(tb->CFL[0], "CFL[0]"); + retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]"); + retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]"); check_leaf(tb->L[0]); } if (tb->rnum[0]) { - retval |= locked_or_not_in_tree(tb->R[0], "R[0]"); - retval |= locked_or_not_in_tree(tb->FR[0], "FR[0]"); - retval |= locked_or_not_in_tree(tb->CFR[0], "CFR[0]"); + retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]"); + retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]"); check_leaf(tb->R[0]); } - retval |= locked_or_not_in_tree(PATH_PLAST_BUFFER(tb->tb_path), "S[0]"); + retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path), + "S[0]"); check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); return retval; @@ -1952,7 +1953,7 @@ static void check_after_balance_leaf(struct tree_balance *tb) PATH_H_POSITION(tb->tb_path, 1)))); print_cur_tb("12223"); - reiserfs_warning(tb->tb_sb, + reiserfs_warning(tb->tb_sb, "reiserfs-12363", "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", left, @@ -2104,9 +2105,8 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */ } /* if we have no real work to do */ if (!tb->insert_size[0]) { - reiserfs_warning(tb->tb_sb, - "PAP-12350: do_balance: insert_size == 0, mode == %c", - flag); + reiserfs_warning(tb->tb_sb, "PAP-12350", + "insert_size == 0, mode == %c", flag); unfix_nodes(tb); return; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 33408417038c..47bab8978be1 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -76,7 +76,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) * and let the admin know what is going on. 
*/ igrab(inode); - reiserfs_warning(inode->i_sb, + reiserfs_warning(inode->i_sb, "clm-9001", "pinning inode %lu because the " "preallocation can't be freed", inode->i_ino); diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 07d05e0842b7..59735a9e2349 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -496,8 +496,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, snum012[needed_nodes - 1 + 3] = units; if (needed_nodes > 2) - reiserfs_warning(tb->tb_sb, "vs-8111: get_num_ver: " - "split_item_position is out of boundary"); + reiserfs_warning(tb->tb_sb, "vs-8111", + "split_item_position is out of range"); snum012[needed_nodes - 1]++; split_item_positions[needed_nodes - 1] = i; needed_nodes++; @@ -533,8 +533,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) - reiserfs_warning(tb->tb_sb, "vs-8115: get_num_ver: not " - "directory or indirect item"); + reiserfs_warning(tb->tb_sb, "vs-8115", + "not directory or indirect item"); } /* now we know S2bytes, calculate S1bytes */ @@ -2268,9 +2268,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb) #ifdef CONFIG_REISERFS_CHECK repeat_counter++; if ((repeat_counter % 10000) == 0) { - reiserfs_warning(p_s_tb->tb_sb, - "wait_tb_buffers_until_released(): too many " - "iterations waiting for buffer to unlock " + reiserfs_warning(p_s_tb->tb_sb, "reiserfs-8200", + "too many iterations waiting " + "for buffer to unlock " "(%b)", locked); /* Don't loop forever. Try to recover from possible error. */ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 55fce92cdf18..95157762b1bf 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -842,7 +842,9 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (retval) { if (retval != -ENOSPC) reiserfs_warning(inode->i_sb, - "clm-6004: convert tail failed inode %lu, error %d", + "clm-6004", + "convert tail failed " + "inode %lu, error %d", inode->i_ino, retval); if (allocated_block_nr) { @@ -1006,8 +1008,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, goto failure; } if (retval == POSITION_FOUND) { - reiserfs_warning(inode->i_sb, - "vs-825: reiserfs_get_block: " + reiserfs_warning(inode->i_sb, "vs-825", "%K should not be found", &key); retval = -EEXIST; if (allocated_block_nr) @@ -1332,9 +1333,9 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, /* look for the object's stat data */ retval = search_item(inode->i_sb, &key, &path); if (retval == IO_ERROR) { - reiserfs_warning(inode->i_sb, - "vs-13050: reiserfs_update_sd: " - "i/o failure occurred trying to update %K stat data", + reiserfs_warning(inode->i_sb, "vs-13050", + "i/o failure occurred trying to " + "update %K stat data", &key); return; } @@ -1345,9 +1346,9 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */ return; } - reiserfs_warning(inode->i_sb, - "vs-13060: reiserfs_update_sd: " - "stat data of object %k (nlink == %d) not found (pos %d)", + reiserfs_warning(inode->i_sb, "vs-13060", + "stat data of object %k (nlink == %d) " + "not found (pos %d)", INODE_PKEY(inode), inode->i_nlink, pos); reiserfs_check_path(&path); @@ -1424,10 +1425,9 @@ void reiserfs_read_locked_inode(struct inode *inode, /* look for the object's stat data */ retval = search_item(inode->i_sb, &key, 
&path_to_sd); if (retval == IO_ERROR) { - reiserfs_warning(inode->i_sb, - "vs-13070: reiserfs_read_locked_inode: " - "i/o failure occurred trying to find stat data of %K", - &key); + reiserfs_warning(inode->i_sb, "vs-13070", + "i/o failure occurred trying to find " + "stat data of %K", &key); reiserfs_make_bad_inode(inode); return; } @@ -1457,8 +1457,7 @@ void reiserfs_read_locked_inode(struct inode *inode, during mount (fs/reiserfs/super.c:finish_unfinished()). */ if ((inode->i_nlink == 0) && !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { - reiserfs_warning(inode->i_sb, - "vs-13075: reiserfs_read_locked_inode: " + reiserfs_warning(inode->i_sb, "vs-13075", "dead inode read from disk %K. " "This is likely to be race with knfsd. Ignore", &key); @@ -1555,7 +1554,7 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, */ if (fh_type > fh_len) { if (fh_type != 6 || fh_len != 5) - reiserfs_warning(sb, + reiserfs_warning(sb, "reiserfs-13077", "nfsd/reiserfs, fhtype=%d, len=%d - odd", fh_type, fh_len); fh_type = 5; @@ -1680,13 +1679,13 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, /* look for place in the tree for new item */ retval = search_item(sb, &key, path); if (retval == IO_ERROR) { - reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: " + reiserfs_warning(sb, "vs-13080", "i/o failure occurred creating new directory"); return -EIO; } if (retval == ITEM_FOUND) { pathrelse(path); - reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: " + reiserfs_warning(sb, "vs-13070", "object with this key exists (%k)", &(ih->ih_key)); return -EEXIST; @@ -1720,13 +1719,13 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i /* look for place in the tree for new item */ retval = search_item(sb, &key, path); if (retval == IO_ERROR) { - reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: " + reiserfs_warning(sb, "vs-13080", "i/o failure occurred creating new symlink"); return -EIO; } if (retval == ITEM_FOUND) { pathrelse(path); - reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: " + reiserfs_warning(sb, "vs-13080", "object with this key exists (%k)", &(ih->ih_key)); return -EEXIST; @@ -1927,7 +1926,8 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_inserted_sd; } } else if (inode->i_sb->s_flags & MS_POSIXACL) { - reiserfs_warning(inode->i_sb, "ACLs aren't enabled in the fs, " + reiserfs_warning(inode->i_sb, "jdm-13090", + "ACLs aren't enabled in the fs, " "but vfs thinks they are!"); } else if (is_reiserfs_priv_object(dir)) { reiserfs_mark_inode_private(inode); @@ -2044,8 +2044,8 @@ static int grab_tail_page(struct inode *p_s_inode, ** I've screwed up the code to find the buffer, or the code to ** call prepare_write */ - reiserfs_warning(p_s_inode->i_sb, - "clm-6000: error reading block %lu on dev %s", + reiserfs_warning(p_s_inode->i_sb, "clm-6000", + "error reading block %lu on dev %s", bh->b_blocknr, reiserfs_bdevname(p_s_inode->i_sb)); error = -EIO; @@ -2089,8 +2089,8 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) // and get_block_create_0 could not find a block to read in, // which is ok. 
if (error != -ENOENT) - reiserfs_warning(p_s_inode->i_sb, - "clm-6001: grab_tail_page failed %d", + reiserfs_warning(p_s_inode->i_sb, "clm-6001", + "grab_tail_page failed %d", error); page = NULL; bh = NULL; @@ -2208,9 +2208,8 @@ static int map_block_for_writepage(struct inode *inode, /* we've found an unformatted node */ if (indirect_item_found(retval, ih)) { if (bytes_copied > 0) { - reiserfs_warning(inode->i_sb, - "clm-6002: bytes_copied %d", - bytes_copied); + reiserfs_warning(inode->i_sb, "clm-6002", + "bytes_copied %d", bytes_copied); } if (!get_block_num(item, pos_in_item)) { /* crap, we are writing to a hole */ @@ -2267,9 +2266,8 @@ static int map_block_for_writepage(struct inode *inode, goto research; } } else { - reiserfs_warning(inode->i_sb, - "clm-6003: bad item inode %lu, device %s", - inode->i_ino, reiserfs_bdevname(inode->i_sb)); + reiserfs_warning(inode->i_sb, "clm-6003", + "bad item inode %lu", inode->i_ino); retval = -EIO; goto out; } diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index 9475557ab499..8a11cf39f57b 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -97,7 +97,8 @@ static int sd_unit_num(struct virtual_item *vi) static void sd_print_vi(struct virtual_item *vi) { - reiserfs_warning(NULL, "STATDATA, index %d, type 0x%x, %h", + reiserfs_warning(NULL, "reiserfs-16100", + "STATDATA, index %d, type 0x%x, %h", vi->vi_index, vi->vi_type, vi->vi_ih); } @@ -190,7 +191,8 @@ static int direct_unit_num(struct virtual_item *vi) static void direct_print_vi(struct virtual_item *vi) { - reiserfs_warning(NULL, "DIRECT, index %d, type 0x%x, %h", + reiserfs_warning(NULL, "reiserfs-16101", + "DIRECT, index %d, type 0x%x, %h", vi->vi_index, vi->vi_type, vi->vi_ih); } @@ -278,7 +280,7 @@ static void indirect_print_item(struct item_head *ih, char *item) unp = (__le32 *) item; if (ih_item_len(ih) % UNFM_P_SIZE) - reiserfs_warning(NULL, "indirect_print_item: invalid item len"); + reiserfs_warning(NULL, "reiserfs-16102", "invalid item len"); printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih)); for (j = 0; j < I_UNFM_NUM(ih); j++) { @@ -334,7 +336,8 @@ static int indirect_unit_num(struct virtual_item *vi) static void indirect_print_vi(struct virtual_item *vi) { - reiserfs_warning(NULL, "INDIRECT, index %d, type 0x%x, %h", + reiserfs_warning(NULL, "reiserfs-16103", + "INDIRECT, index %d, type 0x%x, %h", vi->vi_index, vi->vi_type, vi->vi_ih); } @@ -359,7 +362,7 @@ static struct item_operations indirect_ops = { static int direntry_bytes_number(struct item_head *ih, int block_size) { - reiserfs_warning(NULL, "vs-16090: direntry_bytes_number: " + reiserfs_warning(NULL, "vs-16090", "bytes number is asked for direntry"); return 0; } @@ -614,7 +617,8 @@ static void direntry_print_vi(struct virtual_item *vi) int i; struct direntry_uarea *dir_u = vi->vi_uarea; - reiserfs_warning(NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", + reiserfs_warning(NULL, "reiserfs-16104", + "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); printk("%d entries: ", dir_u->entry_count); for (i = 0; i < dir_u->entry_count; i++) @@ -642,43 +646,43 @@ static struct item_operations direntry_ops = { // static int errcatch_bytes_number(struct item_head *ih, int block_size) { - reiserfs_warning(NULL, - "green-16001: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16001", + "Invalid item type observed, run fsck ASAP"); return 0; } static void errcatch_decrement_key(struct cpu_key *key) { - reiserfs_warning(NULL, - 
"green-16002: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16002", + "Invalid item type observed, run fsck ASAP"); } static int errcatch_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) { - reiserfs_warning(NULL, - "green-16003: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16003", + "Invalid item type observed, run fsck ASAP"); return 0; } static void errcatch_print_item(struct item_head *ih, char *item) { - reiserfs_warning(NULL, - "green-16004: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16004", + "Invalid item type observed, run fsck ASAP"); } static void errcatch_check_item(struct item_head *ih, char *item) { - reiserfs_warning(NULL, - "green-16005: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16005", + "Invalid item type observed, run fsck ASAP"); } static int errcatch_create_vi(struct virtual_node *vn, struct virtual_item *vi, int is_affected, int insert_size) { - reiserfs_warning(NULL, - "green-16006: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16006", + "Invalid item type observed, run fsck ASAP"); return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where // this operation is called from is of return type void. } @@ -686,36 +690,36 @@ static int errcatch_create_vi(struct virtual_node *vn, static int errcatch_check_left(struct virtual_item *vi, int free, int start_skip, int end_skip) { - reiserfs_warning(NULL, - "green-16007: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16007", + "Invalid item type observed, run fsck ASAP"); return -1; } static int errcatch_check_right(struct virtual_item *vi, int free) { - reiserfs_warning(NULL, - "green-16008: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16008", + "Invalid item type observed, run fsck ASAP"); return -1; } static int errcatch_part_size(struct virtual_item *vi, int first, int count) { - reiserfs_warning(NULL, - "green-16009: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16009", + "Invalid item type observed, run fsck ASAP"); return 0; } static int errcatch_unit_num(struct virtual_item *vi) { - reiserfs_warning(NULL, - "green-16010: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16010", + "Invalid item type observed, run fsck ASAP"); return 0; } static void errcatch_print_vi(struct virtual_item *vi) { - reiserfs_warning(NULL, - "green-16011: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, "green-16011", + "Invalid item type observed, run fsck ASAP"); } static struct item_operations errcatch_ops = { diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 677bb926e7d6..88a031fafd07 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -300,8 +300,8 @@ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, jb->journal_list = NULL; jb->bitmaps = vmalloc(mem); if (!jb->bitmaps) { - reiserfs_warning(p_s_sb, - "clm-2000, unable to allocate bitmaps for journal lists"); + reiserfs_warning(p_s_sb, "clm-2000", "unable to " + "allocate bitmaps for journal lists"); failed = 1; break; } @@ -644,8 +644,8 @@ static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) char b[BDEVNAME_SIZE]; if (buffer_journaled(bh)) { - reiserfs_warning(NULL, - "clm-2084: pinned buffer %lu:%s sent to disk", + reiserfs_warning(NULL, "clm-2084", + "pinned buffer 
%lu:%s sent to disk", bh->b_blocknr, bdevname(bh->b_bdev, b)); } if (uptodate) @@ -1122,7 +1122,8 @@ static int flush_commit_list(struct super_block *s, sync_dirty_buffer(tbh); if (unlikely(!buffer_uptodate(tbh))) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(s, "journal-601, buffer write failed"); + reiserfs_warning(s, "journal-601", + "buffer write failed"); #endif retval = -EIO; } @@ -1154,14 +1155,14 @@ static int flush_commit_list(struct super_block *s, * up propagating the write error out to the filesystem. */ if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(s, "journal-615: buffer write failed"); + reiserfs_warning(s, "journal-615", "buffer write failed"); #endif retval = -EIO; } bforget(jl->j_commit_bh); if (journal->j_last_commit_id != 0 && (jl->j_trans_id - journal->j_last_commit_id) != 1) { - reiserfs_warning(s, "clm-2200: last commit %lu, current %lu", + reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu", journal->j_last_commit_id, jl->j_trans_id); } journal->j_last_commit_id = jl->j_trans_id; @@ -1250,7 +1251,7 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb, while (cn) { if (cn->blocknr != 0) { if (debug) { - reiserfs_warning(p_s_sb, + reiserfs_warning(p_s_sb, "reiserfs-2201", "block %u, bh is %d, state %ld", cn->blocknr, cn->bh ? 1 : 0, cn->state); @@ -1288,8 +1289,8 @@ static int _update_journal_header_block(struct super_block *p_s_sb, wait_on_buffer((journal->j_header_bh)); if (unlikely(!buffer_uptodate(journal->j_header_bh))) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(p_s_sb, - "journal-699: buffer write failed"); + reiserfs_warning(p_s_sb, "journal-699", + "buffer write failed"); #endif return -EIO; } @@ -1319,8 +1320,8 @@ static int _update_journal_header_block(struct super_block *p_s_sb, sync_dirty_buffer(journal->j_header_bh); } if (!buffer_uptodate(journal->j_header_bh)) { - reiserfs_warning(p_s_sb, - "journal-837: IO error during journal replay"); + reiserfs_warning(p_s_sb, "journal-837", + "IO error during journal replay"); return -EIO; } } @@ -1401,8 +1402,7 @@ static int flush_journal_list(struct super_block *s, BUG_ON(j_len_saved <= 0); if (atomic_read(&journal->j_wcount) != 0) { - reiserfs_warning(s, - "clm-2048: flush_journal_list called with wcount %d", + reiserfs_warning(s, "clm-2048", "called with wcount %d", atomic_read(&journal->j_wcount)); } BUG_ON(jl->j_trans_id == 0); @@ -1510,8 +1510,8 @@ static int flush_journal_list(struct super_block *s, ** is not marked JDirty_wait */ if ((!was_jwait) && !buffer_locked(saved_bh)) { - reiserfs_warning(s, - "journal-813: BAD! buffer %llu %cdirty %cjwait, " + reiserfs_warning(s, "journal-813", + "BAD! buffer %llu %cdirty %cjwait, " "not in a newer tranasction", (unsigned long long)saved_bh-> b_blocknr, was_dirty ? 
' ' : '!', @@ -1529,8 +1529,8 @@ static int flush_journal_list(struct super_block *s, unlock_buffer(saved_bh); count++; } else { - reiserfs_warning(s, - "clm-2082: Unable to flush buffer %llu in %s", + reiserfs_warning(s, "clm-2082", + "Unable to flush buffer %llu in %s", (unsigned long long)saved_bh-> b_blocknr, __func__); } @@ -1541,8 +1541,8 @@ static int flush_journal_list(struct super_block *s, /* we incremented this to keep others from taking the buffer head away */ put_bh(saved_bh); if (atomic_read(&(saved_bh->b_count)) < 0) { - reiserfs_warning(s, - "journal-945: saved_bh->b_count < 0"); + reiserfs_warning(s, "journal-945", + "saved_bh->b_count < 0"); } } } @@ -1561,8 +1561,8 @@ static int flush_journal_list(struct super_block *s, } if (unlikely(!buffer_uptodate(cn->bh))) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(s, - "journal-949: buffer write failed\n"); + reiserfs_warning(s, "journal-949", + "buffer write failed"); #endif err = -EIO; } @@ -1623,7 +1623,7 @@ static int flush_journal_list(struct super_block *s, if (journal->j_last_flush_id != 0 && (jl->j_trans_id - journal->j_last_flush_id) != 1) { - reiserfs_warning(s, "clm-2201: last flush %lu, current %lu", + reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu", journal->j_last_flush_id, jl->j_trans_id); } journal->j_last_flush_id = jl->j_trans_id; @@ -2058,8 +2058,9 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb, return -1; } if (get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max) { - reiserfs_warning(p_s_sb, - "journal-2018: Bad transaction length %d encountered, ignoring transaction", + reiserfs_warning(p_s_sb, "journal-2018", + "Bad transaction length %d " + "encountered, ignoring transaction", get_desc_trans_len(desc)); return -1; } @@ -2195,8 +2196,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, brelse(d_bh); kfree(log_blocks); kfree(real_blocks); - reiserfs_warning(p_s_sb, - "journal-1169: kmalloc failed, unable to mount FS"); + reiserfs_warning(p_s_sb, "journal-1169", + "kmalloc failed, unable to mount FS"); return -1; } /* get all the buffer heads */ @@ -2218,15 +2219,18 @@ static int journal_read_transaction(struct super_block *p_s_sb, j_realblock[i - trans_half])); } if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { - reiserfs_warning(p_s_sb, - "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem"); + reiserfs_warning(p_s_sb, "journal-1207", + "REPLAY FAILURE fsck required! " + "Block to replay is outside of " + "filesystem"); goto abort_replay; } /* make sure we don't try to replay onto log or reserved area */ if (is_block_in_log_or_reserved_area (p_s_sb, real_blocks[i]->b_blocknr)) { - reiserfs_warning(p_s_sb, - "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block"); + reiserfs_warning(p_s_sb, "journal-1204", + "REPLAY FAILURE fsck required! " + "Trying to replay onto a log block"); abort_replay: brelse_array(log_blocks, i); brelse_array(real_blocks, i); @@ -2242,8 +2246,9 @@ static int journal_read_transaction(struct super_block *p_s_sb, for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); if (!buffer_uptodate(log_blocks[i])) { - reiserfs_warning(p_s_sb, - "journal-1212: REPLAY FAILURE fsck required! buffer write failed"); + reiserfs_warning(p_s_sb, "journal-1212", + "REPLAY FAILURE fsck required! 
" + "buffer write failed"); brelse_array(log_blocks + i, get_desc_trans_len(desc) - i); brelse_array(real_blocks, get_desc_trans_len(desc)); @@ -2266,8 +2271,9 @@ static int journal_read_transaction(struct super_block *p_s_sb, for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(real_blocks[i]); if (!buffer_uptodate(real_blocks[i])) { - reiserfs_warning(p_s_sb, - "journal-1226: REPLAY FAILURE, fsck required! buffer write failed"); + reiserfs_warning(p_s_sb, "journal-1226", + "REPLAY FAILURE, fsck required! " + "buffer write failed"); brelse_array(real_blocks + i, get_desc_trans_len(desc) - i); brelse(c_bh); @@ -2418,8 +2424,8 @@ static int journal_read(struct super_block *p_s_sb) } if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) { - reiserfs_warning(p_s_sb, - "clm-2076: device is readonly, unable to replay log"); + reiserfs_warning(p_s_sb, "clm-2076", + "device is readonly, unable to replay log"); return -1; } @@ -2580,9 +2586,8 @@ static int release_journal_dev(struct super_block *super, } if (result != 0) { - reiserfs_warning(super, - "sh-457: release_journal_dev: Cannot release journal device: %i", - result); + reiserfs_warning(super, "sh-457", + "Cannot release journal device: %i", result); } return result; } @@ -2612,7 +2617,7 @@ static int journal_init_dev(struct super_block *super, if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; - reiserfs_warning(super, "sh-458: journal_init_dev: " + reiserfs_warning(super, "sh-458", "cannot init journal device '%s': %i", __bdevname(jdev, b), result); return result; @@ -2676,16 +2681,16 @@ static int check_advise_trans_params(struct super_block *p_s_sb, journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio || SB_ONDISK_JOURNAL_SIZE(p_s_sb) / journal->j_trans_max < JOURNAL_MIN_RATIO) { - reiserfs_warning(p_s_sb, - "sh-462: bad transaction max size (%u). FSCK?", - journal->j_trans_max); + reiserfs_warning(p_s_sb, "sh-462", + "bad transaction max size (%u). " + "FSCK?", journal->j_trans_max); return 1; } if (journal->j_max_batch != (journal->j_trans_max) * JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) { - reiserfs_warning(p_s_sb, - "sh-463: bad transaction max batch (%u). FSCK?", - journal->j_max_batch); + reiserfs_warning(p_s_sb, "sh-463", + "bad transaction max batch (%u). 
" + "FSCK?", journal->j_max_batch); return 1; } } else { @@ -2693,9 +2698,11 @@ static int check_advise_trans_params(struct super_block *p_s_sb, The file system was created by old version of mkreiserfs, so some fields contain zeros, and we need to advise proper values for them */ - if (p_s_sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) - reiserfs_panic(p_s_sb, "sh-464: bad blocksize (%u)", - p_s_sb->s_blocksize); + if (p_s_sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { + reiserfs_warning(p_s_sb, "sh-464", "bad blocksize (%u)", + p_s_sb->s_blocksize); + return 1; + } journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; @@ -2719,8 +2726,8 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof(struct reiserfs_journal)); if (!journal) { - reiserfs_warning(p_s_sb, - "journal-1256: unable to get memory for journal structure"); + reiserfs_warning(p_s_sb, "journal-1256", + "unable to get memory for journal structure"); return 1; } memset(journal, 0, sizeof(struct reiserfs_journal)); @@ -2749,9 +2756,9 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, if (!SB_ONDISK_JOURNAL_DEVICE(p_s_sb) && (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8)) { - reiserfs_warning(p_s_sb, - "journal-1393: journal does not fit for area " - "addressed by first of bitmap blocks. It starts at " + reiserfs_warning(p_s_sb, "journal-1393", + "journal does not fit for area addressed " + "by first of bitmap blocks. It starts at " "%u and its size is %u. Block size %ld", SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb), SB_ONDISK_JOURNAL_SIZE(p_s_sb), @@ -2760,8 +2767,8 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, } if (journal_init_dev(p_s_sb, journal, j_dev_name) != 0) { - reiserfs_warning(p_s_sb, - "sh-462: unable to initialize jornal device"); + reiserfs_warning(p_s_sb, "sh-462", + "unable to initialize jornal device"); goto free_and_return; } @@ -2772,8 +2779,8 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); if (!bhjh) { - reiserfs_warning(p_s_sb, - "sh-459: unable to read journal header"); + reiserfs_warning(p_s_sb, "sh-459", + "unable to read journal header"); goto free_and_return; } jh = (struct reiserfs_journal_header *)(bhjh->b_data); @@ -2782,10 +2789,10 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, if (is_reiserfs_jr(rs) && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != sb_jp_journal_magic(rs))) { - reiserfs_warning(p_s_sb, - "sh-460: journal header magic %x " - "(device %s) does not match to magic found in super " - "block %x", jh->jh_journal.jp_journal_magic, + reiserfs_warning(p_s_sb, "sh-460", + "journal header magic %x (device %s) does " + "not match to magic found in super block %x", + jh->jh_journal.jp_journal_magic, bdevname(journal->j_dev_bd, b), sb_jp_journal_magic(rs)); brelse(bhjh); @@ -2852,7 +2859,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal->j_must_wait = 0; if (journal->j_cnode_free == 0) { - reiserfs_warning(p_s_sb, "journal-2004: Journal cnode memory " + reiserfs_warning(p_s_sb, "journal-2004", "Journal cnode memory " "allocation failed (%ld bytes). Journal is " "too large for available memory. 
Usually " "this is due to a journal that is too large.", @@ -2864,12 +2871,13 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, jl = journal->j_current_jl; jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); if (!jl->j_list_bitmap) { - reiserfs_warning(p_s_sb, - "journal-2005, get_list_bitmap failed for journal list 0"); + reiserfs_warning(p_s_sb, "journal-2005", + "get_list_bitmap failed for journal list 0"); goto free_and_return; } if (journal_read(p_s_sb) < 0) { - reiserfs_warning(p_s_sb, "Replay Failure, unable to mount"); + reiserfs_warning(p_s_sb, "reiserfs-2006", + "Replay Failure, unable to mount"); goto free_and_return; } @@ -3196,16 +3204,17 @@ int journal_begin(struct reiserfs_transaction_handle *th, cur_th->t_refcount++; memcpy(th, cur_th, sizeof(*th)); if (th->t_refcount <= 1) - reiserfs_warning(p_s_sb, - "BAD: refcount <= 1, but journal_info != 0"); + reiserfs_warning(p_s_sb, "reiserfs-2005", + "BAD: refcount <= 1, but " + "journal_info != 0"); return 0; } else { /* we've ended up with a handle from a different filesystem. ** save it and restore on journal_end. This should never ** really happen... */ - reiserfs_warning(p_s_sb, - "clm-2100: nesting info a different FS"); + reiserfs_warning(p_s_sb, "clm-2100", + "nesting info a different FS"); th->t_handle_save = current->journal_info; current->journal_info = th; } @@ -3266,7 +3275,8 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, ** could get to disk too early. NOT GOOD. */ if (!prepared || buffer_dirty(bh)) { - reiserfs_warning(p_s_sb, "journal-1777: buffer %llu bad state " + reiserfs_warning(p_s_sb, "journal-1777", + "buffer %llu bad state " "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!', @@ -3276,8 +3286,8 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, } if (atomic_read(&(journal->j_wcount)) <= 0) { - reiserfs_warning(p_s_sb, - "journal-1409: journal_mark_dirty returning because j_wcount was %d", + reiserfs_warning(p_s_sb, "journal-1409", + "returning because j_wcount was %d", atomic_read(&(journal->j_wcount))); return 1; } @@ -3342,8 +3352,8 @@ int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { if (!current->journal_info && th->t_refcount > 1) - reiserfs_warning(p_s_sb, "REISER-NESTING: th NULL, refcount %d", - th->t_refcount); + reiserfs_warning(p_s_sb, "REISER-NESTING", + "th NULL, refcount %d", th->t_refcount); if (!th->t_trans_id) { WARN_ON(1); @@ -3413,8 +3423,8 @@ static int remove_from_transaction(struct super_block *p_s_sb, clear_buffer_journal_test(bh); put_bh(bh); if (atomic_read(&(bh->b_count)) < 0) { - reiserfs_warning(p_s_sb, - "journal-1752: remove from trans, b_count < 0"); + reiserfs_warning(p_s_sb, "journal-1752", + "b_count < 0"); } ret = 1; } @@ -3734,7 +3744,8 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, if (atomic_read (&(cn->bh->b_count)) < 0) { reiserfs_warning(p_s_sb, - "journal-2138: cn->bh->b_count < 0"); + "journal-2138", + "cn->bh->b_count < 0"); } } if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ @@ -4137,8 +4148,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, clear_buffer_journaled(cn->bh); } else { /* JDirty cleared sometime during transaction. 
don't log this one */ - reiserfs_warning(p_s_sb, - "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!"); + reiserfs_warning(p_s_sb, "journal-2048", + "BAD, buffer in journal hash, " + "but not JDirty!"); brelse(cn->bh); } next = cn->next; diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 41bdd8c75887..381339b432e7 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -1288,12 +1288,16 @@ void leaf_paste_entries(struct buffer_info *bi, prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0; if (prev && prev <= deh_location(&(deh[i]))) - reiserfs_warning(NULL, - "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)", + reiserfs_warning(NULL, "vs-10240", + "directory item (%h) " + "corrupted (prev %a, " + "cur(%d) %a)", ih, deh + i - 1, i, deh + i); if (next && next >= deh_location(&(deh[i]))) - reiserfs_warning(NULL, - "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)", + reiserfs_warning(NULL, "vs-10250", + "directory item (%h) " + "corrupted (cur(%d) %a, " + "next %a)", ih, i, deh + i, deh + i + 1); } } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 738967f6c8ee..bb41c6e7c79b 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -120,8 +120,8 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, switch (retval) { case ITEM_NOT_FOUND: if (!PATH_LAST_POSITION(path)) { - reiserfs_warning(sb, - "vs-7000: search_by_entry_key: search_by_key returned item position == 0"); + reiserfs_warning(sb, "vs-7000", "search_by_key " + "returned item position == 0"); pathrelse(path); return IO_ERROR; } @@ -135,8 +135,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, default: pathrelse(path); - reiserfs_warning(sb, - "vs-7002: search_by_entry_key: no path to here"); + reiserfs_warning(sb, "vs-7002", "no path to here"); return IO_ERROR; } @@ -300,8 +299,7 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, search_by_entry_key(dir->i_sb, &key_to_search, path_to_entry, de); if (retval == IO_ERROR) { - reiserfs_warning(dir->i_sb, "zam-7001: io error in %s", - __func__); + reiserfs_warning(dir->i_sb, "zam-7001", "io error"); return IO_ERROR; } @@ -484,10 +482,9 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, } if (retval != NAME_FOUND) { - reiserfs_warning(dir->i_sb, - "zam-7002:%s: \"reiserfs_find_entry\" " - "has returned unexpected value (%d)", - __func__, retval); + reiserfs_warning(dir->i_sb, "zam-7002", + "reiserfs_find_entry() returned " + "unexpected value (%d)", retval); } return -EEXIST; @@ -498,8 +495,9 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, MAX_GENERATION_NUMBER + 1); if (gen_number > MAX_GENERATION_NUMBER) { /* there is no free generation number */ - reiserfs_warning(dir->i_sb, - "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); + reiserfs_warning(dir->i_sb, "reiserfs-7010", + "Congratulations! 
we have got hash function " + "screwed up"); if (buffer != small_buf) kfree(buffer); pathrelse(&path); @@ -515,10 +513,9 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, if (gen_number != 0) { /* we need to re-search for the insertion point */ if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) { - reiserfs_warning(dir->i_sb, - "vs-7032: reiserfs_add_entry: " - "entry with this key (%K) already exists", - &entry_key); + reiserfs_warning(dir->i_sb, "vs-7032", + "entry with this key (%K) already " + "exists", &entry_key); if (buffer != small_buf) kfree(buffer); @@ -903,8 +900,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; if (inode->i_nlink != 2 && inode->i_nlink != 1) - reiserfs_warning(inode->i_sb, "%s: empty directory has nlink " - "!= 2 (%d)", __func__, inode->i_nlink); + reiserfs_warning(inode->i_sb, "reiserfs-7040", + "empty directory has nlink != 2 (%d)", + inode->i_nlink); clear_nlink(inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; @@ -980,10 +978,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) } if (!inode->i_nlink) { - reiserfs_warning(inode->i_sb, "%s: deleting nonexistent file " - "(%s:%lu), %d", __func__, - reiserfs_bdevname(inode->i_sb), inode->i_ino, - inode->i_nlink); + reiserfs_warning(inode->i_sb, "reiserfs-7042", + "deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); inode->i_nlink = 1; } @@ -1499,8 +1496,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (reiserfs_cut_from_item (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, 0) < 0) - reiserfs_warning(old_dir->i_sb, - "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?"); + reiserfs_warning(old_dir->i_sb, "vs-7060", + "couldn't not cut old name. Fsck later?"); old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index ea0cf8c28a99..a3a5f43ff443 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -61,7 +61,7 @@ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) /* comment needed -Hans */ unused_objectid = le32_to_cpu(map[1]); if (unused_objectid == U32_MAX) { - reiserfs_warning(s, "%s: no more object ids", __func__); + reiserfs_warning(s, "reiserfs-15100", "no more object ids"); reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)); return 0; } @@ -160,8 +160,7 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, i += 2; } - reiserfs_warning(s, - "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)", + reiserfs_warning(s, "vs-15011", "tried to free free object id (%lu)", (long unsigned)objectid_to_release); } diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 535a3c7fc68e..50ed4bd3ef63 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -264,14 +264,17 @@ static void prepare_error_buf(const char *fmt, va_list args) va_end( args );\ } -void reiserfs_warning(struct super_block *sb, const char *fmt, ...) +void __reiserfs_warning(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) { do_reiserfs_warning(fmt); if (sb) - printk(KERN_WARNING "REISERFS warning (device %s): %s\n", - sb->s_id, error_buf); + printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: " + "%s\n", sb->s_id, id ? id : "", id ? 
" " : "", + function, error_buf); else - printk(KERN_WARNING "REISERFS warning: %s\n", error_buf); + printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); } /* No newline.. reiserfs_info calls can be followed by printk's */ diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 370988efc8ad..d4d7f1433ed0 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -503,7 +503,7 @@ int reiserfs_proc_info_init(struct super_block *sb) add_file(sb, "journal", show_journal); return 0; } - reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s", + reiserfs_warning(sb, "cannot create /proc/%s/%s", proc_info_root_name, b); return 1; } @@ -559,8 +559,7 @@ int reiserfs_proc_info_global_init(void) if (proc_info_root) { proc_info_root->owner = THIS_MODULE; } else { - reiserfs_warning(NULL, - "reiserfs: cannot create /proc/%s", + reiserfs_warning(NULL, "cannot create /proc/%s", proc_info_root_name); return 1; } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index abbc64dcc8d4..f328d27a19d5 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -444,23 +444,24 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) blkh = (struct block_head *)buf; if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { - reiserfs_warning(NULL, - "is_leaf: this should be caught earlier"); + reiserfs_warning(NULL, "reiserfs-5080", + "this should be caught earlier"); return 0; } nr = blkh_nr_item(blkh); if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { /* item number is too big or too small */ - reiserfs_warning(NULL, "is_leaf: nr_item seems wrong: %z", bh); + reiserfs_warning(NULL, "reiserfs-5081", + "nr_item seems wrong: %z", bh); return 0; } ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); if (used_space != blocksize - blkh_free_space(blkh)) { /* free space does not match to calculated amount of use space */ - reiserfs_warning(NULL, "is_leaf: free space seems wrong: %z", - bh); + reiserfs_warning(NULL, "reiserfs-5082", + "free space seems wrong: %z", bh); return 0; } // FIXME: it is_leaf will hit performance too much - we may have @@ -471,29 +472,29 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) prev_location = blocksize; for (i = 0; i < nr; i++, ih++) { if (le_ih_k_type(ih) == TYPE_ANY) { - reiserfs_warning(NULL, - "is_leaf: wrong item type for item %h", + reiserfs_warning(NULL, "reiserfs-5083", + "wrong item type for item %h", ih); return 0; } if (ih_location(ih) >= blocksize || ih_location(ih) < IH_SIZE * nr) { - reiserfs_warning(NULL, - "is_leaf: item location seems wrong: %h", + reiserfs_warning(NULL, "reiserfs-5084", + "item location seems wrong: %h", ih); return 0; } if (ih_item_len(ih) < 1 || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) { - reiserfs_warning(NULL, - "is_leaf: item length seems wrong: %h", + reiserfs_warning(NULL, "reiserfs-5085", + "item length seems wrong: %h", ih); return 0; } if (prev_location - ih_location(ih) != ih_item_len(ih)) { - reiserfs_warning(NULL, - "is_leaf: item location seems wrong (second one): %h", - ih); + reiserfs_warning(NULL, "reiserfs-5086", + "item location seems wrong " + "(second one): %h", ih); return 0; } prev_location = ih_location(ih); @@ -514,24 +515,23 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh) nr = blkh_level(blkh); if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { /* this level is not possible for internal nodes */ - 
reiserfs_warning(NULL, - "is_internal: this should be caught earlier"); + reiserfs_warning(NULL, "reiserfs-5087", + "this should be caught earlier"); return 0; } nr = blkh_nr_item(blkh); if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { /* for internal which is not root we might check min number of keys */ - reiserfs_warning(NULL, - "is_internal: number of key seems wrong: %z", - bh); + reiserfs_warning(NULL, "reiserfs-5088", + "number of key seems wrong: %z", bh); return 0; } used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); if (used_space != blocksize - blkh_free_space(blkh)) { - reiserfs_warning(NULL, - "is_internal: free space seems wrong: %z", bh); + reiserfs_warning(NULL, "reiserfs-5089", + "free space seems wrong: %z", bh); return 0; } // one may imagine much more checks @@ -543,8 +543,8 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh) static int is_tree_node(struct buffer_head *bh, int level) { if (B_LEVEL(bh) != level) { - reiserfs_warning(NULL, - "is_tree_node: node level %d does not match to the expected one %d", + reiserfs_warning(NULL, "reiserfs-5090", "node level %d does " + "not match to the expected one %d", B_LEVEL(bh), level); return 0; } @@ -645,9 +645,9 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* #ifdef CONFIG_REISERFS_CHECK if (!(++n_repeat_counter % 50000)) - reiserfs_warning(p_s_sb, "PAP-5100: search_by_key: %s:" - "there were %d iterations of while loop " - "looking for key %K", + reiserfs_warning(p_s_sb, "PAP-5100", + "%s: there were %d iterations of " + "while loop looking for key %K", current->comm, n_repeat_counter, p_s_key); #endif @@ -721,9 +721,9 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* // make sure, that the node contents look like a node of // certain level if (!is_tree_node(p_s_bh, expected_level)) { - reiserfs_warning(p_s_sb, "vs-5150: search_by_key: " - "invalid format found in block %ld. Fsck?", - p_s_bh->b_blocknr); + reiserfs_warning(p_s_sb, "vs-5150", + "invalid format found in block %ld. 
" + "Fsck?", p_s_bh->b_blocknr); pathrelse(p_s_search_path); return IO_ERROR; } @@ -1227,8 +1227,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath if (n_ret_value == IO_ERROR) break; if (n_ret_value == FILE_NOT_FOUND) { - reiserfs_warning(p_s_sb, - "vs-5340: reiserfs_delete_item: " + reiserfs_warning(p_s_sb, "vs-5340", "no items of the file %K found", p_s_item_key); break; @@ -1338,10 +1337,9 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, while (1) { retval = search_item(th->t_super, &cpu_key, &path); if (retval == IO_ERROR) { - reiserfs_warning(th->t_super, - "vs-5350: reiserfs_delete_solid_item: " - "i/o failure occurred trying to delete %K", - &cpu_key); + reiserfs_warning(th->t_super, "vs-5350", + "i/o failure occurred trying " + "to delete %K", &cpu_key); break; } if (retval != ITEM_FOUND) { @@ -1355,9 +1353,8 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, GET_GENERATION_NUMBER(le_key_k_offset (le_key_version(key), key)) == 1)) - reiserfs_warning(th->t_super, - "vs-5355: reiserfs_delete_solid_item: %k not found", - key); + reiserfs_warning(th->t_super, "vs-5355", + "%k not found", key); break; } if (!tb_init) { @@ -1389,8 +1386,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, break; } // IO_ERROR, NO_DISK_SPACE, etc - reiserfs_warning(th->t_super, - "vs-5360: reiserfs_delete_solid_item: " + reiserfs_warning(th->t_super, "vs-5360", "could not delete %K due to fix_nodes failure", &cpu_key); unfix_nodes(&tb); @@ -1533,8 +1529,9 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, set_cpu_key_k_offset(&tail_key, cpu_key_k_offset(&tail_key) - removed); } - reiserfs_warning(inode->i_sb, - "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space"); + reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct " + "conversion has been rolled back due to " + "lack of disk space"); //mark_file_without_tail (inode); mark_inode_dirty(inode); } @@ -1639,8 +1636,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, if (n_ret_value == POSITION_FOUND) continue; - reiserfs_warning(p_s_sb, - "PAP-5610: reiserfs_cut_from_item: item %K not found", + reiserfs_warning(p_s_sb, "PAP-5610", "item %K not found", p_s_item_key); unfix_nodes(&s_cut_balance); return (n_ret_value == IO_ERROR) ? 
-EIO : -ENOENT; @@ -1654,7 +1650,8 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, indirect_to_direct_roll_back(th, p_s_inode, p_s_path); } if (n_ret_value == NO_DISK_SPACE) - reiserfs_warning(p_s_sb, "NO_DISK_SPACE"); + reiserfs_warning(p_s_sb, "reiserfs-5092", + "NO_DISK_SPACE"); unfix_nodes(&s_cut_balance); return -EIO; } @@ -1743,8 +1740,7 @@ static void truncate_directory(struct reiserfs_transaction_handle *th, { BUG_ON(!th->t_trans_id); if (inode->i_nlink) - reiserfs_warning(inode->i_sb, - "vs-5655: truncate_directory: link count != 0"); + reiserfs_warning(inode->i_sb, "vs-5655", "link count != 0"); set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET); set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY); @@ -1797,16 +1793,14 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path); if (retval == IO_ERROR) { - reiserfs_warning(p_s_inode->i_sb, - "vs-5657: reiserfs_do_truncate: " + reiserfs_warning(p_s_inode->i_sb, "vs-5657", "i/o failure occurred trying to truncate %K", &s_item_key); err = -EIO; goto out; } if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { - reiserfs_warning(p_s_inode->i_sb, - "PAP-5660: reiserfs_do_truncate: " + reiserfs_warning(p_s_inode->i_sb, "PAP-5660", "wrong result %d of search for %K", retval, &s_item_key); @@ -1850,8 +1844,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p reiserfs_cut_from_item(th, &s_search_path, &s_item_key, p_s_inode, page, n_new_file_size); if (n_deleted < 0) { - reiserfs_warning(p_s_inode->i_sb, - "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed"); + reiserfs_warning(p_s_inode->i_sb, "vs-5665", + "reiserfs_cut_from_item failed"); reiserfs_check_path(&s_search_path); return 0; } @@ -2000,8 +1994,8 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree goto error_out; } if (retval == POSITION_FOUND) { - reiserfs_warning(inode->i_sb, - "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", + reiserfs_warning(inode->i_sb, "PAP-5710", + "entry or pasted byte (%K) exists", p_s_key); retval = -EEXIST; goto error_out; @@ -2087,8 +2081,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath goto error_out; } if (retval == ITEM_FOUND) { - reiserfs_warning(th->t_super, - "PAP-5760: reiserfs_insert_item: " + reiserfs_warning(th->t_super, "PAP-5760", "key %K already exists in the tree", key); retval = -EEXIST; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0428004dc638..bfc276c8e978 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -183,9 +183,9 @@ static int finish_unfinished(struct super_block *s) if (REISERFS_SB(s)->s_qf_names[i]) { int ret = reiserfs_quota_on_mount(s, i); if (ret < 0) - reiserfs_warning(s, - "reiserfs: cannot turn on journaled quota: error %d", - ret); + reiserfs_warning(s, "reiserfs-2500", + "cannot turn on journaled " + "quota: error %d", ret); } } #endif @@ -195,8 +195,8 @@ static int finish_unfinished(struct super_block *s) while (!retval) { retval = search_item(s, &max_cpu_key, &path); if (retval != ITEM_NOT_FOUND) { - reiserfs_warning(s, - "vs-2140: finish_unfinished: search_by_key returned %d", + reiserfs_warning(s, "vs-2140", + "search_by_key returned %d", retval); break; } @@ -204,8 +204,8 @@ static int finish_unfinished(struct super_block *s) bh = get_last_bh(&path); item_pos = get_item_pos(&path); if (item_pos 
!= B_NR_ITEMS(bh)) { - reiserfs_warning(s, - "vs-2060: finish_unfinished: wrong position found"); + reiserfs_warning(s, "vs-2060", + "wrong position found"); break; } item_pos--; @@ -235,8 +235,7 @@ static int finish_unfinished(struct super_block *s) if (!inode) { /* the unlink almost completed, it just did not manage to remove "save" link and release objectid */ - reiserfs_warning(s, - "vs-2180: finish_unfinished: iget failed for %K", + reiserfs_warning(s, "vs-2180", "iget failed for %K", &obj_key); retval = remove_save_link_only(s, &save_link_key, 1); continue; @@ -244,8 +243,8 @@ static int finish_unfinished(struct super_block *s) if (!truncate && inode->i_nlink) { /* file is not unlinked */ - reiserfs_warning(s, - "vs-2185: finish_unfinished: file %K is not unlinked", + reiserfs_warning(s, "vs-2185", + "file %K is not unlinked", &obj_key); retval = remove_save_link_only(s, &save_link_key, 0); continue; @@ -257,8 +256,9 @@ static int finish_unfinished(struct super_block *s) The only imaginable way is to execute unfinished truncate request then boot into old kernel, remove the file and create dir with the same key. */ - reiserfs_warning(s, - "green-2101: impossible truncate on a directory %k. Please report", + reiserfs_warning(s, "green-2101", + "impossible truncate on a " + "directory %k. Please report", INODE_PKEY(inode)); retval = remove_save_link_only(s, &save_link_key, 0); truncate = 0; @@ -288,9 +288,10 @@ static int finish_unfinished(struct super_block *s) /* removal gets completed in iput */ retval = 0; } else { - reiserfs_warning(s, "Dead loop in " - "finish_unfinished detected, " - "just remove save link\n"); + reiserfs_warning(s, "super-2189", "Dead loop " + "in finish_unfinished " + "detected, just remove " + "save link\n"); retval = remove_save_link_only(s, &save_link_key, 0); } @@ -360,8 +361,9 @@ void add_save_link(struct reiserfs_transaction_handle *th, } else { /* truncate */ if (S_ISDIR(inode->i_mode)) - reiserfs_warning(inode->i_sb, - "green-2102: Adding a truncate savelink for a directory %k! Please report", + reiserfs_warning(inode->i_sb, "green-2102", + "Adding a truncate savelink for " + "a directory %k! 
Please report", INODE_PKEY(inode)); set_cpu_key_k_offset(&key, 1); set_cpu_key_k_type(&key, TYPE_INDIRECT); @@ -376,7 +378,7 @@ void add_save_link(struct reiserfs_transaction_handle *th, retval = search_item(inode->i_sb, &key, &path); if (retval != ITEM_NOT_FOUND) { if (retval != -ENOSPC) - reiserfs_warning(inode->i_sb, "vs-2100: add_save_link:" + reiserfs_warning(inode->i_sb, "vs-2100", "search_by_key (%K) returned %d", &key, retval); pathrelse(&path); @@ -391,9 +393,8 @@ void add_save_link(struct reiserfs_transaction_handle *th, reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); if (retval) { if (retval != -ENOSPC) - reiserfs_warning(inode->i_sb, - "vs-2120: add_save_link: insert_item returned %d", - retval); + reiserfs_warning(inode->i_sb, "vs-2120", + "insert_item returned %d", retval); } else { if (truncate) REISERFS_I(inode)->i_flags |= @@ -492,8 +493,7 @@ static void reiserfs_put_super(struct super_block *s) print_statistics(s); if (REISERFS_SB(s)->reserved_blocks != 0) { - reiserfs_warning(s, - "green-2005: reiserfs_put_super: reserved blocks left %d", + reiserfs_warning(s, "green-2005", "reserved blocks left %d", REISERFS_SB(s)->reserved_blocks); } @@ -559,8 +559,8 @@ static void reiserfs_dirty_inode(struct inode *inode) int err = 0; if (inode->i_sb->s_flags & MS_RDONLY) { - reiserfs_warning(inode->i_sb, - "clm-6006: writing inode %lu on readonly FS", + reiserfs_warning(inode->i_sb, "clm-6006", + "writing inode %lu on readonly FS", inode->i_ino); return; } @@ -794,13 +794,15 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, if (bit_flags) { if (opt->clrmask == (1 << REISERFS_UNSUPPORTED_OPT)) - reiserfs_warning(s, "%s not supported.", + reiserfs_warning(s, "super-6500", + "%s not supported.\n", p); else *bit_flags &= ~opt->clrmask; if (opt->setmask == (1 << REISERFS_UNSUPPORTED_OPT)) - reiserfs_warning(s, "%s not supported.", + reiserfs_warning(s, "super-6501", + "%s not supported.\n", p); else *bit_flags |= opt->setmask; @@ -809,7 +811,8 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, } } if (!opt->option_name) { - reiserfs_warning(s, "unknown mount option \"%s\"", p); + reiserfs_warning(s, "super-6502", + "unknown mount option \"%s\"", p); return -1; } @@ -817,8 +820,9 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, switch (*p) { case '=': if (!opt->arg_required) { - reiserfs_warning(s, - "the option \"%s\" does not require an argument", + reiserfs_warning(s, "super-6503", + "the option \"%s\" does not " + "require an argument\n", opt->option_name); return -1; } @@ -826,14 +830,15 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, case 0: if (opt->arg_required) { - reiserfs_warning(s, - "the option \"%s\" requires an argument", - opt->option_name); + reiserfs_warning(s, "super-6504", + "the option \"%s\" requires an " + "argument\n", opt->option_name); return -1; } break; default: - reiserfs_warning(s, "head of option \"%s\" is only correct", + reiserfs_warning(s, "super-6505", + "head of option \"%s\" is only correct\n", opt->option_name); return -1; } @@ -845,7 +850,8 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY)) && !strlen(p)) { /* this catches "option=," if not allowed */ - reiserfs_warning(s, "empty argument for \"%s\"", + reiserfs_warning(s, "super-6506", + "empty argument for \"%s\"\n", opt->option_name); return -1; } @@ -867,7 
+873,8 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, } } - reiserfs_warning(s, "bad value \"%s\" for option \"%s\"", p, + reiserfs_warning(s, "super-6506", + "bad value \"%s\" for option \"%s\"\n", p, opt->option_name); return -1; } @@ -957,9 +964,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin *blocks = simple_strtoul(arg, &p, 0); if (*p != '\0') { /* NNN does not look like a number */ - reiserfs_warning(s, - "reiserfs_parse_options: bad value %s", - arg); + reiserfs_warning(s, "super-6507", + "bad value %s for " + "-oresize\n", arg); return 0; } } @@ -970,8 +977,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin unsigned long val = simple_strtoul(arg, &p, 0); /* commit=NNN (time in seconds) */ if (*p != '\0' || val >= (unsigned int)-1) { - reiserfs_warning(s, - "reiserfs_parse_options: bad value %s", + reiserfs_warning(s, "super-6508", + "bad value %s for -ocommit\n", arg); return 0; } @@ -979,16 +986,18 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin } if (c == 'w') { - reiserfs_warning(s, "reiserfs: nolargeio option is no longer supported"); + reiserfs_warning(s, "super-6509", "nolargeio option " + "is no longer supported"); return 0; } if (c == 'j') { if (arg && *arg && jdev_name) { if (*jdev_name) { //Hm, already assigned? - reiserfs_warning(s, - "reiserfs_parse_options: journal device was already specified to be %s", - *jdev_name); + reiserfs_warning(s, "super-6510", + "journal device was " + "already specified to " + "be %s", *jdev_name); return 0; } *jdev_name = arg; @@ -1000,29 +1009,35 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin if (sb_any_quota_loaded(s) && (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { - reiserfs_warning(s, - "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); + reiserfs_warning(s, "super-6511", + "cannot change journaled " + "quota options when quota " + "turned on."); return 0; } if (*arg) { /* Some filename specified? 
*/ if (REISERFS_SB(s)->s_qf_names[qtype] && strcmp(REISERFS_SB(s)->s_qf_names[qtype], arg)) { - reiserfs_warning(s, - "reiserfs_parse_options: %s quota file already specified.", + reiserfs_warning(s, "super-6512", + "%s quota file " + "already specified.", QTYPE2NAME(qtype)); return 0; } if (strchr(arg, '/')) { - reiserfs_warning(s, - "reiserfs_parse_options: quotafile must be on filesystem root."); + reiserfs_warning(s, "super-6513", + "quotafile must be " + "on filesystem root."); return 0; } qf_names[qtype] = kmalloc(strlen(arg) + 1, GFP_KERNEL); if (!qf_names[qtype]) { - reiserfs_warning(s, - "reiserfs_parse_options: not enough memory for storing quotafile name."); + reiserfs_warning(s, "reiserfs-2502", + "not enough memory " + "for storing " + "quotafile name."); return 0; } strcpy(qf_names[qtype], arg); @@ -1040,21 +1055,24 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin else if (!strcmp(arg, "vfsv0")) *qfmt = QFMT_VFS_V0; else { - reiserfs_warning(s, - "reiserfs_parse_options: unknown quota format specified."); + reiserfs_warning(s, "super-6514", + "unknown quota format " + "specified."); return 0; } if (sb_any_quota_loaded(s) && *qfmt != REISERFS_SB(s)->s_jquota_fmt) { - reiserfs_warning(s, - "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); + reiserfs_warning(s, "super-6515", + "cannot change journaled " + "quota options when quota " + "turned on."); return 0; } } #else if (c == 'u' || c == 'g' || c == 'f') { - reiserfs_warning(s, - "reiserfs_parse_options: journaled quota options not supported."); + reiserfs_warning(s, "reiserfs-2503", "journaled " + "quota options not supported."); return 0; } #endif @@ -1063,15 +1081,15 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin #ifdef CONFIG_QUOTA if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { - reiserfs_warning(s, - "reiserfs_parse_options: journaled quota format not specified."); + reiserfs_warning(s, "super-6515", + "journaled quota format not specified."); return 0; } /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ if (!(*mount_options & (1 << REISERFS_QUOTA)) && sb_any_quota_loaded(s)) { - reiserfs_warning(s, - "reiserfs_parse_options: quota options must be present when quota is turned on."); + reiserfs_warning(s, "super-6516", "quota options must " + "be present when quota is turned on."); return 0; } #endif @@ -1131,14 +1149,15 @@ static void handle_attrs(struct super_block *s) if (reiserfs_attrs(s)) { if (old_format_only(s)) { - reiserfs_warning(s, - "reiserfs: cannot support attributes on 3.5.x disk format"); + reiserfs_warning(s, "super-6517", "cannot support " + "attributes on 3.5.x disk format"); REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); return; } if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) { - reiserfs_warning(s, - "reiserfs: cannot support attributes until flag is set in super-block"); + reiserfs_warning(s, "super-6518", "cannot support " + "attributes until flag is set in " + "super-block"); REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); } } @@ -1316,7 +1335,7 @@ static int read_super_block(struct super_block *s, int offset) bh = sb_bread(s, offset / s->s_blocksize); if (!bh) { - reiserfs_warning(s, "sh-2006: read_super_block: " + reiserfs_warning(s, "sh-2006", "bread failed (dev %s, block %lu, size %lu)", reiserfs_bdevname(s), offset / s->s_blocksize, s->s_blocksize); @@ -1337,8 +1356,8 @@ 
static int read_super_block(struct super_block *s, int offset) bh = sb_bread(s, offset / s->s_blocksize); if (!bh) { - reiserfs_warning(s, "sh-2007: read_super_block: " - "bread failed (dev %s, block %lu, size %lu)\n", + reiserfs_warning(s, "sh-2007", + "bread failed (dev %s, block %lu, size %lu)", reiserfs_bdevname(s), offset / s->s_blocksize, s->s_blocksize); return 1; @@ -1346,8 +1365,8 @@ static int read_super_block(struct super_block *s, int offset) rs = (struct reiserfs_super_block *)bh->b_data; if (sb_blocksize(rs) != s->s_blocksize) { - reiserfs_warning(s, "sh-2011: read_super_block: " - "can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n", + reiserfs_warning(s, "sh-2011", "can't find a reiserfs " + "filesystem on (dev %s, block %Lu, size %lu)", reiserfs_bdevname(s), (unsigned long long)bh->b_blocknr, s->s_blocksize); @@ -1357,9 +1376,10 @@ static int read_super_block(struct super_block *s, int offset) if (rs->s_v1.s_root_block == cpu_to_le32(-1)) { brelse(bh); - reiserfs_warning(s, - "Unfinished reiserfsck --rebuild-tree run detected. Please run\n" - "reiserfsck --rebuild-tree and wait for a completion. If that fails\n" + reiserfs_warning(s, "super-6519", "Unfinished reiserfsck " + "--rebuild-tree run detected. Please run\n" + "reiserfsck --rebuild-tree and wait for a " + "completion. If that fails\n" "get newer reiserfsprogs package"); return 1; } @@ -1377,10 +1397,9 @@ static int read_super_block(struct super_block *s, int offset) reiserfs_info(s, "found reiserfs format \"3.5\"" " with non-standard journal\n"); else { - reiserfs_warning(s, - "sh-2012: read_super_block: found unknown " - "format \"%u\" of reiserfs with non-standard magic", - sb_version(rs)); + reiserfs_warning(s, "sh-2012", "found unknown " + "format \"%u\" of reiserfs with " + "non-standard magic", sb_version(rs)); return 1; } } else @@ -1410,8 +1429,7 @@ static int reread_meta_blocks(struct super_block *s) ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); wait_on_buffer(SB_BUFFER_WITH_SB(s)); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { - reiserfs_warning(s, - "reread_meta_blocks, error reading the super"); + reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; } @@ -1475,10 +1493,10 @@ static __u32 find_hash_out(struct super_block *s) && (yurahash == GET_HASH_VALUE(deh_offset (&(de.de_deh[de.de_entry_num])))))) { - reiserfs_warning(s, - "Unable to automatically detect hash function. " - "Please mount with -o hash={tea,rupasov,r5}", - reiserfs_bdevname(s)); + reiserfs_warning(s, "reiserfs-2506", "Unable to " + "automatically detect hash function. 
" + "Please mount with -o " + "hash={tea,rupasov,r5}"); hash = UNSET_HASH; break; } @@ -1492,7 +1510,8 @@ static __u32 find_hash_out(struct super_block *s) (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) hash = R5_HASH; else { - reiserfs_warning(s, "Unrecognised hash function"); + reiserfs_warning(s, "reiserfs-2506", + "Unrecognised hash function"); hash = UNSET_HASH; } } while (0); @@ -1520,17 +1539,20 @@ static int what_hash(struct super_block *s) ** mount options */ if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { - reiserfs_warning(s, "Error, %s hash detected, " + reiserfs_warning(s, "reiserfs-2507", + "Error, %s hash detected, " "unable to force rupasov hash", reiserfs_hashname(code)); code = UNSET_HASH; } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { - reiserfs_warning(s, "Error, %s hash detected, " + reiserfs_warning(s, "reiserfs-2508", + "Error, %s hash detected, " "unable to force tea hash", reiserfs_hashname(code)); code = UNSET_HASH; } else if (reiserfs_r5_hash(s) && code != R5_HASH) { - reiserfs_warning(s, "Error, %s hash detected, " + reiserfs_warning(s, "reiserfs-2509", + "Error, %s hash detected, " "unable to force r5 hash", reiserfs_hashname(code)); code = UNSET_HASH; @@ -1589,9 +1611,9 @@ static int function2code(hashf_t func) return 0; } -#define SWARN(silent, s, ...) \ +#define SWARN(silent, s, id, ...) \ if (!(silent)) \ - reiserfs_warning (s, __VA_ARGS__) + reiserfs_warning(s, id, __VA_ARGS__) static int reiserfs_fill_super(struct super_block *s, void *data, int silent) { @@ -1643,8 +1665,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) #endif if (blocks) { - SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option " - "for remount only"); + SWARN(silent, s, "jmacd-7", "resize option for remount only"); goto error; } @@ -1653,8 +1674,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) old_format = 1; /* try new format (64-th 1k block), which can contain reiserfs super block */ else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { - SWARN(silent, s, - "sh-2021: reiserfs_fill_super: can not find reiserfs on %s", + SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", reiserfs_bdevname(s)); goto error; } @@ -1666,13 +1686,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (s->s_bdev && s->s_bdev->bd_inode && i_size_read(s->s_bdev->bd_inode) < sb_block_count(rs) * sb_blocksize(rs)) { - SWARN(silent, s, - "Filesystem on %s cannot be mounted because it is bigger than the device", - reiserfs_bdevname(s)); - SWARN(silent, s, - "You may need to run fsck or increase size of your LVM partition"); - SWARN(silent, s, - "Or may be you forgot to reboot after fdisk when it told you to"); + SWARN(silent, s, "", "Filesystem cannot be " + "mounted because it is bigger than the device"); + SWARN(silent, s, "", "You may need to run fsck " + "or increase size of your LVM partition"); + SWARN(silent, s, "", "Or may be you forgot to " + "reboot after fdisk when it told you to"); goto error; } @@ -1680,14 +1699,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) sbi->s_mount_state = REISERFS_VALID_FS; if ((errval = reiserfs_init_bitmap_cache(s))) { - SWARN(silent, s, - "jmacd-8: reiserfs_fill_super: unable to read bitmap"); + SWARN(silent, s, "jmacd-8", "unable to read bitmap"); goto error; } errval = -EINVAL; #ifdef CONFIG_REISERFS_CHECK - SWARN(silent, s, "CONFIG_REISERFS_CHECK is set ON"); - SWARN(silent, s, "- it is slow 
mode for debugging."); + SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); + SWARN(silent, s, "", "- it is slow mode for debugging."); #endif /* make data=ordered the default */ @@ -1708,8 +1726,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) } // set_device_ro(s->s_dev, 1) ; if (journal_init(s, jdev_name, old_format, commit_max_age)) { - SWARN(silent, s, - "sh-2022: reiserfs_fill_super: unable to initialize journal space"); + SWARN(silent, s, "sh-2022", + "unable to initialize journal space"); goto error; } else { jinit_done = 1; /* once this is set, journal_release must be called @@ -1717,8 +1735,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) */ } if (reread_meta_blocks(s)) { - SWARN(silent, s, - "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init"); + SWARN(silent, s, "jmacd-9", + "unable to reread meta blocks after journal init"); goto error; } @@ -1726,8 +1744,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) goto error; if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { - SWARN(silent, s, - "clm-7000: Detected readonly device, marking FS readonly"); + SWARN(silent, s, "clm-7000", + "Detected readonly device, marking FS readonly"); s->s_flags |= MS_RDONLY; } args.objectid = REISERFS_ROOT_OBJECTID; @@ -1736,8 +1754,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); if (!root_inode) { - SWARN(silent, s, - "jmacd-10: reiserfs_fill_super: get root inode failed"); + SWARN(silent, s, "jmacd-10", "get root inode failed"); goto error; } @@ -1786,7 +1803,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) * avoiding corruption. -jeffm */ if (bmap_would_wrap(reiserfs_bmap_count(s)) && sb_bmap_nr(rs) != 0) { - reiserfs_warning(s, "super-2030: This file system " + reiserfs_warning(s, "super-2030", "This file system " "claims to use %u bitmap blocks in " "its super block, but requires %u. " "Clearing to zero.", sb_bmap_nr(rs), @@ -2087,8 +2104,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { err = reiserfs_unpack(inode, NULL); if (err) { - reiserfs_warning(sb, - "reiserfs: Unpacking tail of quota file failed" + reiserfs_warning(sb, "super-6520", + "Unpacking tail of quota file failed" " (%d). Cannot turn on quotas.", err); err = -EINVAL; goto out; @@ -2099,8 +2116,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (REISERFS_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ if (path.dentry->d_parent != sb->s_root) - reiserfs_warning(sb, - "reiserfs: Quota file not on filesystem root. " + reiserfs_warning(sb, "super-6521", + "Quota file not on filesystem root. " "Journalled quota will not work."); } diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index f8121a1147e8..256285dddb20 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -48,9 +48,9 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, // FIXME: we could avoid this if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { - reiserfs_warning(sb, "PAP-14030: direct2indirect: " - "pasted or inserted byte exists in the tree %K. 
" - "Use fsck to repair.", &end_key); + reiserfs_warning(sb, "PAP-14030", + "pasted or inserted byte exists in " + "the tree %K. Use fsck to repair.", &end_key); pathrelse(path); return -EIO; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e11b00472361..d14f5c2c0e4a 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -259,7 +259,8 @@ static int __xattr_readdir(struct inode *inode, void *dirent, filldir_t filldir) ih = de.de_ih; if (!is_direntry_le_ih(ih)) { - reiserfs_warning(inode->i_sb, "not direntry %h", ih); + reiserfs_warning(inode->i_sb, "jdm-20000", + "not direntry %h", ih); break; } copy_item_head(&tmp_ih, ih); @@ -598,7 +599,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { unlock_page(page); reiserfs_put_page(page); - reiserfs_warning(inode->i_sb, + reiserfs_warning(inode->i_sb, "jdm-20001", "Invalid magic for xattr (%s) " "associated with %k", name, INODE_PKEY(inode)); @@ -618,7 +619,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != hash) { - reiserfs_warning(inode->i_sb, + reiserfs_warning(inode->i_sb, "jdm-20002", "Invalid hash for xattr (%s) associated " "with %k", name, INODE_PKEY(inode)); err = -EIO; @@ -652,7 +653,8 @@ __reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) goto out_file; if (!is_reiserfs_priv_object(dentry->d_inode)) { - reiserfs_warning(dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have " + reiserfs_warning(dir->i_sb, "jdm-20003", + "OID %08x [%.*s/%.*s] doesn't have " "priv flag set [parent is %sset].", le32_to_cpu(INODE_PKEY(dentry->d_inode)-> k_objectid), xadir->d_name.len, @@ -750,7 +752,7 @@ int reiserfs_delete_xattrs(struct inode *inode) reiserfs_write_unlock_xattrs(inode->i_sb); dput(root); } else { - reiserfs_warning(inode->i_sb, + reiserfs_warning(inode->i_sb, "jdm-20006", "Couldn't remove all entries in directory"); } unlock_kernel(); @@ -1154,7 +1156,8 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) } else if (reiserfs_xattrs_optional(s)) { /* Old format filesystem, but optional xattrs have been enabled * at mount time. Error out. */ - reiserfs_warning(s, "xattrs/ACLs not supported on pre v3.6 " + reiserfs_warning(s, "jdm-20005", + "xattrs/ACLs not supported on pre v3.6 " "format filesystem. Failing mount."); err = -EOPNOTSUPP; goto error; @@ -1201,8 +1204,10 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) /* If we're read-only it just means that the dir hasn't been * created. Not an error -- just no xattrs on the fs. We'll * check again if we go read-write */ - reiserfs_warning(s, "xattrs/ACLs enabled and couldn't " - "find/create .reiserfs_priv. Failing mount."); + reiserfs_warning(s, "jdm-20006", + "xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. " + "Failing mount."); err = -EOPNOTSUPP; } } diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 65bb5e3e3abe..056e2a3b04e3 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -79,7 +79,10 @@ struct fid; */ #define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */ -void reiserfs_warning(struct super_block *s, const char *fmt, ...); +void __reiserfs_warning(struct super_block *s, const char *id, + const char *func, const char *fmt, ...); +#define reiserfs_warning(s, id, fmt, args...) 
\ + __reiserfs_warning(s, id, __func__, fmt, ##args) /* assertions handling */ /** always check a condition and panic if it's false. */ @@ -558,7 +561,7 @@ static inline int uniqueness2type(__u32 uniqueness) case V1_DIRENTRY_UNIQUENESS: return TYPE_DIRENTRY; default: - reiserfs_warning(NULL, "vs-500: unknown uniqueness %d", + reiserfs_warning(NULL, "vs-500", "unknown uniqueness %d", uniqueness); case V1_ANY_UNIQUENESS: return TYPE_ANY; @@ -578,7 +581,7 @@ static inline __u32 type2uniqueness(int type) case TYPE_DIRENTRY: return V1_DIRENTRY_UNIQUENESS; default: - reiserfs_warning(NULL, "vs-501: unknown type %d", type); + reiserfs_warning(NULL, "vs-501", "unknown type %d", type); case TYPE_ANY: return V1_ANY_UNIQUENESS; } -- cgit v1.2.3-71-gd317 From fd7cb031efb1dd71cb731668e2f597d9e61acdcb Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:23 -0400 Subject: reiserfs: eliminate reiserfs_warning from uniqueness functions uniqueness2type and type2uniquness issue a warning when the value is unknown. When called from reiserfs_warning, this causes a re-entrancy problem and deadlocks on the error buffer lock. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- include/linux/reiserfs_fs.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 056e2a3b04e3..cf5407ee0f32 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -560,10 +560,8 @@ static inline int uniqueness2type(__u32 uniqueness) return TYPE_DIRECT; case V1_DIRENTRY_UNIQUENESS: return TYPE_DIRENTRY; - default: - reiserfs_warning(NULL, "vs-500", "unknown uniqueness %d", - uniqueness); case V1_ANY_UNIQUENESS: + default: return TYPE_ANY; } } @@ -580,9 +578,8 @@ static inline __u32 type2uniqueness(int type) return V1_DIRECT_UNIQUENESS; case TYPE_DIRENTRY: return V1_DIRENTRY_UNIQUENESS; - default: - reiserfs_warning(NULL, "vs-501", "unknown type %d", type); case TYPE_ANY: + default: return V1_ANY_UNIQUENESS; } } -- cgit v1.2.3-71-gd317 From c3a9c2109f84882b9b3178f6b1838d550d3df0ec Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:25 -0400 Subject: reiserfs: rework reiserfs_panic ReiserFS panics can be somewhat inconsistent. In some cases: * a unique identifier may be associated with it * the function name may be included * the device may be printed separately This patch aims to make warnings more consistent. reiserfs_warning() prints the device name, so printing it a second time is not required. The function name for a warning is always helpful in debugging, so it is now automatically inserted into the output. Hans has stated that every warning should have a unique identifier. Some cases lack them, others really shouldn't have them. reiserfs_warning() now expects an id associated with each message. In the rare case where one isn't needed, "" will suffice. 
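For illustration, the calling convention changes roughly as follows (taken from the first do_balan.c hunk below; the id becomes a separate argument and the function name is supplied automatically by the reiserfs_panic/reiserfs_warning macros via __func__):

	/* old style: id and function name embedded in the format string */
	reiserfs_panic(tb->tb_sb,
		       "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)", ...);

	/* new style: id passed separately, __func__ added by the macro */
	reiserfs_panic(tb->tb_sb, "PAP-12040",
		       "unexpected mode: %s(%d)", ...);
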
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/do_balan.c | 67 ++++++++++++++++++++++-------------------- fs/reiserfs/fix_node.c | 68 ++++++++++++++++++++++--------------------- fs/reiserfs/ibalance.c | 12 ++++---- fs/reiserfs/inode.c | 3 +- fs/reiserfs/item_ops.c | 8 +++-- fs/reiserfs/journal.c | 57 ++++++++++++++++++------------------ fs/reiserfs/lbalance.c | 27 +++++++++-------- fs/reiserfs/namei.c | 18 +++++------- fs/reiserfs/objectid.c | 3 +- fs/reiserfs/prints.c | 33 ++++++++++----------- fs/reiserfs/stree.c | 49 +++++++++++++++---------------- fs/reiserfs/tail_conversion.c | 10 +++---- include/linux/reiserfs_fs.h | 28 ++++++++++++++---- 13 files changed, 200 insertions(+), 183 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index f701f37ddf98..e788fbc3ff6b 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -153,8 +153,8 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) default: print_cur_tb("12040"); - reiserfs_panic(tb->tb_sb, - "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)", + reiserfs_panic(tb->tb_sb, "PAP-12040", + "unexpected mode: %s(%d)", (flag == M_PASTE) ? "PASTE" : ((flag == M_INSERT) ? "INSERT" : @@ -721,8 +721,9 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h } break; default: /* cases d and t */ - reiserfs_panic(tb->tb_sb, - "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)", + reiserfs_panic(tb->tb_sb, "PAP-12130", + "lnum > 0: unexpected mode: " + " %s(%d)", (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) @@ -1134,8 +1135,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h } break; default: /* cases d and t */ - reiserfs_panic(tb->tb_sb, - "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)", + reiserfs_panic(tb->tb_sb, "PAP-12175", + "rnum > 0: unexpected mode: %s(%d)", (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" @@ -1165,8 +1166,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h not set correctly */ if (tb->CFL[0]) { if (!tb->CFR[0]) - reiserfs_panic(tb->tb_sb, - "vs-12195: balance_leaf: CFR not initialized"); + reiserfs_panic(tb->tb_sb, "vs-12195", + "CFR not initialized"); copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])); do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); @@ -1472,7 +1473,10 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h && (pos_in_item != ih_item_len(ih_check) || tb->insert_size[0] <= 0)) reiserfs_panic(tb->tb_sb, - "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len"); + "PAP-12235", + "pos_in_item " + "must be equal " + "to ih_item_len"); #endif /* CONFIG_REISERFS_CHECK */ leaf_mi = @@ -1532,8 +1536,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h } break; default: /* cases d and t */ - reiserfs_panic(tb->tb_sb, - "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)", + reiserfs_panic(tb->tb_sb, "PAP-12245", + "blknum > 2: unexpected mode: %s(%d)", (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? 
"CUT" @@ -1678,10 +1682,11 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h print_cur_tb("12285"); reiserfs_panic(tb-> tb_sb, - "PAP-12285: balance_leaf: insert_size must be 0 (%d)", - tb-> - insert_size - [0]); + "PAP-12285", + "insert_size " + "must be 0 " + "(%d)", + tb->insert_size[0]); } } #endif /* CONFIG_REISERFS_CHECK */ @@ -1694,11 +1699,10 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h if (flag == M_PASTE && tb->insert_size[0]) { print_cur_tb("12290"); reiserfs_panic(tb->tb_sb, - "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", + "PAP-12290", "insert_size is still not 0 (%d)", tb->insert_size[0]); } #endif /* CONFIG_REISERFS_CHECK */ - return 0; } /* Leaf level of the tree is balanced (end of balance_leaf) */ @@ -1729,8 +1733,7 @@ struct buffer_head *get_FEB(struct tree_balance *tb) break; if (i == MAX_FEB_SIZE) - reiserfs_panic(tb->tb_sb, - "vs-12300: get_FEB: FEB list is empty"); + reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty"); bi.tb = tb; bi.bi_bh = first_b = tb->FEB[i]; @@ -1871,8 +1874,8 @@ static void check_internal_node(struct super_block *s, struct buffer_head *bh, for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) { if (!is_reusable(s, dc_block_number(dc), 1)) { print_cur_tb(mes); - reiserfs_panic(s, - "PAP-12338: check_internal_node: invalid child pointer %y in %b", + reiserfs_panic(s, "PAP-12338", + "invalid child pointer %y in %b", dc, bh); } } @@ -1894,9 +1897,10 @@ static int check_before_balancing(struct tree_balance *tb) int retval = 0; if (cur_tb) { - reiserfs_panic(tb->tb_sb, "vs-12335: check_before_balancing: " - "suspect that schedule occurred based on cur_tb not being null at this point in code. " - "do_balance cannot properly handle schedule occurring while it runs."); + reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " + "occurred based on cur_tb not being null at " + "this point in code. do_balance cannot properly " + "handle schedule occurring while it runs."); } /* double check that buffers that we will modify are unlocked. 
(fix_nodes should already have @@ -1928,8 +1932,8 @@ static void check_after_balance_leaf(struct tree_balance *tb) dc_size(B_N_CHILD (tb->FL[0], get_left_neighbor_position(tb, 0)))) { print_cur_tb("12221"); - reiserfs_panic(tb->tb_sb, - "PAP-12355: check_after_balance_leaf: shift to left was incorrect"); + reiserfs_panic(tb->tb_sb, "PAP-12355", + "shift to left was incorrect"); } } if (tb->rnum[0]) { @@ -1938,8 +1942,8 @@ static void check_after_balance_leaf(struct tree_balance *tb) dc_size(B_N_CHILD (tb->FR[0], get_right_neighbor_position(tb, 0)))) { print_cur_tb("12222"); - reiserfs_panic(tb->tb_sb, - "PAP-12360: check_after_balance_leaf: shift to right was incorrect"); + reiserfs_panic(tb->tb_sb, "PAP-12360", + "shift to right was incorrect"); } } if (PATH_H_PBUFFER(tb->tb_path, 1) && @@ -1964,8 +1968,7 @@ static void check_after_balance_leaf(struct tree_balance *tb) (PATH_H_PBUFFER(tb->tb_path, 1), PATH_H_POSITION(tb->tb_path, 1))), right); - reiserfs_panic(tb->tb_sb, - "PAP-12365: check_after_balance_leaf: S is incorrect"); + reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect"); } } @@ -2100,8 +2103,8 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */ tb->need_balance_dirty = 0; if (FILESYSTEM_CHANGED_TB(tb)) { - reiserfs_panic(tb->tb_sb, - "clm-6000: do_balance, fs generation has changed\n"); + reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has " + "changed"); } /* if we have no real work to do */ if (!tb->insert_size[0]) { diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 59735a9e2349..bbb37b0589af 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -135,8 +135,7 @@ static void create_virtual_node(struct tree_balance *tb, int h) vn->vn_free_ptr += op_create_vi(vn, vi, is_affected, tb->insert_size[0]); if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) - reiserfs_panic(tb->tb_sb, - "vs-8030: create_virtual_node: " + reiserfs_panic(tb->tb_sb, "vs-8030", "virtual node space consumed"); if (!is_affected) @@ -186,8 +185,9 @@ static void create_virtual_node(struct tree_balance *tb, int h) && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) { /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ print_block(Sh, 0, -1, -1); - reiserfs_panic(tb->tb_sb, - "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c", + reiserfs_panic(tb->tb_sb, "vs-8045", + "rdkey %k, affected item==%d " + "(mode==%c) Must be %c", key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE); } @@ -1255,8 +1255,8 @@ static int ip_check_balance(struct tree_balance *tb, int h) /* Calculate balance parameters for creating new root. 
*/ if (!Sh) { if (!h) - reiserfs_panic(tb->tb_sb, - "vs-8210: ip_check_balance: S[0] can not be 0"); + reiserfs_panic(tb->tb_sb, "vs-8210", + "S[0] can not be 0"); switch (n_ret_value = get_empty_nodes(tb, h)) { case CARRY_ON: set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); @@ -1266,8 +1266,8 @@ static int ip_check_balance(struct tree_balance *tb, int h) case REPEAT_SEARCH: return n_ret_value; default: - reiserfs_panic(tb->tb_sb, - "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes"); + reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect " + "return value of get_empty_nodes"); } } @@ -2095,38 +2095,38 @@ static void tb_buffer_sanity_check(struct super_block *p_s_sb, if (p_s_bh) { if (atomic_read(&(p_s_bh->b_count)) <= 0) { - reiserfs_panic(p_s_sb, - "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", - descr, level, p_s_bh); + reiserfs_panic(p_s_sb, "jmacd-1", "negative or zero " + "reference counter for buffer %s[%d] " + "(%b)", descr, level, p_s_bh); } if (!buffer_uptodate(p_s_bh)) { - reiserfs_panic(p_s_sb, - "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", + reiserfs_panic(p_s_sb, "jmacd-2", "buffer is not up " + "to date %s[%d] (%b)", descr, level, p_s_bh); } if (!B_IS_IN_TREE(p_s_bh)) { - reiserfs_panic(p_s_sb, - "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", + reiserfs_panic(p_s_sb, "jmacd-3", "buffer is not " + "in tree %s[%d] (%b)", descr, level, p_s_bh); } if (p_s_bh->b_bdev != p_s_sb->s_bdev) { - reiserfs_panic(p_s_sb, - "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", + reiserfs_panic(p_s_sb, "jmacd-4", "buffer has wrong " + "device %s[%d] (%b)", descr, level, p_s_bh); } if (p_s_bh->b_size != p_s_sb->s_blocksize) { - reiserfs_panic(p_s_sb, - "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", + reiserfs_panic(p_s_sb, "jmacd-5", "buffer has wrong " + "blocksize %s[%d] (%b)", descr, level, p_s_bh); } if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { - reiserfs_panic(p_s_sb, - "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", + reiserfs_panic(p_s_sb, "jmacd-6", "buffer block " + "number too high %s[%d] (%b)", descr, level, p_s_bh); } } @@ -2358,14 +2358,14 @@ int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ #ifdef CONFIG_REISERFS_CHECK if (cur_tb) { print_cur_tb("fix_nodes"); - reiserfs_panic(p_s_tb->tb_sb, - "PAP-8305: fix_nodes: there is pending do_balance"); + reiserfs_panic(p_s_tb->tb_sb, "PAP-8305", + "there is pending do_balance"); } if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) { - reiserfs_panic(p_s_tb->tb_sb, - "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate " - "at the beginning of fix_nodes or not in tree (mode %c)", + reiserfs_panic(p_s_tb->tb_sb, "PAP-8320", "S[0] (%b %z) is " + "not uptodate at the beginning of fix_nodes " + "or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode); } @@ -2373,24 +2373,26 @@ int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ switch (n_op_mode) { case M_INSERT: if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0)) - reiserfs_panic(p_s_tb->tb_sb, - "PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert", - n_item_num, B_NR_ITEMS(p_s_tbS0)); + reiserfs_panic(p_s_tb->tb_sb, "PAP-8330", "Incorrect " + "item number %d (in S0 - %d) in case " + "of insert", n_item_num, + B_NR_ITEMS(p_s_tbS0)); break; case M_PASTE: case M_DELETE: case M_CUT: 
if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) { print_block(p_s_tbS0, 0, -1, -1); - reiserfs_panic(p_s_tb->tb_sb, - "PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n", + reiserfs_panic(p_s_tb->tb_sb, "PAP-8335", "Incorrect " + "item number(%d); mode = %c " + "insert_size = %d", n_item_num, n_op_mode, p_s_tb->insert_size[0]); } break; default: - reiserfs_panic(p_s_tb->tb_sb, - "PAP-8340: fix_nodes: Incorrect mode of operation"); + reiserfs_panic(p_s_tb->tb_sb, "PAP-8340", "Incorrect mode " + "of operation"); } #endif diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index de391a82b999..063b5514fe29 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -105,8 +105,8 @@ static void internal_define_dest_src_infos(int shift_mode, break; default: - reiserfs_panic(tb->tb_sb, - "internal_define_dest_src_infos: shift type is unknown (%d)", + reiserfs_panic(tb->tb_sb, "ibalance-1", + "shift type is unknown (%d)", shift_mode); } } @@ -702,8 +702,8 @@ static void balance_internal_when_delete(struct tree_balance *tb, return; } - reiserfs_panic(tb->tb_sb, - "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", + reiserfs_panic(tb->tb_sb, "ibalance-2", + "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", h, tb->lnum[h], h, tb->rnum[h]); } @@ -940,8 +940,8 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure struct block_head *blkh; if (tb->blknum[h] != 1) - reiserfs_panic(NULL, - "balance_internal: One new node required for creating the new root"); + reiserfs_panic(NULL, "ibalance-3", "One new node " + "required for creating the new root"); /* S[h] = empty buffer from the list FEB. */ tbSh = get_FEB(tb); blkh = B_BLK_HEAD(tbSh); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 95157762b1bf..7ee0097004c0 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1300,8 +1300,7 @@ static void update_stat_data(struct treepath *path, struct inode *inode, ih = PATH_PITEM_HEAD(path); if (!is_statdata_le_ih(ih)) - reiserfs_panic(inode->i_sb, - "vs-13065: update_stat_data: key %k, found item %h", + reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", INODE_PKEY(inode), ih); if (stat_data_v1(ih)) { diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index 8a11cf39f57b..72cb1cc51b87 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -517,8 +517,9 @@ static int direntry_create_vi(struct virtual_node *vn, ((is_affected && (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT)) ? 
insert_size : 0)) { - reiserfs_panic(NULL, - "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item", + reiserfs_panic(NULL, "vs-8025", "(mode==%c, " + "insert_size==%d), invalid length of " + "directory item", vn->vn_mode, insert_size); } } @@ -549,7 +550,8 @@ static int direntry_check_left(struct virtual_item *vi, int free, } if (entries == dir_u->entry_count) { - reiserfs_panic(NULL, "free space %d, entry_count %d\n", free, + reiserfs_panic(NULL, "item_ops-1", + "free space %d, entry_count %d", free, dir_u->entry_count); } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 88a031fafd07..774f3ba37409 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -436,8 +436,8 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller) { #ifdef CONFIG_SMP if (current->lock_depth < 0) { - reiserfs_panic(sb, "%s called without kernel lock held", - caller); + reiserfs_panic(sb, "journal-1", "%s called without kernel " + "lock held", caller); } #else ; @@ -574,7 +574,7 @@ static inline void put_journal_list(struct super_block *s, struct reiserfs_journal_list *jl) { if (jl->j_refcount < 1) { - reiserfs_panic(s, "trans id %u, refcount at %d", + reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d", jl->j_trans_id, jl->j_refcount); } if (--jl->j_refcount == 0) @@ -1416,8 +1416,7 @@ static int flush_journal_list(struct super_block *s, count = 0; if (j_len_saved > journal->j_trans_max) { - reiserfs_panic(s, - "journal-715: flush_journal_list, length is %lu, trans id %lu\n", + reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu", j_len_saved, jl->j_trans_id); return 0; } @@ -1449,8 +1448,8 @@ static int flush_journal_list(struct super_block *s, ** or wait on a more recent transaction, or just ignore it */ if (atomic_read(&(journal->j_wcount)) != 0) { - reiserfs_panic(s, - "journal-844: panic journal list is flushing, wcount is not 0\n"); + reiserfs_panic(s, "journal-844", "journal list is flushing, " + "wcount is not 0"); } cn = jl->j_realblock; while (cn) { @@ -1551,13 +1550,13 @@ static int flush_journal_list(struct super_block *s, while (cn) { if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { if (!cn->bh) { - reiserfs_panic(s, - "journal-1011: cn->bh is NULL\n"); + reiserfs_panic(s, "journal-1011", + "cn->bh is NULL"); } wait_on_buffer(cn->bh); if (!cn->bh) { - reiserfs_panic(s, - "journal-1012: cn->bh is NULL\n"); + reiserfs_panic(s, "journal-1012", + "cn->bh is NULL"); } if (unlikely(!buffer_uptodate(cn->bh))) { #ifdef CONFIG_REISERFS_CHECK @@ -3255,8 +3254,8 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, PROC_INFO_INC(p_s_sb, journal.mark_dirty); if (th->t_trans_id != journal->j_trans_id) { - reiserfs_panic(th->t_super, - "journal-1577: handle trans id %ld != current trans id %ld\n", + reiserfs_panic(th->t_super, "journal-1577", + "handle trans id %ld != current trans id %ld", th->t_trans_id, journal->j_trans_id); } @@ -3295,8 +3294,8 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, ** Nothing can be done here, except make the FS readonly or panic. 
*/ if (journal->j_len >= journal->j_trans_max) { - reiserfs_panic(th->t_super, - "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", + reiserfs_panic(th->t_super, "journal-1413", + "j_len (%lu) is too big", journal->j_len); } @@ -3316,7 +3315,8 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, if (!cn) { cn = get_cnode(p_s_sb); if (!cn) { - reiserfs_panic(p_s_sb, "get_cnode failed!\n"); + reiserfs_panic(p_s_sb, "journal-4", + "get_cnode failed!"); } if (th->t_blocks_logged == th->t_blocks_allocated) { @@ -3584,8 +3584,8 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); if (th->t_trans_id != journal->j_trans_id) { - reiserfs_panic(th->t_super, - "journal-1577: handle trans id %ld != current trans id %ld\n", + reiserfs_panic(th->t_super, "journal-1577", + "handle trans id %ld != current trans id %ld", th->t_trans_id, journal->j_trans_id); } @@ -3664,8 +3664,8 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, } if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { - reiserfs_panic(p_s_sb, - "journal-003: journal_end: j_start (%ld) is too high\n", + reiserfs_panic(p_s_sb, "journal-003", + "j_start (%ld) is too high", journal->j_start); } return 1; @@ -3710,8 +3710,8 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, /* set the bit for this block in the journal bitmap for this transaction */ jb = journal->j_current_jl->j_list_bitmap; if (!jb) { - reiserfs_panic(p_s_sb, - "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n"); + reiserfs_panic(p_s_sb, "journal-1702", + "journal_list_bitmap is NULL"); } set_bit_in_list_bitmap(p_s_sb, blocknr, jb); @@ -4066,8 +4066,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, if (buffer_journaled(cn->bh)) { jl_cn = get_cnode(p_s_sb); if (!jl_cn) { - reiserfs_panic(p_s_sb, - "journal-1676, get_cnode returned NULL\n"); + reiserfs_panic(p_s_sb, "journal-1676", + "get_cnode returned NULL"); } if (i == 0) { jl->j_realblock = jl_cn; @@ -4083,8 +4083,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, if (is_block_in_log_or_reserved_area (p_s_sb, cn->bh->b_blocknr)) { - reiserfs_panic(p_s_sb, - "journal-2332: Trying to log block %lu, which is a log block\n", + reiserfs_panic(p_s_sb, "journal-2332", + "Trying to log block %lu, " + "which is a log block", cn->bh->b_blocknr); } jl_cn->blocknr = cn->bh->b_blocknr; @@ -4268,8 +4269,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, get_list_bitmap(p_s_sb, journal->j_current_jl); if (!(journal->j_current_jl->j_list_bitmap)) { - reiserfs_panic(p_s_sb, - "journal-1996: do_journal_end, could not get a list bitmap\n"); + reiserfs_panic(p_s_sb, "journal-1996", + "could not get a list bitmap"); } atomic_set(&(journal->j_jlock), 0); diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 381339b432e7..67f1d1de213d 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -168,10 +168,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, if (bytes_or_entries == ih_item_len(ih) && is_indirect_le_ih(ih)) if (get_ih_free_space(ih)) - reiserfs_panic(NULL, - "vs-10020: leaf_copy_boundary_item: " - "last unformatted node must be filled entirely (%h)", - ih); + reiserfs_panic(sb_from_bi(dest_bi), + "vs-10020", + "last unformatted node " + "must be filled " + "entirely (%h)", ih); } #endif @@ -622,9 +623,8 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, break; default: - 
reiserfs_panic(NULL, - "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", - shift_mode); + reiserfs_panic(sb_from_bi(src_bi), "vs-10250", + "shift type is unknown (%d)", shift_mode); } RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh, "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", @@ -674,9 +674,9 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) #ifdef CONFIG_REISERFS_CHECK if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { print_cur_tb("vs-10275"); - reiserfs_panic(tb->tb_sb, - "vs-10275: leaf_shift_left: balance condition corrupted (%c)", - tb->tb_mode); + reiserfs_panic(tb->tb_sb, "vs-10275", + "balance condition corrupted " + "(%c)", tb->tb_mode); } #endif @@ -889,9 +889,12 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, #ifdef CONFIG_REISERFS_CHECK if (zeros_number > paste_size) { + struct super_block *sb = NULL; + if (bi && bi->tb) + sb = bi->tb->tb_sb; print_cur_tb("10177"); - reiserfs_panic(NULL, - "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d", + reiserfs_panic(sb, "vs-10177", + "zeros_number == %d, paste_size == %d", zeros_number, paste_size); } #endif /* CONFIG_REISERFS_CHECK */ diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index bb41c6e7c79b..ef41cc882bd9 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -145,10 +145,9 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, if (!is_direntry_le_ih(de->de_ih) || COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) { print_block(de->de_bh, 0, -1, -1); - reiserfs_panic(sb, - "vs-7005: search_by_entry_key: found item %h is not directory item or " - "does not belong to the same directory as key %K", - de->de_ih, key); + reiserfs_panic(sb, "vs-7005", "found item %h is not directory " + "item or does not belong to the same directory " + "as key %K", de->de_ih, key); } #endif /* CONFIG_REISERFS_CHECK */ @@ -1193,15 +1192,14 @@ static int entry_points_to_object(const char *name, int len, if (inode) { if (!de_visible(de->de_deh + de->de_entry_num)) - reiserfs_panic(NULL, - "vs-7042: entry_points_to_object: entry must be visible"); + reiserfs_panic(inode->i_sb, "vs-7042", + "entry must be visible"); return (de->de_objectid == inode->i_ino) ? 
1 : 0; } /* this must be added hidden entry */ if (de_visible(de->de_deh + de->de_entry_num)) - reiserfs_panic(NULL, - "vs-7043: entry_points_to_object: entry must be visible"); + reiserfs_panic(NULL, "vs-7043", "entry must be visible"); return 1; } @@ -1315,8 +1313,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.len, old_inode, 0); if (retval == -EEXIST) { if (!new_dentry_inode) { - reiserfs_panic(old_dir->i_sb, - "vs-7050: new entry is found, new inode == 0\n"); + reiserfs_panic(old_dir->i_sb, "vs-7050", + "new entry is found, new inode == 0"); } } else if (retval) { int err = journal_end(&th, old_dir->i_sb, jbegin_count); diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index a3a5f43ff443..90e4e52f857b 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -18,8 +18,7 @@ static void check_objectid_map(struct super_block *s, __le32 * map) { if (le32_to_cpu(map[0]) != 1) - reiserfs_panic(s, - "vs-15010: check_objectid_map: map corrupted: %lx", + reiserfs_panic(s, "vs-15010", "map corrupted: %lx", (long unsigned int)le32_to_cpu(map[0])); // FIXME: add something else here diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index de71372f0dfe..1964acb6eb17 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -353,14 +353,21 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) extern struct tree_balance *cur_tb; #endif -void reiserfs_panic(struct super_block *sb, const char *fmt, ...) +void __reiserfs_panic(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) { do_reiserfs_warning(fmt); +#ifdef CONFIG_REISERFS_CHECK dump_stack(); - - panic(KERN_EMERG "REISERFS: panic (device %s): %s\n", - reiserfs_bdevname(sb), error_buf); +#endif + if (sb) + panic(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", + sb->s_id, id ? id : "", id ? " " : "", + function, error_buf); + else + panic(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); } void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) 
@@ -681,12 +688,10 @@ static void check_leaf_block_head(struct buffer_head *bh) blkh = B_BLK_HEAD(bh); nr = blkh_nr_item(blkh); if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) - reiserfs_panic(NULL, - "vs-6010: check_leaf_block_head: invalid item number %z", + reiserfs_panic(NULL, "vs-6010", "invalid item number %z", bh); if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr) - reiserfs_panic(NULL, - "vs-6020: check_leaf_block_head: invalid free space %z", + reiserfs_panic(NULL, "vs-6020", "invalid free space %z", bh); } @@ -697,21 +702,15 @@ static void check_internal_block_head(struct buffer_head *bh) blkh = B_BLK_HEAD(bh); if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) - reiserfs_panic(NULL, - "vs-6025: check_internal_block_head: invalid level %z", - bh); + reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) - reiserfs_panic(NULL, - "vs-6030: check_internal_block_head: invalid item number %z", - bh); + reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh); if (B_FREE_SPACE(bh) != bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) - DC_SIZE * (B_NR_ITEMS(bh) + 1)) - reiserfs_panic(NULL, - "vs-6040: check_internal_block_head: invalid free space %z", - bh); + reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh); } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index f328d27a19d5..2de1e309124b 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -366,9 +366,8 @@ inline void decrement_bcount(struct buffer_head *p_s_bh) put_bh(p_s_bh); return; } - reiserfs_panic(NULL, - "PAP-5070: decrement_bcount: trying to free free buffer %b", - p_s_bh); + reiserfs_panic(NULL, "PAP-5070", + "trying to free free buffer %b", p_s_bh); } } @@ -713,8 +712,8 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* #ifdef CONFIG_REISERFS_CHECK if (cur_tb) { print_cur_tb("5140"); - reiserfs_panic(p_s_sb, - "PAP-5140: search_by_key: schedule occurred in do_balance!"); + reiserfs_panic(p_s_sb, "PAP-5140", + "schedule occurred in do_balance!"); } #endif @@ -1511,8 +1510,8 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, /* look for the last byte of the tail */ if (search_for_position_by_key(inode->i_sb, &tail_key, path) == POSITION_NOT_FOUND) - reiserfs_panic(inode->i_sb, - "vs-5615: indirect_to_direct_roll_back: found invalid item"); + reiserfs_panic(inode->i_sb, "vs-5615", + "found invalid item"); RFALSE(path->pos_in_item != ih_item_len(PATH_PITEM_HEAD(path)) - 1, "vs-5616: appended bytes found"); @@ -1612,8 +1611,8 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, print_block(PATH_PLAST_BUFFER(p_s_path), 3, PATH_LAST_POSITION(p_s_path) - 1, PATH_LAST_POSITION(p_s_path) + 1); - reiserfs_panic(p_s_sb, - "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)", + reiserfs_panic(p_s_sb, "PAP-5580", "item to " + "convert does not exist (%K)", p_s_item_key); } continue; @@ -1693,22 +1692,20 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, sure, that we exactly remove last unformatted node pointer of the item */ if (!is_indirect_le_ih(le_ih)) - reiserfs_panic(p_s_sb, - "vs-5652: reiserfs_cut_from_item: " + reiserfs_panic(p_s_sb, "vs-5652", "item must be indirect %h", le_ih); if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) - reiserfs_panic(p_s_sb, - "vs-5653: reiserfs_cut_from_item: " - "completing indirect2direct conversion indirect item %h " - "being deleted must be 
of 4 byte long", - le_ih); + reiserfs_panic(p_s_sb, "vs-5653", "completing " + "indirect2direct conversion indirect " + "item %h being deleted must be of " + "4 byte long", le_ih); if (c_mode == M_CUT && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { - reiserfs_panic(p_s_sb, - "vs-5654: reiserfs_cut_from_item: " - "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)", + reiserfs_panic(p_s_sb, "vs-5654", "can not complete " + "indirect2direct conversion of %h " + "(CUT, insert_size==%d)", le_ih, s_cut_balance.insert_size[0]); } /* it would be useful to make sure, that right neighboring @@ -1923,10 +1920,10 @@ static void check_research_for_paste(struct treepath *path, || op_bytes_number(found_ih, get_last_bh(path)->b_size) != pos_in_item(path)) - reiserfs_panic(NULL, - "PAP-5720: check_research_for_paste: " - "found direct item %h or position (%d) does not match to key %K", - found_ih, pos_in_item(path), p_s_key); + reiserfs_panic(NULL, "PAP-5720", "found direct item " + "%h or position (%d) does not match " + "to key %K", found_ih, + pos_in_item(path), p_s_key); } if (is_indirect_le_ih(found_ih)) { if (le_ih_k_offset(found_ih) + @@ -1935,9 +1932,9 @@ static void check_research_for_paste(struct treepath *path, cpu_key_k_offset(p_s_key) || I_UNFM_NUM(found_ih) != pos_in_item(path) || get_ih_free_space(found_ih) != 0) - reiserfs_panic(NULL, - "PAP-5730: check_research_for_paste: " - "found indirect item (%h) or position (%d) does not match to key (%K)", + reiserfs_panic(NULL, "PAP-5730", "found indirect " + "item (%h) or position (%d) does not " + "match to key (%K)", found_ih, pos_in_item(path), p_s_key); } } diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index 256285dddb20..f8449cb74b53 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -92,8 +92,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, last item of the file */ if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) - reiserfs_panic(sb, - "PAP-14050: direct2indirect: " + reiserfs_panic(sb, "PAP-14050", "direct item (%K) not found", &end_key); p_le_ih = PATH_PITEM_HEAD(path); RFALSE(!is_direct_le_ih(p_le_ih), @@ -214,8 +213,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in /* re-search indirect item */ if (search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND) - reiserfs_panic(p_s_sb, - "PAP-5520: indirect2direct: " + reiserfs_panic(p_s_sb, "PAP-5520", "item to be converted %K does not exist", p_s_item_key); copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); @@ -224,8 +222,8 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize; if (pos != pos1) - reiserfs_panic(p_s_sb, "vs-5530: indirect2direct: " - "tail position changed while we were reading it"); + reiserfs_panic(p_s_sb, "vs-5530", "tail position " + "changed while we were reading it"); #endif } diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index cf5407ee0f32..04bfd61eeaaa 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -86,11 +86,14 @@ void __reiserfs_warning(struct super_block *s, const char *id, /* assertions handling */ /** always check a condition and panic if it's false. */ -#define __RASSERT( cond, scond, format, args... 
) \ -if( !( cond ) ) \ - reiserfs_panic( NULL, "reiserfs[%i]: assertion " scond " failed at " \ - __FILE__ ":%i:%s: " format "\n", \ - in_interrupt() ? -1 : task_pid_nr(current), __LINE__ , __func__ , ##args ) +#define __RASSERT(cond, scond, format, args...) \ +do { \ + if (!(cond)) \ + reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \ + __FILE__ ":%i:%s: " format "\n", \ + in_interrupt() ? -1 : task_pid_nr(current), \ + __LINE__, __func__ , ##args); \ +} while (0) #define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args) @@ -1448,6 +1451,16 @@ struct buffer_info { int bi_position; }; +static inline struct super_block *sb_from_tb(struct tree_balance *tb) +{ + return tb ? tb->tb_sb : NULL; +} + +static inline struct super_block *sb_from_bi(struct buffer_info *bi) +{ + return bi ? sb_from_tb(bi->tb) : NULL; +} + /* there are 4 types of items: stat data, directory item, indirect, direct. +-------------------+------------+--------------+------------+ | | k_offset | k_uniqueness | mergeable? | @@ -1988,8 +2001,11 @@ int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, void unfix_nodes(struct tree_balance *); /* prints.c */ -void reiserfs_panic(struct super_block *s, const char *fmt, ...) +void __reiserfs_panic(struct super_block *s, const char *id, + const char *function, const char *fmt, ...) __attribute__ ((noreturn)); +#define reiserfs_panic(s, id, fmt, args...) \ + __reiserfs_panic(s, id, __func__, fmt, ##args) void reiserfs_info(struct super_block *s, const char *fmt, ...); void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...); void print_indirect_item(struct buffer_head *bh, int item_num); -- cgit v1.2.3-71-gd317 From 32e8b1062915d00d07d3b88a95174648e369b6a3 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:26 -0400 Subject: reiserfs: rearrange journal abort This patch kills off reiserfs_journal_abort as it is never called, and combines __reiserfs_journal_abort_{soft,hard} into one function called reiserfs_abort_journal, which performs the same work. It is silent as opposed to the old version, since the message was always issued after a regular 'abort' message. 
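As a rough sketch of the result (the journal.c hunk below is authoritative), the two helpers collapse into one function that records the error, marks the super block read-only and flags the journal as aborted, without printing anything itself:

	/* combined replacement for __reiserfs_journal_abort_{soft,hard} */
	void reiserfs_abort_journal(struct super_block *sb, int errno)
	{
		struct reiserfs_journal *journal = SB_JOURNAL(sb);
		if (test_bit(J_ABORTED, &journal->j_state))
			return;
		if (!journal->j_errno)
			journal->j_errno = errno;
		sb->s_flags |= MS_RDONLY;
		set_bit(J_ABORTED, &journal->j_state);
	}
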
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 23 ++++------------------- fs/reiserfs/prints.c | 2 +- include/linux/reiserfs_fs.h | 2 +- 3 files changed, 6 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 774f3ba37409..db91754cfb83 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -4295,14 +4295,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, return journal->j_errno; } -static void __reiserfs_journal_abort_hard(struct super_block *sb) +/* Send the file system read only and refuse new transactions */ +void reiserfs_abort_journal(struct super_block *sb, int errno) { struct reiserfs_journal *journal = SB_JOURNAL(sb); if (test_bit(J_ABORTED, &journal->j_state)) return; - printk(KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n", - reiserfs_bdevname(sb)); + if (!journal->j_errno) + journal->j_errno = errno; sb->s_flags |= MS_RDONLY; set_bit(J_ABORTED, &journal->j_state); @@ -4312,19 +4313,3 @@ static void __reiserfs_journal_abort_hard(struct super_block *sb) #endif } -static void __reiserfs_journal_abort_soft(struct super_block *sb, int errno) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - if (test_bit(J_ABORTED, &journal->j_state)) - return; - - if (!journal->j_errno) - journal->j_errno = errno; - - __reiserfs_journal_abort_hard(sb); -} - -void reiserfs_journal_abort(struct super_block *sb, int errno) -{ - __reiserfs_journal_abort_soft(sb, errno); -} diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 1964acb6eb17..84f3f69652e3 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -386,7 +386,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) error_buf); sb->s_flags |= MS_RDONLY; - reiserfs_journal_abort(sb, errno); + reiserfs_abort_journal(sb, errno); } /* this prints internal nodes (4 keys/items in line) (dc_number, diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 04bfd61eeaaa..d097966bbd91 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1771,7 +1771,7 @@ int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long); int journal_join_abort(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long); -void reiserfs_journal_abort(struct super_block *sb, int errno); +void reiserfs_abort_journal(struct super_block *sb, int errno); void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, unsigned int); -- cgit v1.2.3-71-gd317 From 1e5e59d431038c53954fe8f0b38bee0f0ad30349 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:27 -0400 Subject: reiserfs: introduce reiserfs_error() Although reiserfs can currently handle severe errors such as journal failure, it cannot handle less severe errors like metadata i/o failure. The following patch adds a reiserfs_error() function akin to the one in ext3. Subsequent patches will use this new error handler to handle errors more gracefully in general. 
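For example, a caller that hits a failed metadata read could report it and let the handler decide between panicking and remounting read-only. This call site is hypothetical (the id "example-1" and the surrounding check are only illustrative; the real conversions come in later patches):

	/* hypothetical: report a metadata read failure instead of panicking */
	if (!buffer_uptodate(bh))
		reiserfs_error(sb, "example-1", "unable to read block %lu",
			       (unsigned long)bh->b_blocknr);
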
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/prints.c | 25 +++++++++++++++++++++++++ include/linux/reiserfs_fs.h | 4 ++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 84f3f69652e3..8e826c07cd21 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -370,6 +370,31 @@ void __reiserfs_panic(struct super_block *sb, const char *id, id ? id : "", id ? " " : "", function, error_buf); } +void __reiserfs_error(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + + BUG_ON(sb == NULL); + + if (reiserfs_error_panic(sb)) + __reiserfs_panic(sb, id, function, error_buf); + + if (id && id[0]) + printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n", + sb->s_id, id, function, error_buf); + else + printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + if (sb->s_flags & MS_RDONLY) + return; + + reiserfs_info(sb, "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + reiserfs_abort_journal(sb, -EIO); +} + void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) { do_reiserfs_warning(fmt); diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index d097966bbd91..6c4af98b6767 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -2006,6 +2006,10 @@ void __reiserfs_panic(struct super_block *s, const char *id, __attribute__ ((noreturn)); #define reiserfs_panic(s, id, fmt, args...) \ __reiserfs_panic(s, id, __func__, fmt, ##args) +void __reiserfs_error(struct super_block *s, const char *id, + const char *function, const char *fmt, ...); +#define reiserfs_error(s, id, fmt, args...) \ + __reiserfs_error(s, id, __func__, fmt, ##args) void reiserfs_info(struct super_block *s, const char *fmt, ...); void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...); void print_indirect_item(struct buffer_head *bh, int item_num); -- cgit v1.2.3-71-gd317 From 6dfede696391133eadd7ce90b61c9573ee6e5a90 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:32 -0400 Subject: reiserfs: remove IS_PRIVATE helpers There are a number of helper functions for marking a reiserfs inode private that were leftover from reiserfs did its own thing wrt to private inodes. S_PRIVATE has been in the kernel for some time, so this patch removes the helpers and uses IS_PRIVATE instead. 
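In practice the conversion is mechanical, as in the inode.c and namei.c hunks below:

	/* before: reiserfs-private helpers */
	if (is_reiserfs_priv_object(dir))
		reiserfs_mark_inode_private(inode);

	/* after: the generic VFS flag */
	if (IS_PRIVATE(dir))
		inode->i_flags |= S_PRIVATE;
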
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 5 ++--- fs/reiserfs/namei.c | 7 ++++--- fs/reiserfs/xattr.c | 14 ++++++-------- fs/reiserfs/xattr_acl.c | 6 +++--- fs/reiserfs/xattr_security.c | 8 ++++---- fs/reiserfs/xattr_trusted.c | 8 ++++---- include/linux/reiserfs_xattr.h | 8 -------- 7 files changed, 23 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index fab0373ad6e3..cd42a8658086 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1927,9 +1927,8 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, reiserfs_warning(inode->i_sb, "jdm-13090", "ACLs aren't enabled in the fs, " "but vfs thinks they are!"); - } else if (is_reiserfs_priv_object(dir)) { - reiserfs_mark_inode_private(inode); - } + } else if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; reiserfs_update_sd(th, inode); reiserfs_check_path(&path_to_key); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 3ce3f8b1690d..c8430f1c824f 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -358,9 +358,10 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-EACCES); } - /* Propogate the priv_object flag so we know we're in the priv tree */ - if (is_reiserfs_priv_object(dir)) - reiserfs_mark_inode_private(inode); + /* Propagate the private flag so we know we're + * in the priv tree */ + if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; } reiserfs_write_unlock(dir->i_sb); if (retval == IO_ERROR) { diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 3e9e82ca3ba2..c5fc207e529c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -633,14 +633,14 @@ __reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) if (S_ISDIR(dentry->d_inode->i_mode)) goto out_file; - if (!is_reiserfs_priv_object(dentry->d_inode)) { + if (!IS_PRIVATE(dentry->d_inode)) { reiserfs_error(dir->i_sb, "jdm-20003", "OID %08x [%.*s/%.*s] doesn't have " "priv flag set [parent is %sset].", le32_to_cpu(INODE_PKEY(dentry->d_inode)-> k_objectid), xadir->d_name.len, xadir->d_name.name, namelen, name, - is_reiserfs_priv_object(xadir->d_inode) ? "" : + IS_PRIVATE(xadir->d_inode) ? 
"" : "not "); dput(dentry); return -EIO; @@ -701,8 +701,7 @@ int reiserfs_delete_xattrs(struct inode *inode) int err = 0; /* Skip out, an xattr has no xattrs associated with it */ - if (is_reiserfs_priv_object(inode) || - get_inode_sd_version(inode) == STAT_DATA_V1 || + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1 || !reiserfs_xattrs(inode->i_sb)) { return 0; } @@ -786,8 +785,7 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) unsigned int ia_valid = attrs->ia_valid; /* Skip out, an xattr has no xattrs associated with it */ - if (is_reiserfs_priv_object(inode) || - get_inode_sd_version(inode) == STAT_DATA_V1 || + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1 || !reiserfs_xattrs(inode->i_sb)) { return 0; } @@ -1178,7 +1176,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (!err && dentry) { s->s_root->d_op = &xattr_lookup_poison_ops; - reiserfs_mark_inode_private(dentry->d_inode); + dentry->d_inode->i_flags |= S_PRIVATE; REISERFS_SB(s)->priv_root = dentry; } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ /* If we're read-only it just means that the dir hasn't been @@ -1239,7 +1237,7 @@ int reiserfs_permission(struct inode *inode, int mask) * We don't do permission checks on the internal objects. * Permissions are determined by the "owning" object. */ - if (is_reiserfs_priv_object(inode)) + if (IS_PRIVATE(inode)) return 0; /* diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index b7e4fa4539de..9128e4d5ba64 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -335,8 +335,8 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This * would be useless since permissions are ignored, and a pain because * it introduces locking cycles */ - if (is_reiserfs_priv_object(dir)) { - reiserfs_mark_inode_private(inode); + if (IS_PRIVATE(dir)) { + inode->i_flags |= S_PRIVATE; goto apply_umask; } @@ -401,7 +401,7 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, int reiserfs_cache_default_acl(struct inode *inode) { int ret = 0; - if (reiserfs_posixacl(inode->i_sb) && !is_reiserfs_priv_object(inode)) { + if (reiserfs_posixacl(inode->i_sb) && !IS_PRIVATE(inode)) { struct posix_acl *acl; reiserfs_read_lock_xattr_i(inode); reiserfs_read_lock_xattrs(inode->i_sb); diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 056008db1377..1958b361c35d 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -12,7 +12,7 @@ security_get(struct inode *inode, const char *name, void *buffer, size_t size) if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (is_reiserfs_priv_object(inode)) + if (IS_PRIVATE(inode)) return -EPERM; return reiserfs_xattr_get(inode, name, buffer, size); @@ -25,7 +25,7 @@ security_set(struct inode *inode, const char *name, const void *buffer, if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (is_reiserfs_priv_object(inode)) + if (IS_PRIVATE(inode)) return -EPERM; return reiserfs_xattr_set(inode, name, buffer, size, flags); @@ -36,7 +36,7 @@ static int security_del(struct inode *inode, const char *name) if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (is_reiserfs_priv_object(inode)) + if (IS_PRIVATE(inode)) return -EPERM; return 0; @@ -47,7 +47,7 @@ security_list(struct inode *inode, const char *name, int namelen, char *out) { int len = namelen; - if 
(is_reiserfs_priv_object(inode)) + if (IS_PRIVATE(inode)) return 0; if (out) diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 60abe2bb1f98..076ad388d489 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -16,7 +16,7 @@ trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) if (!reiserfs_xattrs(inode->i_sb)) return -EOPNOTSUPP; - if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + if (!(capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))) return -EPERM; return reiserfs_xattr_get(inode, name, buffer, size); @@ -32,7 +32,7 @@ trusted_set(struct inode *inode, const char *name, const void *buffer, if (!reiserfs_xattrs(inode->i_sb)) return -EOPNOTSUPP; - if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + if (!(capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))) return -EPERM; return reiserfs_xattr_set(inode, name, buffer, size, flags); @@ -46,7 +46,7 @@ static int trusted_del(struct inode *inode, const char *name) if (!reiserfs_xattrs(inode->i_sb)) return -EOPNOTSUPP; - if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + if (!(capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))) return -EPERM; return 0; @@ -60,7 +60,7 @@ trusted_list(struct inode *inode, const char *name, int namelen, char *out) if (!reiserfs_xattrs(inode->i_sb)) return 0; - if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + if (!(capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))) return 0; if (out) diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index af135ae895db..58f32ba7f5a0 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -44,7 +44,6 @@ struct reiserfs_xattr_handler { }; #ifdef CONFIG_REISERFS_FS_XATTR -#define is_reiserfs_priv_object(inode) IS_PRIVATE(inode) #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size); @@ -104,11 +103,6 @@ static inline void reiserfs_read_unlock_xattr_i(struct inode *inode) up_read(&REISERFS_I(inode)->xattr_sem); } -static inline void reiserfs_mark_inode_private(struct inode *inode) -{ - inode->i_flags |= S_PRIVATE; -} - static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { init_rwsem(&REISERFS_I(inode)->xattr_sem); @@ -116,8 +110,6 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode) #else -#define is_reiserfs_priv_object(inode) 0 -#define reiserfs_mark_inode_private(inode) do {;} while(0) #define reiserfs_getxattr NULL #define reiserfs_setxattr NULL #define reiserfs_listxattr NULL -- cgit v1.2.3-71-gd317 From a72bdb1cd244725ff47b3a29662e2cb820d8c60f Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:33 -0400 Subject: reiserfs: Clean up xattrs when REISERFS_FS_XATTR is unset The current reiserfs xattr implementation will not clean up old xattr files if files are deleted when REISERFS_FS_XATTR is unset. This results in inaccessible lost files, wasting space. This patch compiles in basic xattr knowledge, such as how to delete them and change ownership for quota tracking. If the file system has never used xattrs, then the operation is quite fast: it returns immediately when it sees there is no .reiserfs_priv directory. 
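The fast path mentioned above amounts to an early return when the private directory does not exist; a sketch of the relevant check, taken from the xattr.c hunk below:

	/* from reiserfs_delete_xattrs(): skip quickly if there is no xattr dir */
	dir = open_xa_dir(inode, FL_READONLY);
	if (IS_ERR(dir)) {
		err = PTR_ERR(dir);
		goto out;
	} else if (!dir->d_inode) {
		/* no .reiserfs_priv directory: nothing to clean up */
		dput(dir);
		return 0;
	}
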
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/Makefile | 4 +- fs/reiserfs/xattr.c | 801 +++++++++++++++++++++-------------------- include/linux/reiserfs_fs_sb.h | 2 +- include/linux/reiserfs_xattr.h | 29 +- 4 files changed, 423 insertions(+), 413 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile index 0eb7ac080484..7c5ab6330dd6 100644 --- a/fs/reiserfs/Makefile +++ b/fs/reiserfs/Makefile @@ -7,10 +7,10 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ hashes.o tail_conversion.o journal.o resize.o \ - item_ops.o ioctl.o procfs.o + item_ops.o ioctl.o procfs.o xattr.o ifeq ($(CONFIG_REISERFS_FS_XATTR),y) -reiserfs-objs += xattr.o xattr_user.o xattr_trusted.o +reiserfs-objs += xattr_user.o xattr_trusted.o endif ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index c5fc207e529c..f9bcdd5750f7 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -50,9 +50,6 @@ #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" -static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char - *prefix); - /* Returns the dentry referring to the root of the extended attribute * directory tree. If it has already been retrieved, it is used. If it * hasn't been created and the flags indicate creation is allowed, we @@ -143,60 +140,6 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) return xadir; } -/* Returns a dentry corresponding to a specific extended attribute file - * for the inode. If flags allow, the file is created. Otherwise, a - * valid or negative dentry, or an error is returned. */ -static struct dentry *get_xa_file_dentry(const struct inode *inode, - const char *name, int flags) -{ - struct dentry *xadir, *xafile; - int err = 0; - - xadir = open_xa_dir(inode, flags); - if (IS_ERR(xadir)) { - return ERR_CAST(xadir); - } else if (!xadir->d_inode) { - dput(xadir); - return ERR_PTR(-ENODATA); - } - - xafile = lookup_one_len(name, xadir, strlen(name)); - if (IS_ERR(xafile)) { - dput(xadir); - return ERR_CAST(xafile); - } - - if (xafile->d_inode) { /* file exists */ - if (flags & XATTR_CREATE) { - err = -EEXIST; - dput(xafile); - goto out; - } - } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { - goto out; - } else { - /* inode->i_mutex is down, so nothing else can try to create - * the same xattr */ - err = xadir->d_inode->i_op->create(xadir->d_inode, xafile, - 0700 | S_IFREG, NULL); - - if (err) { - dput(xafile); - goto out; - } - } - - out: - dput(xadir); - if (err) - xafile = ERR_PTR(err); - else if (!xafile->d_inode) { - dput(xafile); - xafile = ERR_PTR(-ENODATA); - } - return xafile; -} - /* * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but * we need to drop the path before calling the filldir struct. That @@ -369,6 +312,251 @@ int xattr_readdir(struct inode *inode, filldir_t filler, void *buf) return res; } +static int +__reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) +{ + struct dentry *dentry; + struct inode *dir = xadir->d_inode; + int err = 0; + + dentry = lookup_one_len(name, xadir, namelen); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } else if (!dentry->d_inode) { + err = -ENODATA; + goto out_file; + } + + /* Skip directories.. 
*/ + if (S_ISDIR(dentry->d_inode->i_mode)) + goto out_file; + + if (!IS_PRIVATE(dentry->d_inode)) { + reiserfs_error(dir->i_sb, "jdm-20003", + "OID %08x [%.*s/%.*s] doesn't have " + "priv flag set [parent is %sset].", + le32_to_cpu(INODE_PKEY(dentry->d_inode)-> + k_objectid), xadir->d_name.len, + xadir->d_name.name, namelen, name, + IS_PRIVATE(xadir->d_inode) ? "" : + "not "); + dput(dentry); + return -EIO; + } + + err = dir->i_op->unlink(dir, dentry); + if (!err) + d_delete(dentry); + +out_file: + dput(dentry); + +out: + return err; +} + +/* The following are side effects of other operations that aren't explicitly + * modifying extended attributes. This includes operations such as permissions + * or ownership changes, object deletions, etc. */ + +static int +reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct dentry *xadir = (struct dentry *)buf; + + return __reiserfs_xattr_del(xadir, name, namelen); + +} + +/* This is called w/ inode->i_mutex downed */ +int reiserfs_delete_xattrs(struct inode *inode) +{ + struct dentry *dir, *root; + int err = 0; + + /* Skip out, an xattr has no xattrs associated with it */ + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) + return 0; + + reiserfs_read_lock_xattrs(inode->i_sb); + dir = open_xa_dir(inode, FL_READONLY); + reiserfs_read_unlock_xattrs(inode->i_sb); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } else if (!dir->d_inode) { + dput(dir); + return 0; + } + + lock_kernel(); + err = xattr_readdir(dir->d_inode, reiserfs_delete_xattrs_filler, dir); + if (err) { + unlock_kernel(); + goto out_dir; + } + + /* Leftovers besides . and .. -- that's not good. */ + if (dir->d_inode->i_nlink <= 2) { + root = get_xa_root(inode->i_sb, XATTR_REPLACE); + reiserfs_write_lock_xattrs(inode->i_sb); + err = vfs_rmdir(root->d_inode, dir); + reiserfs_write_unlock_xattrs(inode->i_sb); + dput(root); + } else { + reiserfs_warning(inode->i_sb, "jdm-20006", + "Couldn't remove all entries in directory"); + } + unlock_kernel(); + +out_dir: + dput(dir); + +out: + if (!err) + REISERFS_I(inode)->i_flags = + REISERFS_I(inode)->i_flags & ~i_has_xattr_dir; + return err; +} + +struct reiserfs_chown_buf { + struct inode *inode; + struct dentry *xadir; + struct iattr *attrs; +}; + +/* XXX: If there is a better way to do this, I'd love to hear about it */ +static int +reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; + struct dentry *xafile, *xadir = chown_buf->xadir; + struct iattr *attrs = chown_buf->attrs; + int err = 0; + + xafile = lookup_one_len(name, xadir, namelen); + if (IS_ERR(xafile)) + return PTR_ERR(xafile); + else if (!xafile->d_inode) { + dput(xafile); + return -ENODATA; + } + + if (!S_ISDIR(xafile->d_inode->i_mode)) + err = notify_change(xafile, attrs); + dput(xafile); + + return err; +} + +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) +{ + struct dentry *dir; + int err = 0; + struct reiserfs_chown_buf buf; + unsigned int ia_valid = attrs->ia_valid; + + /* Skip out, an xattr has no xattrs associated with it */ + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) + return 0; + + reiserfs_read_lock_xattrs(inode->i_sb); + dir = open_xa_dir(inode, FL_READONLY); + reiserfs_read_unlock_xattrs(inode->i_sb); + if (IS_ERR(dir)) { + if (PTR_ERR(dir) != -ENODATA) + err = PTR_ERR(dir); + 
goto out; + } else if (!dir->d_inode) { + dput(dir); + goto out; + } + + lock_kernel(); + + attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); + buf.xadir = dir; + buf.attrs = attrs; + buf.inode = inode; + + err = xattr_readdir(dir->d_inode, reiserfs_chown_xattrs_filler, &buf); + if (err) { + unlock_kernel(); + goto out_dir; + } + + err = notify_change(dir, attrs); + unlock_kernel(); + +out_dir: + dput(dir); + +out: + attrs->ia_valid = ia_valid; + return err; +} + +#ifdef CONFIG_REISERFS_FS_XATTR +static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char + *prefix); + +/* Returns a dentry corresponding to a specific extended attribute file + * for the inode. If flags allow, the file is created. Otherwise, a + * valid or negative dentry, or an error is returned. */ +static struct dentry *get_xa_file_dentry(const struct inode *inode, + const char *name, int flags) +{ + struct dentry *xadir, *xafile; + int err = 0; + + xadir = open_xa_dir(inode, flags); + if (IS_ERR(xadir)) { + return ERR_CAST(xadir); + } else if (xadir && !xadir->d_inode) { + dput(xadir); + return ERR_PTR(-ENODATA); + } + + xafile = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(xafile)) { + dput(xadir); + return ERR_CAST(xafile); + } + + if (xafile->d_inode) { /* file exists */ + if (flags & XATTR_CREATE) { + err = -EEXIST; + dput(xafile); + goto out; + } + } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { + goto out; + } else { + /* inode->i_mutex is down, so nothing else can try to create + * the same xattr */ + err = xadir->d_inode->i_op->create(xadir->d_inode, xafile, + 0700 | S_IFREG, NULL); + + if (err) { + dput(xafile); + goto out; + } + } + +out: + dput(xadir); + if (err) + xafile = ERR_PTR(err); + else if (!xafile->d_inode) { + dput(xafile); + xafile = ERR_PTR(-ENODATA); + } + return xafile; +} + /* Internal operations on file data */ static inline void reiserfs_put_page(struct page *page) { @@ -554,274 +742,85 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, goto out_dput; } - while (file_pos < isize) { - size_t chunk; - char *data; - size_t skip = 0; - if (isize - file_pos > PAGE_CACHE_SIZE) - chunk = PAGE_CACHE_SIZE; - else - chunk = isize - file_pos; - - page = reiserfs_get_page(dentry->d_inode, file_pos); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto out_dput; - } - - lock_page(page); - data = page_address(page); - if (file_pos == 0) { - struct reiserfs_xattr_header *rxh = - (struct reiserfs_xattr_header *)data; - skip = file_pos = sizeof(struct reiserfs_xattr_header); - chunk -= skip; - /* Magic doesn't match up.. 
*/ - if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { - unlock_page(page); - reiserfs_put_page(page); - reiserfs_warning(inode->i_sb, "jdm-20001", - "Invalid magic for xattr (%s) " - "associated with %k", name, - INODE_PKEY(inode)); - err = -EIO; - goto out_dput; - } - hash = le32_to_cpu(rxh->h_hash); - } - memcpy(buffer + buffer_pos, data + skip, chunk); - unlock_page(page); - reiserfs_put_page(page); - file_pos += chunk; - buffer_pos += chunk; - skip = 0; - } - err = isize - sizeof(struct reiserfs_xattr_header); - - if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != - hash) { - reiserfs_warning(inode->i_sb, "jdm-20002", - "Invalid hash for xattr (%s) associated " - "with %k", name, INODE_PKEY(inode)); - err = -EIO; - } - - out_dput: - dput(dentry); - - out: - return err; -} - -static int -__reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) -{ - struct dentry *dentry; - struct inode *dir = xadir->d_inode; - int err = 0; - - dentry = lookup_one_len(name, xadir, namelen); - if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); - goto out; - } else if (!dentry->d_inode) { - err = -ENODATA; - goto out_file; - } - - /* Skip directories.. */ - if (S_ISDIR(dentry->d_inode->i_mode)) - goto out_file; - - if (!IS_PRIVATE(dentry->d_inode)) { - reiserfs_error(dir->i_sb, "jdm-20003", - "OID %08x [%.*s/%.*s] doesn't have " - "priv flag set [parent is %sset].", - le32_to_cpu(INODE_PKEY(dentry->d_inode)-> - k_objectid), xadir->d_name.len, - xadir->d_name.name, namelen, name, - IS_PRIVATE(xadir->d_inode) ? "" : - "not "); - dput(dentry); - return -EIO; - } - - err = dir->i_op->unlink(dir, dentry); - if (!err) - d_delete(dentry); - - out_file: - dput(dentry); - - out: - return err; -} - -int reiserfs_xattr_del(struct inode *inode, const char *name) -{ - struct dentry *dir; - int err; - - dir = open_xa_dir(inode, FL_READONLY); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; - } - - err = __reiserfs_xattr_del(dir, name, strlen(name)); - dput(dir); - - if (!err) { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - } - - out: - return err; -} - -/* The following are side effects of other operations that aren't explicitly - * modifying extended attributes. This includes operations such as permissions - * or ownership changes, object deletions, etc. */ - -static int -reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) -{ - struct dentry *xadir = (struct dentry *)buf; - - return __reiserfs_xattr_del(xadir, name, namelen); - -} - -/* This is called w/ inode->i_mutex downed */ -int reiserfs_delete_xattrs(struct inode *inode) -{ - struct dentry *dir, *root; - int err = 0; - - /* Skip out, an xattr has no xattrs associated with it */ - if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1 || - !reiserfs_xattrs(inode->i_sb)) { - return 0; - } - reiserfs_read_lock_xattrs(inode->i_sb); - dir = open_xa_dir(inode, FL_READONLY); - reiserfs_read_unlock_xattrs(inode->i_sb); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; - } else if (!dir->d_inode) { - dput(dir); - return 0; - } - - lock_kernel(); - err = xattr_readdir(dir->d_inode, reiserfs_delete_xattrs_filler, dir); - if (err) { - unlock_kernel(); - goto out_dir; - } - - /* Leftovers besides . and .. -- that's not good. 
*/ - if (dir->d_inode->i_nlink <= 2) { - root = get_xa_root(inode->i_sb, XATTR_REPLACE); - reiserfs_write_lock_xattrs(inode->i_sb); - err = vfs_rmdir(root->d_inode, dir); - reiserfs_write_unlock_xattrs(inode->i_sb); - dput(root); - } else { - reiserfs_warning(inode->i_sb, "jdm-20006", - "Couldn't remove all entries in directory"); - } - unlock_kernel(); - - out_dir: - dput(dir); - - out: - if (!err) - REISERFS_I(inode)->i_flags = - REISERFS_I(inode)->i_flags & ~i_has_xattr_dir; - return err; -} + while (file_pos < isize) { + size_t chunk; + char *data; + size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = isize - file_pos; -struct reiserfs_chown_buf { - struct inode *inode; - struct dentry *xadir; - struct iattr *attrs; -}; + page = reiserfs_get_page(dentry->d_inode, file_pos); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_dput; + } -/* XXX: If there is a better way to do this, I'd love to hear about it */ -static int -reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) -{ - struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; - struct dentry *xafile, *xadir = chown_buf->xadir; - struct iattr *attrs = chown_buf->attrs; - int err = 0; + lock_page(page); + data = page_address(page); + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh = + (struct reiserfs_xattr_header *)data; + skip = file_pos = sizeof(struct reiserfs_xattr_header); + chunk -= skip; + /* Magic doesn't match up.. */ + if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { + unlock_page(page); + reiserfs_put_page(page); + reiserfs_warning(inode->i_sb, "jdm-20001", + "Invalid magic for xattr (%s) " + "associated with %k", name, + INODE_PKEY(inode)); + err = -EIO; + goto out_dput; + } + hash = le32_to_cpu(rxh->h_hash); + } + memcpy(buffer + buffer_pos, data + skip, chunk); + unlock_page(page); + reiserfs_put_page(page); + file_pos += chunk; + buffer_pos += chunk; + skip = 0; + } + err = isize - sizeof(struct reiserfs_xattr_header); - xafile = lookup_one_len(name, xadir, namelen); - if (IS_ERR(xafile)) - return PTR_ERR(xafile); - else if (!xafile->d_inode) { - dput(xafile); - return -ENODATA; + if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != + hash) { + reiserfs_warning(inode->i_sb, "jdm-20002", + "Invalid hash for xattr (%s) associated " + "with %k", name, INODE_PKEY(inode)); + err = -EIO; } - if (!S_ISDIR(xafile->d_inode->i_mode)) - err = notify_change(xafile, attrs); - dput(xafile); +out_dput: + dput(dentry); +out: return err; } -int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) +int reiserfs_xattr_del(struct inode *inode, const char *name) { struct dentry *dir; - int err = 0; - struct reiserfs_chown_buf buf; - unsigned int ia_valid = attrs->ia_valid; + int err; - /* Skip out, an xattr has no xattrs associated with it */ - if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1 || - !reiserfs_xattrs(inode->i_sb)) { - return 0; - } - reiserfs_read_lock_xattrs(inode->i_sb); dir = open_xa_dir(inode, FL_READONLY); - reiserfs_read_unlock_xattrs(inode->i_sb); if (IS_ERR(dir)) { - if (PTR_ERR(dir) != -ENODATA) - err = PTR_ERR(dir); - goto out; - } else if (!dir->d_inode) { - dput(dir); + err = PTR_ERR(dir); goto out; } - lock_kernel(); - - attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); - buf.xadir = dir; - buf.attrs = attrs; - buf.inode = inode; + err = __reiserfs_xattr_del(dir, name, strlen(name)); + dput(dir); - err = 
xattr_readdir(dir->d_inode, reiserfs_chown_xattrs_filler, &buf); - if (err) { - unlock_kernel(); - goto out_dir; + if (!err) { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); } - err = notify_change(dir, attrs); - unlock_kernel(); - - out_dir: - dput(dir); - out: - attrs->ia_valid = ia_valid; return err; } @@ -1101,6 +1100,94 @@ void reiserfs_xattr_unregister_handlers(void) write_unlock(&handler_lock); } +static int reiserfs_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl; + int error = -EAGAIN; /* do regular unix permission checks by default */ + + reiserfs_read_lock_xattr_i(inode); + reiserfs_read_lock_xattrs(inode->i_sb); + + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_read_unlock_xattr_i(inode); + + if (acl) { + if (!IS_ERR(acl)) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } else if (PTR_ERR(acl) != -ENODATA) + error = PTR_ERR(acl); + } + + return error; +} + +int reiserfs_permission(struct inode *inode, int mask) +{ + /* + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. + */ + if (IS_PRIVATE(inode)) + return 0; + /* + * Stat data v1 doesn't support ACLs. + */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return generic_permission(inode, mask, NULL); + else + return generic_permission(inode, mask, reiserfs_check_acl); +} + +static int create_privroot(struct dentry *dentry) +{ + int err; + struct inode *inode = dentry->d_parent->d_inode; + mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR); + err = inode->i_op->mkdir(inode, dentry, 0700); + mutex_unlock(&inode->i_mutex); + if (err) { + dput(dentry); + dentry = NULL; + } + + if (dentry && dentry->d_inode) + reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " + "storage.\n", PRIVROOT_NAME); + + return err; +} + +static int xattr_mount_check(struct super_block *s) +{ + /* We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. */ + if (!old_format_only(s)) { + set_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } else if (reiserfs_xattrs_optional(s)) { + /* Old format filesystem, but optional xattrs have been enabled + * at mount time. Error out. */ + reiserfs_warning(s, "jdm-20005", + "xattrs/ACLs not supported on pre v3.6 " + "format filesystem. Failing mount."); + return -EOPNOTSUPP; + } else { + /* Old format filesystem, but no optional xattrs have + * been enabled. This means we silently disable xattrs + * on the filesystem. */ + clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } + + return 0; +} + +#else +int __init reiserfs_xattr_register_handlers(void) { return 0; } +void reiserfs_xattr_unregister_handlers(void) {} +#endif + /* This will catch lookups from the fs root to .reiserfs_priv */ static int xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) @@ -1127,47 +1214,23 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) { int err = 0; - /* We need generation numbers to ensure that the oid mapping is correct - * v3.5 filesystems don't have them. */ - if (!old_format_only(s)) { - set_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); - } else if (reiserfs_xattrs_optional(s)) { - /* Old format filesystem, but optional xattrs have been enabled - * at mount time. Error out. */ - reiserfs_warning(s, "jdm-20005", - "xattrs/ACLs not supported on pre v3.6 " - "format filesystem. 
Failing mount."); - err = -EOPNOTSUPP; +#ifdef CONFIG_REISERFS_FS_XATTR + err = xattr_mount_check(s); + if (err) goto error; - } else { - /* Old format filesystem, but no optional xattrs have been enabled. This - * means we silently disable xattrs on the filesystem. */ - clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); - } +#endif /* If we don't have the privroot located yet - go find it */ - if (reiserfs_xattrs(s) && !REISERFS_SB(s)->priv_root) { + if (!REISERFS_SB(s)->priv_root) { struct dentry *dentry; dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { - if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { - struct inode *inode = dentry->d_parent->d_inode; - mutex_lock_nested(&inode->i_mutex, - I_MUTEX_XATTR); - err = inode->i_op->mkdir(inode, dentry, 0700); - mutex_unlock(&inode->i_mutex); - if (err) { - dput(dentry); - dentry = NULL; - } - - if (dentry && dentry->d_inode) - reiserfs_info(s, "Created %s - " - "reserved for xattr " - "storage.\n", - PRIVROOT_NAME); - } else if (!dentry->d_inode) { +#ifdef CONFIG_REISERFS_FS_XATTR + if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) + err = create_privroot(dentry); +#endif + if (!dentry->d_inode) { dput(dentry); dentry = NULL; } @@ -1178,73 +1241,37 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) s->s_root->d_op = &xattr_lookup_poison_ops; dentry->d_inode->i_flags |= S_PRIVATE; REISERFS_SB(s)->priv_root = dentry; - } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ - /* If we're read-only it just means that the dir hasn't been - * created. Not an error -- just no xattrs on the fs. We'll - * check again if we go read-write */ +#ifdef CONFIG_REISERFS_FS_XATTR + /* xattrs are unavailable */ + } else if (!(mount_flags & MS_RDONLY)) { + /* If we're read-only it just means that the dir + * hasn't been created. Not an error -- just no + * xattrs on the fs. We'll check again if we + * go read-write */ reiserfs_warning(s, "jdm-20006", "xattrs/ACLs enabled and couldn't " "find/create .reiserfs_priv. " "Failing mount."); err = -EOPNOTSUPP; +#endif } } - error: - /* This is only nonzero if there was an error initializing the xattr - * directory or if there is a condition where we don't support them. */ +#ifdef CONFIG_REISERFS_FS_XATTR +error: if (err) { clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); } +#endif /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ s->s_flags = s->s_flags & ~MS_POSIXACL; +#ifdef CONFIG_REISERFS_FS_POSIX_ACL if (reiserfs_posixacl(s)) s->s_flags |= MS_POSIXACL; +#endif return err; } - -static int reiserfs_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - int error = -EAGAIN; /* do regular unix permission checks by default */ - - reiserfs_read_lock_xattr_i(inode); - reiserfs_read_lock_xattrs(inode->i_sb); - - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - - reiserfs_read_unlock_xattrs(inode->i_sb); - reiserfs_read_unlock_xattr_i(inode); - - if (acl) { - if (!IS_ERR(acl)) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } else if (PTR_ERR(acl) != -ENODATA) - error = PTR_ERR(acl); - } - - return error; -} - -int reiserfs_permission(struct inode *inode, int mask) -{ - /* - * We don't do permission checks on the internal objects. - * Permissions are determined by the "owning" object. 
- */ - if (IS_PRIVATE(inode)) - return 0; - - /* - * Stat data v1 doesn't support ACLs. - */ - if (get_inode_sd_version(inode) == STAT_DATA_V1) - return generic_permission(inode, mask, NULL); - else - return generic_permission(inode, mask, reiserfs_check_acl); -} diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 12fc2a0d13be..cbb8868e844e 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -402,8 +402,8 @@ struct reiserfs_sb_info { int reserved_blocks; /* amount of blocks reserved for further allocations */ spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */ struct dentry *priv_root; /* root of /.reiserfs_priv */ -#ifdef CONFIG_REISERFS_FS_XATTR struct dentry *xattr_root; /* root of /.reiserfs_priv/.xa */ +#ifdef CONFIG_REISERFS_FS_XATTR struct rw_semaphore xattr_dir_sem; #endif int j_errno; diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 58f32ba7f5a0..13cdd5e1cb60 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -43,6 +43,12 @@ struct reiserfs_xattr_handler { struct list_head handlers; }; +int reiserfs_xattr_register_handlers(void) __init; +void reiserfs_xattr_unregister_handlers(void); +int reiserfs_xattr_init(struct super_block *sb, int mount_flags); +int reiserfs_delete_xattrs(struct inode *inode); +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); + #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, @@ -51,9 +57,6 @@ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size); int reiserfs_removexattr(struct dentry *dentry, const char *name); -int reiserfs_delete_xattrs(struct inode *inode); -int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_permission(struct inode *inode, int mask); int reiserfs_xattr_del(struct inode *, const char *); @@ -64,9 +67,6 @@ extern struct reiserfs_xattr_handler user_handler; extern struct reiserfs_xattr_handler trusted_handler; extern struct reiserfs_xattr_handler security_handler; -int reiserfs_xattr_register_handlers(void) __init; -void reiserfs_xattr_unregister_handlers(void); - static inline void reiserfs_write_lock_xattrs(struct super_block *sb) { down_write(&REISERFS_XATTR_DIR_SEM(sb)); @@ -121,23 +121,6 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode) #define reiserfs_permission NULL -#define reiserfs_xattr_register_handlers() 0 -#define reiserfs_xattr_unregister_handlers() - -static inline int reiserfs_delete_xattrs(struct inode *inode) -{ - return 0; -}; -static inline int reiserfs_chown_xattrs(struct inode *inode, - struct iattr *attrs) -{ - return 0; -}; -static inline int reiserfs_xattr_init(struct super_block *sb, int mount_flags) -{ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL); /* to be sure */ - return 0; -}; static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { } -- cgit v1.2.3-71-gd317 From d984561b326cd0fe0d1183d11b9b4fa1d011d21d Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:35 -0400 Subject: reiserfs: eliminate per-super xattr lock With the switch to using inode->i_mutex locking during lookups/creation in the xattr root, the per-super xattr lock 
is no longer needed. This patch removes it. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 14 +------- fs/reiserfs/namei.c | 29 ----------------- fs/reiserfs/super.c | 4 --- fs/reiserfs/xattr.c | 70 +++++++++++++++++++-------------------- fs/reiserfs/xattr_acl.c | 74 +++++++++++++++++++----------------------- include/linux/reiserfs_fs.h | 3 -- include/linux/reiserfs_fs_sb.h | 3 -- include/linux/reiserfs_xattr.h | 28 +++------------- 8 files changed, 74 insertions(+), 151 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index cd42a8658086..50a73e7afdc8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1957,19 +1957,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, inode->i_nlink = 0; th->t_trans_id = 0; /* so the caller can't use this handle later */ unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ - - /* If we were inheriting an ACL, we need to release the lock so that - * iput doesn't deadlock in reiserfs_delete_xattrs. The locking - * code really needs to be reworked, but this will take care of it - * for now. -jeffm */ -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) { - reiserfs_write_unlock_xattrs(dir->i_sb); - iput(inode); - reiserfs_write_lock_xattrs(dir->i_sb); - } else -#endif - iput(inode); + iput(inode); return err; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index c8430f1c824f..ddf1bcd41c87 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -609,9 +609,6 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, reiserfs_write_lock(dir->i_sb); - if (locked) - reiserfs_write_lock_xattrs(dir->i_sb); - retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) { drop_new_inode(inode); @@ -624,11 +621,6 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, if (retval) goto out_failed; - if (locked) { - reiserfs_write_unlock_xattrs(dir->i_sb); - locked = 0; - } - inode->i_op = &reiserfs_file_inode_operations; inode->i_fop = &reiserfs_file_operations; inode->i_mapping->a_ops = &reiserfs_address_space_operations; @@ -655,8 +647,6 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: - if (locked) - reiserfs_write_unlock_xattrs(dir->i_sb); reiserfs_write_unlock(dir->i_sb); return retval; } @@ -686,9 +676,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, reiserfs_write_lock(dir->i_sb); - if (locked) - reiserfs_write_lock_xattrs(dir->i_sb); - retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) { drop_new_inode(inode); @@ -702,11 +689,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, goto out_failed; } - if (locked) { - reiserfs_write_unlock_xattrs(dir->i_sb); - locked = 0; - } - inode->i_op = &reiserfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); @@ -736,8 +718,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: - if (locked) - reiserfs_write_unlock_xattrs(dir->i_sb); reiserfs_write_unlock(dir->i_sb); return retval; } @@ -767,8 +747,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) locked = reiserfs_cache_default_acl(dir); reiserfs_write_lock(dir->i_sb); - if (locked) - 
reiserfs_write_lock_xattrs(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) { @@ -790,11 +768,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out_failed; } - if (locked) { - reiserfs_write_unlock_xattrs(dir->i_sb); - locked = 0; - } - reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); @@ -824,8 +797,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) unlock_new_inode(inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: - if (locked) - reiserfs_write_unlock_xattrs(dir->i_sb); reiserfs_write_unlock(dir->i_sb); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index fc7cb4661ee0..6d10f81b4fc1 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1646,10 +1646,6 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) REISERFS_SB(s)->s_alloc_options.preallocmin = 0; /* Preallocate by 16 blocks (17-1) at once */ REISERFS_SB(s)->s_alloc_options.preallocsize = 17; -#ifdef CONFIG_REISERFS_FS_XATTR - /* Initialize the rwsem for xattr dir */ - init_rwsem(&REISERFS_SB(s)->xattr_dir_sem); -#endif /* setup default block allocator options */ reiserfs_init_alloc_options(s); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 57920a4df7a4..62c98829c545 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -27,6 +27,12 @@ * these are special cases for filesystem ACLs, they are interpreted by the * kernel, in addition, they are negatively and positively cached and attached * to the inode so that unnecessary lookups are avoided. + * + * Locking works like so: + * The xattr root (/.reiserfs_priv/xattrs) is protected by its i_mutex. + * The xattr dir (/.reiserfs_priv/xattrs/.) is protected by + * inode->xattr_sem. + * The xattrs themselves are likewise protected by the xattr_sem. */ #include @@ -392,16 +398,17 @@ reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, /* This is called w/ inode->i_mutex downed */ int reiserfs_delete_xattrs(struct inode *inode) { - struct dentry *dir, *root; int err = 0; + struct dentry *dir, *root; + struct reiserfs_transaction_handle th; + int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); /* Skip out, an xattr has no xattrs associated with it */ if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) return 0; - reiserfs_read_lock_xattrs(inode->i_sb); dir = open_xa_dir(inode, XATTR_REPLACE); - reiserfs_read_unlock_xattrs(inode->i_sb); if (IS_ERR(dir)) { err = PTR_ERR(dir); goto out; @@ -416,18 +423,26 @@ int reiserfs_delete_xattrs(struct inode *inode) if (err) goto out_dir; - /* Leftovers besides . and .. -- that's not good. */ - if (dir->d_inode->i_nlink <= 2) { - root = open_xa_root(inode->i_sb, XATTR_REPLACE); - reiserfs_write_lock_xattrs(inode->i_sb); + /* We start a transaction here to avoid a ABBA situation + * between the xattr root's i_mutex and the journal lock. + * Inode creation will inherit an ACL, which requires a + * lookup. The lookup locks the xattr root i_mutex with a + * transaction open. Inode deletion takes teh xattr root + * i_mutex to delete the directory and then starts a + * transaction inside it. Boom. This doesn't incur much + * additional overhead since the reiserfs_rmdir transaction + * will just nest inside the outer transaction. 
*/ + err = journal_begin(&th, inode->i_sb, blocks); + if (!err) { + int jerror; + root = dget(dir->d_parent); mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_XATTR); err = xattr_rmdir(root->d_inode, dir); + jerror = journal_end(&th, inode->i_sb, blocks); mutex_unlock(&root->d_inode->i_mutex); - reiserfs_write_unlock_xattrs(inode->i_sb); dput(root); - } else { - reiserfs_warning(inode->i_sb, "jdm-20006", - "Couldn't remove all entries in directory"); + + err = jerror ?: err; } out_dir: @@ -437,6 +452,9 @@ out: if (!err) REISERFS_I(inode)->i_flags = REISERFS_I(inode)->i_flags & ~i_has_xattr_dir; + else + reiserfs_warning(inode->i_sb, "jdm-20004", + "Couldn't remove all xattrs (%d)\n", err); return err; } @@ -485,9 +503,7 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) return 0; - reiserfs_read_lock_xattrs(inode->i_sb); dir = open_xa_dir(inode, XATTR_REPLACE); - reiserfs_read_unlock_xattrs(inode->i_sb); if (IS_ERR(dir)) { if (PTR_ERR(dir) != -ENODATA) err = PTR_ERR(dir); @@ -731,6 +747,11 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, goto out; } + /* protect against concurrent access. xattrs are backed by + * regular files, but they're not regular files. The updates + * must be atomic from the perspective of the user. */ + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); + isize = i_size_read(dentry->d_inode); REISERFS_I(inode)->i_flags |= i_has_xattr_dir; @@ -798,6 +819,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, } out_dput: + mutex_unlock(&dentry->d_inode->i_mutex); dput(dentry); out: @@ -834,7 +856,6 @@ int reiserfs_xattr_del(struct inode *inode, const char *name) static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char *); /* * Inode operation getxattr() - * Preliminary locking: we down dentry->d_inode->i_mutex */ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, @@ -848,9 +869,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, return -EOPNOTSUPP; reiserfs_read_lock_xattr_i(dentry->d_inode); - reiserfs_read_lock_xattrs(dentry->d_sb); err = xah->get(dentry->d_inode, name, buffer, size); - reiserfs_read_unlock_xattrs(dentry->d_sb); reiserfs_read_unlock_xattr_i(dentry->d_inode); return err; } @@ -866,23 +885,13 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, { struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name); int err; - int lock; if (!xah || !reiserfs_xattrs(dentry->d_sb) || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; reiserfs_write_lock_xattr_i(dentry->d_inode); - lock = !has_xattr_dir(dentry->d_inode); - if (lock) - reiserfs_write_lock_xattrs(dentry->d_sb); - else - reiserfs_read_lock_xattrs(dentry->d_sb); err = xah->set(dentry->d_inode, name, value, size, flags); - if (lock) - reiserfs_write_unlock_xattrs(dentry->d_sb); - else - reiserfs_read_unlock_xattrs(dentry->d_sb); reiserfs_write_unlock_xattr_i(dentry->d_inode); return err; } @@ -902,8 +911,6 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) return -EOPNOTSUPP; reiserfs_write_lock_xattr_i(dentry->d_inode); - reiserfs_read_lock_xattrs(dentry->d_sb); - /* Deletion pre-operation */ if (xah->del) { err = xah->del(dentry->d_inode, name); @@ -917,7 +924,6 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) mark_inode_dirty(dentry->d_inode); out: - 
reiserfs_read_unlock_xattrs(dentry->d_sb); reiserfs_write_unlock_xattr_i(dentry->d_inode); return err; } @@ -966,8 +972,6 @@ reiserfs_listxattr_filler(void *buf, const char *name, int namelen, /* * Inode operation listxattr() - * - * Preliminary locking: we down dentry->d_inode->i_mutex */ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) { @@ -983,9 +987,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) return -EOPNOTSUPP; reiserfs_read_lock_xattr_i(dentry->d_inode); - reiserfs_read_lock_xattrs(dentry->d_sb); dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE); - reiserfs_read_unlock_xattrs(dentry->d_sb); if (IS_ERR(dir)) { err = PTR_ERR(dir); if (err == -ENODATA) @@ -1114,11 +1116,9 @@ static int reiserfs_check_acl(struct inode *inode, int mask) int error = -EAGAIN; /* do regular unix permission checks by default */ reiserfs_read_lock_xattr_i(inode); - reiserfs_read_lock_xattrs(inode->i_sb); acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - reiserfs_read_unlock_xattrs(inode->i_sb); reiserfs_read_unlock_xattr_i(inode); if (acl) { diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 9128e4d5ba64..d63b2c5850c3 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -172,6 +172,29 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) return ERR_PTR(-EINVAL); } +static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*i_acl != ERR_PTR(-ENODATA)) + posix_acl_release(*i_acl); + *i_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + +static inline struct posix_acl *iget_acl(struct inode *inode, + struct posix_acl **i_acl) +{ + struct posix_acl *acl = ERR_PTR(-ENODATA); + + spin_lock(&inode->i_lock); + if (*i_acl != ERR_PTR(-ENODATA)) + acl = posix_acl_dup(*i_acl); + spin_unlock(&inode->i_lock); + + return acl; +} + /* * Inode operation get_posix_acl(). 
* @@ -199,11 +222,11 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) return ERR_PTR(-EINVAL); } - if (IS_ERR(*p_acl)) { - if (PTR_ERR(*p_acl) == -ENODATA) - return NULL; - } else if (*p_acl != NULL) - return posix_acl_dup(*p_acl); + acl = iget_acl(inode, p_acl); + if (acl && !IS_ERR(acl)) + return acl; + else if (PTR_ERR(acl) == -ENODATA) + return NULL; size = reiserfs_xattr_get(inode, name, NULL, 0); if (size < 0) { @@ -229,7 +252,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) } else { acl = posix_acl_from_disk(value, retval); if (!IS_ERR(acl)) - *p_acl = posix_acl_dup(acl); + iset_acl(inode, p_acl, acl); } kfree(value); @@ -300,16 +323,8 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) kfree(value); - if (!error) { - /* Release the old one */ - if (!IS_ERR(*p_acl) && *p_acl) - posix_acl_release(*p_acl); - - if (acl == NULL) - *p_acl = ERR_PTR(-ENODATA); - else - *p_acl = posix_acl_dup(acl); - } + if (!error) + iset_acl(inode, p_acl, acl); return error; } @@ -404,9 +419,7 @@ int reiserfs_cache_default_acl(struct inode *inode) if (reiserfs_posixacl(inode->i_sb) && !IS_PRIVATE(inode)) { struct posix_acl *acl; reiserfs_read_lock_xattr_i(inode); - reiserfs_read_lock_xattrs(inode->i_sb); acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); - reiserfs_read_unlock_xattrs(inode->i_sb); reiserfs_read_unlock_xattr_i(inode); ret = (acl && !IS_ERR(acl)); if (ret) @@ -429,9 +442,7 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; } - reiserfs_read_lock_xattrs(inode->i_sb); acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - reiserfs_read_unlock_xattrs(inode->i_sb); if (!acl) return 0; if (IS_ERR(acl)) @@ -442,17 +453,8 @@ int reiserfs_acl_chmod(struct inode *inode) return -ENOMEM; error = posix_acl_chmod_masq(clone, inode->i_mode); if (!error) { - int lock = !has_xattr_dir(inode); reiserfs_write_lock_xattr_i(inode); - if (lock) - reiserfs_write_lock_xattrs(inode->i_sb); - else - reiserfs_read_lock_xattrs(inode->i_sb); error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); - if (lock) - reiserfs_write_unlock_xattrs(inode->i_sb); - else - reiserfs_read_unlock_xattrs(inode->i_sb); reiserfs_write_unlock_xattr_i(inode); } posix_acl_release(clone); @@ -480,14 +482,9 @@ posix_acl_access_set(struct inode *inode, const char *name, static int posix_acl_access_del(struct inode *inode, const char *name) { struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); - struct posix_acl **acl = &reiserfs_i->i_acl_access; if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) return -EINVAL; - if (!IS_ERR(*acl) && *acl) { - posix_acl_release(*acl); - *acl = ERR_PTR(-ENODATA); - } - + iset_acl(inode, &reiserfs_i->i_acl_access, ERR_PTR(-ENODATA)); return 0; } @@ -533,14 +530,9 @@ posix_acl_default_set(struct inode *inode, const char *name, static int posix_acl_default_del(struct inode *inode, const char *name) { struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); - struct posix_acl **acl = &reiserfs_i->i_acl_default; if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) return -EINVAL; - if (!IS_ERR(*acl) && *acl) { - posix_acl_release(*acl); - *acl = ERR_PTR(-ENODATA); - } - + iset_acl(inode, &reiserfs_i->i_acl_default, ERR_PTR(-ENODATA)); return 0; } diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 6c4af98b6767..e00d240314c5 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -2224,7 +2224,4 @@ int reiserfs_unpack(struct inode *inode, struct file *filp); #define 
reiserfs_write_lock( sb ) lock_kernel() #define reiserfs_write_unlock( sb ) unlock_kernel() -/* xattr stuff */ -#define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem) - #endif /* _LINUX_REISER_FS_H */ diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index cbb8868e844e..c8aee41ccc23 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -403,9 +403,6 @@ struct reiserfs_sb_info { spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */ struct dentry *priv_root; /* root of /.reiserfs_priv */ struct dentry *xattr_root; /* root of /.reiserfs_priv/.xa */ -#ifdef CONFIG_REISERFS_FS_XATTR - struct rw_semaphore xattr_dir_sem; -#endif int j_errno; #ifdef CONFIG_QUOTA char *s_qf_names[MAXQUOTAS]; diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 13cdd5e1cb60..65c16fa51246 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -67,45 +67,27 @@ extern struct reiserfs_xattr_handler user_handler; extern struct reiserfs_xattr_handler trusted_handler; extern struct reiserfs_xattr_handler security_handler; -static inline void reiserfs_write_lock_xattrs(struct super_block *sb) -{ - down_write(&REISERFS_XATTR_DIR_SEM(sb)); -} -static inline void reiserfs_write_unlock_xattrs(struct super_block *sb) -{ - up_write(&REISERFS_XATTR_DIR_SEM(sb)); -} -static inline void reiserfs_read_lock_xattrs(struct super_block *sb) -{ - down_read(&REISERFS_XATTR_DIR_SEM(sb)); -} - -static inline void reiserfs_read_unlock_xattrs(struct super_block *sb) -{ - up_read(&REISERFS_XATTR_DIR_SEM(sb)); -} - static inline void reiserfs_write_lock_xattr_i(struct inode *inode) { - down_write(&REISERFS_I(inode)->xattr_sem); + down_write(&REISERFS_I(inode)->i_xattr_sem); } static inline void reiserfs_write_unlock_xattr_i(struct inode *inode) { - up_write(&REISERFS_I(inode)->xattr_sem); + up_write(&REISERFS_I(inode)->i_xattr_sem); } static inline void reiserfs_read_lock_xattr_i(struct inode *inode) { - down_read(&REISERFS_I(inode)->xattr_sem); + down_read(&REISERFS_I(inode)->i_xattr_sem); } static inline void reiserfs_read_unlock_xattr_i(struct inode *inode) { - up_read(&REISERFS_I(inode)->xattr_sem); + up_read(&REISERFS_I(inode)->i_xattr_sem); } static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { - init_rwsem(&REISERFS_I(inode)->xattr_sem); + init_rwsem(&REISERFS_I(inode)->i_xattr_sem); } #else -- cgit v1.2.3-71-gd317 From 8b6dd72a441a683cef7ace93de0a57ced4367f00 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:36 -0400 Subject: reiserfs: make per-inode xattr locking more fine grained The per-inode locking can be made more fine-grained to surround just the interaction with the filesystem itself. This really only applies to protecting reads during a write, since concurrent writes are barred with inode->i_mutex at the vfs level. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 114 +++++++++++++++++++---------------------- fs/reiserfs/xattr_acl.c | 7 +-- include/linux/reiserfs_fs_i.h | 2 +- include/linux/reiserfs_xattr.h | 22 -------- 4 files changed, 55 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 62c98829c545..ccb8e4d4c032 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -29,10 +29,8 @@ * to the inode so that unnecessary lookups are avoided. 
* * Locking works like so: - * The xattr root (/.reiserfs_priv/xattrs) is protected by its i_mutex. - * The xattr dir (/.reiserfs_priv/xattrs/.) is protected by - * inode->xattr_sem. - * The xattrs themselves are likewise protected by the xattr_sem. + * Directory components (xattr root, xattr dir) are protectd by their i_mutex. + * The xattrs themselves are protected by the xattr_sem. */ #include @@ -55,6 +53,8 @@ #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" +static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char *); + /* Helpers for inode ops. We do this so that we don't have all the VFS * overhead and also for proper i_mutex annotation. * dir->i_mutex must be held for all of them. */ @@ -339,12 +339,14 @@ int xattr_readdir(struct inode *inode, filldir_t filler, void *buf) return res; } +/* expects xadir->d_inode->i_mutex to be locked */ static int __reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) { struct dentry *dentry; struct inode *dir = xadir->d_inode; int err = 0; + struct reiserfs_xattr_handler *xah; dentry = lookup_one_len(name, xadir, namelen); if (IS_ERR(dentry)) { @@ -372,6 +374,14 @@ __reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) return -EIO; } + /* Deletion pre-operation */ + xah = find_xattr_handler_prefix(name); + if (xah && xah->del) { + err = xah->del(dentry->d_inode, name); + if (err) + goto out; + } + err = xattr_unlink(dir, dentry); out_file: @@ -398,7 +408,7 @@ reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, /* This is called w/ inode->i_mutex downed */ int reiserfs_delete_xattrs(struct inode *inode) { - int err = 0; + int err = -ENODATA; struct dentry *dir, *root; struct reiserfs_transaction_handle th; int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + @@ -414,14 +424,19 @@ int reiserfs_delete_xattrs(struct inode *inode) goto out; } else if (!dir->d_inode) { dput(dir); - return 0; + goto out; } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); err = xattr_readdir(dir->d_inode, reiserfs_delete_xattrs_filler, dir); mutex_unlock(&dir->d_inode->i_mutex); - if (err) - goto out_dir; + if (err) { + dput(dir); + goto out; + } + + root = dget(dir->d_parent); + dput(dir); /* We start a transaction here to avoid a ABBA situation * between the xattr root's i_mutex and the journal lock. 
@@ -435,19 +450,14 @@ int reiserfs_delete_xattrs(struct inode *inode) err = journal_begin(&th, inode->i_sb, blocks); if (!err) { int jerror; - root = dget(dir->d_parent); mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_XATTR); err = xattr_rmdir(root->d_inode, dir); jerror = journal_end(&th, inode->i_sb, blocks); mutex_unlock(&root->d_inode->i_mutex); - dput(root); - err = jerror ?: err; } -out_dir: - dput(dir); - + dput(root); out: if (!err) REISERFS_I(inode)->i_flags = @@ -484,7 +494,7 @@ reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen, if (!S_ISDIR(xafile->d_inode->i_mode)) { mutex_lock_nested(&xafile->d_inode->i_mutex, I_MUTEX_CHILD); - err = notify_change(xafile, attrs); + err = reiserfs_setattr(xafile, attrs); mutex_unlock(&xafile->d_inode->i_mutex); } dput(xafile); @@ -520,13 +530,16 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) err = xattr_readdir(dir->d_inode, reiserfs_chown_xattrs_filler, &buf); if (!err) - err = notify_change(dir, attrs); + err = reiserfs_setattr(dir, attrs); mutex_unlock(&dir->d_inode->i_mutex); attrs->ia_valid = ia_valid; out_dir: dput(dir); out: + if (err) + reiserfs_warning(inode->i_sb, "jdm-20007", + "Couldn't chown all xattrs (%d)\n", err); return err; } @@ -635,9 +648,8 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - /* Empty xattrs are ok, they're just empty files, no hash */ - if (buffer && buffer_size) - xahash = xattr_hash(buffer, buffer_size); + if (!buffer) + return reiserfs_xattr_del(inode, name); dentry = get_xa_file_dentry(inode, name, flags); if (IS_ERR(dentry)) { @@ -645,13 +657,19 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, goto out; } + down_write(&REISERFS_I(inode)->i_xattr_sem); + + xahash = xattr_hash(buffer, buffer_size); REISERFS_I(inode)->i_flags |= i_has_xattr_dir; /* Resize it so we're ok to write there */ newattrs.ia_size = buffer_size; + newattrs.ia_ctime = current_fs_time(inode->i_sb); newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - err = notify_change(dentry, &newattrs); + down_write(&dentry->d_inode->i_alloc_sem); + err = reiserfs_setattr(dentry, &newattrs); + up_write(&dentry->d_inode->i_alloc_sem); mutex_unlock(&dentry->d_inode->i_mutex); if (err) goto out_filp; @@ -712,6 +730,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, } out_filp: + up_write(&REISERFS_I(inode)->i_xattr_sem); dput(dentry); out: @@ -747,10 +766,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, goto out; } - /* protect against concurrent access. xattrs are backed by - * regular files, but they're not regular files. The updates - * must be atomic from the perspective of the user. 
*/ - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); + down_read(&REISERFS_I(inode)->i_xattr_sem); isize = i_size_read(dentry->d_inode); REISERFS_I(inode)->i_flags |= i_has_xattr_dir; @@ -758,12 +774,12 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, /* Just return the size needed */ if (buffer == NULL) { err = isize - sizeof(struct reiserfs_xattr_header); - goto out_dput; + goto out_unlock; } if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) { err = -ERANGE; - goto out_dput; + goto out_unlock; } while (file_pos < isize) { @@ -778,7 +794,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, page = reiserfs_get_page(dentry->d_inode, file_pos); if (IS_ERR(page)) { err = PTR_ERR(page); - goto out_dput; + goto out_unlock; } lock_page(page); @@ -797,7 +813,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, "associated with %k", name, INODE_PKEY(inode)); err = -EIO; - goto out_dput; + goto out_unlock; } hash = le32_to_cpu(rxh->h_hash); } @@ -818,8 +834,8 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, err = -EIO; } -out_dput: - mutex_unlock(&dentry->d_inode->i_mutex); +out_unlock: + up_read(&REISERFS_I(inode)->i_xattr_sem); dput(dentry); out: @@ -852,8 +868,6 @@ int reiserfs_xattr_del(struct inode *inode, const char *name) } /* Actual operations that are exported to VFS-land */ - -static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char *); /* * Inode operation getxattr() */ @@ -868,9 +882,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - reiserfs_read_lock_xattr_i(dentry->d_inode); err = xah->get(dentry->d_inode, name, buffer, size); - reiserfs_read_unlock_xattr_i(dentry->d_inode); return err; } @@ -890,9 +902,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - reiserfs_write_lock_xattr_i(dentry->d_inode); err = xah->set(dentry->d_inode, name, value, size, flags); - reiserfs_write_unlock_xattr_i(dentry->d_inode); return err; } @@ -910,21 +920,11 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - reiserfs_write_lock_xattr_i(dentry->d_inode); - /* Deletion pre-operation */ - if (xah->del) { - err = xah->del(dentry->d_inode, name); - if (err) - goto out; - } - err = reiserfs_xattr_del(dentry->d_inode, name); dentry->d_inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(dentry->d_inode); - out: - reiserfs_write_unlock_xattr_i(dentry->d_inode); return err; } @@ -986,7 +986,6 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - reiserfs_read_lock_xattr_i(dentry->d_inode); dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE); if (IS_ERR(dir)) { err = PTR_ERR(dir); @@ -1005,19 +1004,16 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); err = xattr_readdir(dir->d_inode, reiserfs_listxattr_filler, &buf); mutex_unlock(&dir->d_inode->i_mutex); - if (err) - goto out_dir; - if (buf.r_pos > buf.r_size && buffer != NULL) - err = -ERANGE; - else - err = buf.r_pos; + if (!err) { + if (buf.r_pos > buf.r_size && buffer != NULL) + err = -ERANGE; + else + 
err = buf.r_pos; + } - out_dir: dput(dir); - - out: - reiserfs_read_unlock_xattr_i(dentry->d_inode); +out: return err; } @@ -1115,12 +1111,8 @@ static int reiserfs_check_acl(struct inode *inode, int mask) struct posix_acl *acl; int error = -EAGAIN; /* do regular unix permission checks by default */ - reiserfs_read_lock_xattr_i(inode); - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - reiserfs_read_unlock_xattr_i(inode); - if (acl) { if (!IS_ERR(acl)) { error = posix_acl_permission(inode, acl, mask); diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d63b2c5850c3..d3ce6ee9b262 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -418,9 +418,7 @@ int reiserfs_cache_default_acl(struct inode *inode) int ret = 0; if (reiserfs_posixacl(inode->i_sb) && !IS_PRIVATE(inode)) { struct posix_acl *acl; - reiserfs_read_lock_xattr_i(inode); acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); - reiserfs_read_unlock_xattr_i(inode); ret = (acl && !IS_ERR(acl)); if (ret) posix_acl_release(acl); @@ -452,11 +450,8 @@ int reiserfs_acl_chmod(struct inode *inode) if (!clone) return -ENOMEM; error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) { - reiserfs_write_lock_xattr_i(inode); + if (!error) error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); - reiserfs_write_unlock_xattr_i(inode); - } posix_acl_release(clone); return error; } diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h index 201dd910b042..76360b36ac33 100644 --- a/include/linux/reiserfs_fs_i.h +++ b/include/linux/reiserfs_fs_i.h @@ -59,7 +59,7 @@ struct reiserfs_inode_info { struct posix_acl *i_acl_default; #endif #ifdef CONFIG_REISERFS_FS_XATTR - struct rw_semaphore xattr_sem; + struct rw_semaphore i_xattr_sem; #endif struct inode vfs_inode; }; diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 65c16fa51246..3bd154fd56e9 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -67,24 +67,6 @@ extern struct reiserfs_xattr_handler user_handler; extern struct reiserfs_xattr_handler trusted_handler; extern struct reiserfs_xattr_handler security_handler; -static inline void reiserfs_write_lock_xattr_i(struct inode *inode) -{ - down_write(&REISERFS_I(inode)->i_xattr_sem); -} -static inline void reiserfs_write_unlock_xattr_i(struct inode *inode) -{ - up_write(&REISERFS_I(inode)->i_xattr_sem); -} -static inline void reiserfs_read_lock_xattr_i(struct inode *inode) -{ - down_read(&REISERFS_I(inode)->i_xattr_sem); -} - -static inline void reiserfs_read_unlock_xattr_i(struct inode *inode) -{ - up_read(&REISERFS_I(inode)->i_xattr_sem); -} - static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { init_rwsem(&REISERFS_I(inode)->i_xattr_sem); @@ -96,10 +78,6 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode) #define reiserfs_setxattr NULL #define reiserfs_listxattr NULL #define reiserfs_removexattr NULL -#define reiserfs_write_lock_xattrs(sb) do {;} while(0) -#define reiserfs_write_unlock_xattrs(sb) do {;} while(0) -#define reiserfs_read_lock_xattrs(sb) -#define reiserfs_read_unlock_xattrs(sb) #define reiserfs_permission NULL -- cgit v1.2.3-71-gd317 From 48b32a3553a54740d236b79a90f20147a25875e3 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:38 -0400 Subject: reiserfs: use generic xattr handlers Christoph Hellwig had asked me quite some time ago to port the reiserfs xattrs to the generic xattr interface. 
This patch replaces the reiserfs-specific xattr handling code with the generic struct xattr_handler. However, since reiserfs doesn't split the prefix and name when accessing xattrs, it can't leverage generic_{set,get,list,remove}xattr without needlessly reconstructing the name on the back end. Update 7/26/07: Added missing dput() to deletion path. Update 8/30/07: Added missing mark_inode_dirty when i_mode is used to represent an ACL and no previous ACL existed. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 7 - fs/reiserfs/xattr.c | 467 +++++++++++++++++------------------------ fs/reiserfs/xattr_acl.c | 79 +++---- fs/reiserfs/xattr_security.c | 26 +-- fs/reiserfs/xattr_trusted.c | 45 +--- fs/reiserfs/xattr_user.c | 31 +-- include/linux/reiserfs_acl.h | 16 +- include/linux/reiserfs_fs_sb.h | 3 +- include/linux/reiserfs_xattr.h | 25 +-- 9 files changed, 258 insertions(+), 441 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 6d10f81b4fc1..4a1e16362ebd 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2263,9 +2263,6 @@ static int __init init_reiserfs_fs(void) return ret; } - if ((ret = reiserfs_xattr_register_handlers())) - goto failed_reiserfs_xattr_register_handlers; - reiserfs_proc_info_global_init(); reiserfs_proc_register_global("version", reiserfs_global_version_in_proc); @@ -2276,9 +2273,6 @@ static int __init init_reiserfs_fs(void) return 0; } - reiserfs_xattr_unregister_handlers(); - - failed_reiserfs_xattr_register_handlers: reiserfs_proc_unregister_global("version"); reiserfs_proc_info_global_done(); destroy_inodecache(); @@ -2288,7 +2282,6 @@ static int __init init_reiserfs_fs(void) static void __exit exit_reiserfs_fs(void) { - reiserfs_xattr_unregister_handlers(); reiserfs_proc_unregister_global("version"); reiserfs_proc_info_global_done(); unregister_filesystem(&reiserfs_fs_type); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8d7e5d9ae17c..d3ce27436605 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -53,7 +53,6 @@ #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" -static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char *); /* Helpers for inode ops. We do this so that we don't have all the VFS * overhead and also for proper i_mutex annotation. @@ -110,7 +109,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) return error; } - #define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) /* Returns and possibly creates the xattr dir. */ @@ -339,14 +337,17 @@ int xattr_readdir(struct inode *inode, filldir_t filler, void *buf) return res; } -/* expects xadir->d_inode->i_mutex to be locked */ +/* The following are side effects of other operations that aren't explicitly + * modifying extended attributes. This includes operations such as permissions + * or ownership changes, object deletions, etc. 
*/ + static int -__reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) +reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) { + struct dentry *xadir = (struct dentry *)buf; struct dentry *dentry; - struct inode *dir = xadir->d_inode; int err = 0; - struct reiserfs_xattr_handler *xah; dentry = lookup_one_len(name, xadir, namelen); if (IS_ERR(dentry)) { @@ -361,28 +362,7 @@ __reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) if (S_ISDIR(dentry->d_inode->i_mode)) goto out_file; - if (!IS_PRIVATE(dentry->d_inode)) { - reiserfs_error(dir->i_sb, "jdm-20003", - "OID %08x [%.*s/%.*s] doesn't have " - "priv flag set [parent is %sset].", - le32_to_cpu(INODE_PKEY(dentry->d_inode)-> - k_objectid), xadir->d_name.len, - xadir->d_name.name, namelen, name, - IS_PRIVATE(xadir->d_inode) ? "" : - "not "); - dput(dentry); - return -EIO; - } - - /* Deletion pre-operation */ - xah = find_xattr_handler_prefix(name); - if (xah && xah->del) { - err = xah->del(dentry->d_inode, name); - if (err) - goto out; - } - - err = xattr_unlink(dir, dentry); + err = xattr_unlink(xadir->d_inode, dentry); out_file: dput(dentry); @@ -391,20 +371,6 @@ out: return err; } -/* The following are side effects of other operations that aren't explicitly - * modifying extended attributes. This includes operations such as permissions - * or ownership changes, object deletions, etc. */ - -static int -reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) -{ - struct dentry *xadir = (struct dentry *)buf; - - return __reiserfs_xattr_del(xadir, name, namelen); - -} - /* This is called w/ inode->i_mutex downed */ int reiserfs_delete_xattrs(struct inode *inode) { @@ -541,14 +507,11 @@ out: } #ifdef CONFIG_REISERFS_FS_XATTR -static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char - *prefix); - /* Returns a dentry corresponding to a specific extended attribute file * for the inode. If flags allow, the file is created. Otherwise, a * valid or negative dentry, or an error is returned. 
*/ -static struct dentry *get_xa_file_dentry(const struct inode *inode, - const char *name, int flags) +static struct dentry *xattr_lookup(struct inode *inode, const char *name, + int flags) { struct dentry *xadir, *xafile; int err = 0; @@ -623,6 +586,45 @@ int reiserfs_commit_write(struct file *f, struct page *page, int reiserfs_prepare_write(struct file *f, struct page *page, unsigned from, unsigned to); +static void update_ctime(struct inode *inode) +{ + struct timespec now = current_fs_time(inode->i_sb); + if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink || + timespec_equal(&inode->i_ctime, &now)) + return; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +static int lookup_and_delete_xattr(struct inode *inode, const char *name) +{ + int err = 0; + struct dentry *dentry, *xadir; + + xadir = open_xa_dir(inode, XATTR_REPLACE); + if (IS_ERR(xadir)) + return PTR_ERR(xadir); + + dentry = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_dput; + } + + if (dentry->d_inode) { + mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + err = xattr_unlink(xadir->d_inode, dentry); + mutex_unlock(&xadir->d_inode->i_mutex); + update_ctime(inode); + } + + dput(dentry); +out_dput: + dput(xadir); + return err; +} + /* Generic extended attribute operations that can be used by xa plugins */ @@ -630,8 +632,8 @@ int reiserfs_prepare_write(struct file *f, struct page *page, * inode->i_mutex: down */ int -reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, - size_t buffer_size, int flags) +__reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, + size_t buffer_size, int flags) { int err = 0; struct dentry *dentry; @@ -639,37 +641,22 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, char *data; size_t file_pos = 0; size_t buffer_pos = 0; - struct iattr newattrs; + size_t new_size; __u32 xahash = 0; if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; if (!buffer) - return reiserfs_xattr_del(inode, name); + return lookup_and_delete_xattr(inode, name); - dentry = get_xa_file_dentry(inode, name, flags); - if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); - goto out; - } + dentry = xattr_lookup(inode, name, flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); down_write(&REISERFS_I(inode)->i_xattr_sem); xahash = xattr_hash(buffer, buffer_size); - - /* Resize it so we're ok to write there */ - newattrs.ia_size = buffer_size; - newattrs.ia_ctime = current_fs_time(inode->i_sb); - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - down_write(&dentry->d_inode->i_alloc_sem); - err = reiserfs_setattr(dentry, &newattrs); - up_write(&dentry->d_inode->i_alloc_sem); - mutex_unlock(&dentry->d_inode->i_mutex); - if (err) - goto out_filp; - while (buffer_pos < buffer_size || buffer_pos == 0) { size_t chunk; size_t skip = 0; @@ -682,7 +669,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, page = reiserfs_get_page(dentry->d_inode, file_pos); if (IS_ERR(page)) { err = PTR_ERR(page); - goto out_filp; + goto out_unlock; } lock_page(page); @@ -716,20 +703,33 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, break; } - /* We can't mark the inode dirty if it's not hashed. This is the case - * when we're inheriting the default ACL. 
If we dirty it, the inode - * gets marked dirty, but won't (ever) make it onto the dirty list until - * it's synced explicitly to clear I_DIRTY. This is bad. */ - if (!hlist_unhashed(&inode->i_hash)) { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - } - - out_filp: + new_size = buffer_size + sizeof(struct reiserfs_xattr_header); + if (!err && new_size < i_size_read(dentry->d_inode)) { + struct iattr newattrs = { + .ia_ctime = current_fs_time(inode->i_sb), + .ia_size = buffer_size, + .ia_valid = ATTR_SIZE | ATTR_CTIME, + }; + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); + down_write(&dentry->d_inode->i_alloc_sem); + err = reiserfs_setattr(dentry, &newattrs); + up_write(&dentry->d_inode->i_alloc_sem); + mutex_unlock(&dentry->d_inode->i_mutex); + } else + update_ctime(inode); +out_unlock: up_write(&REISERFS_I(inode)->i_xattr_sem); dput(dentry); + return err; +} - out: +int +reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, + size_t buffer_size, int flags) +{ + int err = __reiserfs_xattr_set(inode, name, buffer, buffer_size, flags); + if (err == -ENODATA) + err = 0; return err; } @@ -737,7 +737,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, * inode->i_mutex: down */ int -reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, +reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, size_t buffer_size) { ssize_t err = 0; @@ -756,7 +756,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - dentry = get_xa_file_dentry(inode, name, XATTR_REPLACE); + dentry = xattr_lookup(inode, name, XATTR_REPLACE); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; @@ -837,32 +837,53 @@ out: return err; } -int reiserfs_xattr_del(struct inode *inode, const char *name) -{ - struct dentry *dir; - int err; +/* Actual operations that are exported to VFS-land */ +struct xattr_handler *reiserfs_xattr_handlers[] = { + &reiserfs_xattr_user_handler, + &reiserfs_xattr_trusted_handler, +#ifdef CONFIG_REISERFS_FS_SECURITY + &reiserfs_xattr_security_handler, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + &reiserfs_posix_acl_access_handler, + &reiserfs_posix_acl_default_handler, +#endif + NULL +}; - dir = open_xa_dir(inode, XATTR_REPLACE); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; - } +/* + * In order to implement different sets of xattr operations for each xattr + * prefix with the generic xattr API, a filesystem should create a + * null-terminated array of struct xattr_handler (one for each prefix) and + * hang a pointer to it off of the s_xattr field of the superblock. + * + * The generic_fooxattr() functions will use this list to dispatch xattr + * operations to the correct xattr_handler. 
+ */ +#define for_each_xattr_handler(handlers, handler) \ + for ((handler) = *(handlers)++; \ + (handler) != NULL; \ + (handler) = *(handlers)++) - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = __reiserfs_xattr_del(dir, name, strlen(name)); - mutex_unlock(&dir->d_inode->i_mutex); - dput(dir); +/* This is the implementation for the xattr plugin infrastructure */ +static inline struct xattr_handler * +find_xattr_handler_prefix(struct xattr_handler **handlers, + const char *name) +{ + struct xattr_handler *xah; - if (!err) { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); + if (!handlers) + return NULL; + + for_each_xattr_handler(handlers, xah) { + if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0) + break; } - out: - return err; + return xah; } -/* Actual operations that are exported to VFS-land */ + /* * Inode operation getxattr() */ @@ -870,15 +891,15 @@ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, size_t size) { - struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name); - int err; + struct inode *inode = dentry->d_inode; + struct xattr_handler *handler; - if (!xah || !reiserfs_xattrs(dentry->d_sb) || - get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - err = xah->get(dentry->d_inode, name, buffer, size); - return err; + return handler->get(inode, name, buffer, size); } /* @@ -890,15 +911,15 @@ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name); - int err; + struct inode *inode = dentry->d_inode; + struct xattr_handler *handler; - if (!xah || !reiserfs_xattrs(dentry->d_sb) || - get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - err = xah->set(dentry->d_inode, name, value, size, flags); - return err; + return handler->set(inode, name, value, size, flags); } /* @@ -908,71 +929,65 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { - int err; - struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name); + struct inode *inode = dentry->d_inode; + struct xattr_handler *handler; + handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); - if (!xah || !reiserfs_xattrs(dentry->d_sb) || - get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - err = reiserfs_xattr_del(dentry->d_inode, name); - - dentry->d_inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(dentry->d_inode); - - return err; + return handler->set(inode, name, NULL, 0, XATTR_REPLACE); } -/* This is what filldir will use: - * r_pos will always contain the amount of space required for the entire - * list. If r_pos becomes larger than r_size, we need more space and we - * return an error indicating this. 
If r_pos is less than r_size, then we've - * filled the buffer successfully and we return success */ -struct reiserfs_listxattr_buf { - int r_pos; - int r_size; - char *r_buf; - struct inode *r_inode; +struct listxattr_buf { + size_t size; + size_t pos; + char *buf; + struct inode *inode; }; -static int -reiserfs_listxattr_filler(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +static int listxattr_filler(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) { - struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf; - int len = 0; - if (name[0] != '.' - || (namelen != 1 && (name[1] != '.' || namelen != 2))) { - struct reiserfs_xattr_handler *xah = - find_xattr_handler_prefix(name); - if (!xah) - return 0; /* Unsupported xattr name, skip it */ - - /* We call ->list() twice because the operation isn't required to just - * return the name back - we want to make sure we have enough space */ - len += xah->list(b->r_inode, name, namelen, NULL); - - if (len) { - if (b->r_pos + len + 1 <= b->r_size) { - char *p = b->r_buf + b->r_pos; - p += xah->list(b->r_inode, name, namelen, p); - *p++ = '\0'; - } - b->r_pos += len + 1; + struct listxattr_buf *b = (struct listxattr_buf *)buf; + size_t size; + if (name[0] != '.' || + (namelen != 1 && (name[1] != '.' || namelen != 2))) { + struct xattr_handler *handler; + handler = find_xattr_handler_prefix(b->inode->i_sb->s_xattr, + name); + if (!handler) /* Unsupported xattr name */ + return 0; + if (b->buf) { + size = handler->list(b->inode, b->buf + b->pos, + b->size, name, namelen); + if (size > b->size) + return -ERANGE; + } else { + size = handler->list(b->inode, NULL, 0, name, namelen); } - } + b->pos += size; + } return 0; } /* * Inode operation listxattr() + * + * We totally ignore the generic listxattr here because it would be stupid + * not to. Since the xattrs are organized in a directory, we can just + * readdir to find them. */ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) { struct dentry *dir; int err = 0; - struct reiserfs_listxattr_buf buf; + struct listxattr_buf buf = { + .inode = dentry->d_inode, + .buf = buffer, + .size = buffer ? size : 0, + }; if (!dentry->d_inode) return -EINVAL; @@ -985,120 +1000,22 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) if (IS_ERR(dir)) { err = PTR_ERR(dir); if (err == -ENODATA) - err = 0; /* Not an error if there aren't any xattrs */ + err = 0; /* Not an error if there aren't any xattrs */ goto out; } - buf.r_buf = buffer; - buf.r_size = buffer ? 
size : 0; - buf.r_pos = 0; - buf.r_inode = dentry->d_inode; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = xattr_readdir(dir->d_inode, reiserfs_listxattr_filler, &buf); + err = xattr_readdir(dir->d_inode, listxattr_filler, &buf); mutex_unlock(&dir->d_inode->i_mutex); - if (!err) { - if (buf.r_pos > buf.r_size && buffer != NULL) - err = -ERANGE; - else - err = buf.r_pos; - } + if (!err) + err = buf.pos; dput(dir); out: return err; } -/* This is the implementation for the xattr plugin infrastructure */ -static LIST_HEAD(xattr_handlers); -static DEFINE_RWLOCK(handler_lock); - -static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char - *prefix) -{ - struct reiserfs_xattr_handler *xah = NULL; - struct list_head *p; - - read_lock(&handler_lock); - list_for_each(p, &xattr_handlers) { - xah = list_entry(p, struct reiserfs_xattr_handler, handlers); - if (strncmp(xah->prefix, prefix, strlen(xah->prefix)) == 0) - break; - xah = NULL; - } - - read_unlock(&handler_lock); - return xah; -} - -static void __unregister_handlers(void) -{ - struct reiserfs_xattr_handler *xah; - struct list_head *p, *tmp; - - list_for_each_safe(p, tmp, &xattr_handlers) { - xah = list_entry(p, struct reiserfs_xattr_handler, handlers); - if (xah->exit) - xah->exit(); - - list_del_init(p); - } - INIT_LIST_HEAD(&xattr_handlers); -} - -int __init reiserfs_xattr_register_handlers(void) -{ - int err = 0; - struct reiserfs_xattr_handler *xah; - struct list_head *p; - - write_lock(&handler_lock); - - /* If we're already initialized, nothing to do */ - if (!list_empty(&xattr_handlers)) { - write_unlock(&handler_lock); - return 0; - } - - /* Add the handlers */ - list_add_tail(&user_handler.handlers, &xattr_handlers); - list_add_tail(&trusted_handler.handlers, &xattr_handlers); -#ifdef CONFIG_REISERFS_FS_SECURITY - list_add_tail(&security_handler.handlers, &xattr_handlers); -#endif -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - list_add_tail(&posix_acl_access_handler.handlers, &xattr_handlers); - list_add_tail(&posix_acl_default_handler.handlers, &xattr_handlers); -#endif - - /* Run initializers, if available */ - list_for_each(p, &xattr_handlers) { - xah = list_entry(p, struct reiserfs_xattr_handler, handlers); - if (xah->init) { - err = xah->init(); - if (err) { - list_del_init(p); - break; - } - } - } - - /* Clean up other handlers, if any failed */ - if (err) - __unregister_handlers(); - - write_unlock(&handler_lock); - return err; -} - -void reiserfs_xattr_unregister_handlers(void) -{ - write_lock(&handler_lock); - __unregister_handlers(); - write_unlock(&handler_lock); -} - static int reiserfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; @@ -1157,20 +1074,16 @@ static int xattr_mount_check(struct super_block *s) { /* We need generation numbers to ensure that the oid mapping is correct * v3.5 filesystems don't have them. */ - if (!old_format_only(s)) { - set_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); - } else if (reiserfs_xattrs_optional(s)) { - /* Old format filesystem, but optional xattrs have been enabled - * at mount time. Error out. */ - reiserfs_warning(s, "jdm-20005", - "xattrs/ACLs not supported on pre v3.6 " - "format filesystem. Failing mount."); - return -EOPNOTSUPP; - } else { - /* Old format filesystem, but no optional xattrs have - * been enabled. This means we silently disable xattrs - * on the filesystem. 
*/ - clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + if (old_format_only(s)) { + if (reiserfs_xattrs_optional(s)) { + /* Old format filesystem, but optional xattrs have + * been enabled. Error out. */ + reiserfs_warning(s, "jdm-2005", + "xattrs/ACLs not supported " + "on pre-v3.6 format filesystems. " + "Failing mount."); + return -EOPNOTSUPP; + } } return 0; @@ -1251,9 +1164,11 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) } #ifdef CONFIG_REISERFS_FS_XATTR + if (!err) + s->s_xattr = reiserfs_xattr_handlers; + error: if (err) { - clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d3ce6ee9b262..bfecf7553002 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -271,7 +271,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) char *name; void *value = NULL; struct posix_acl **p_acl; - size_t size; + size_t size = 0; int error; struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); @@ -308,16 +308,21 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) value = posix_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); - error = reiserfs_xattr_set(inode, name, value, size, 0); - } else { - error = reiserfs_xattr_del(inode, name); - if (error == -ENODATA) { - /* This may seem odd here, but it means that the ACL was set - * with a value representable with mode bits. If there was - * an ACL before, reiserfs_xattr_del already dirtied the inode. - */ + } + + error = __reiserfs_xattr_set(inode, name, value, size, 0); + + /* + * Ensure that the inode gets dirtied if we're only using + * the mode bits and an old ACL didn't exist. We don't need + * to check if the inode is hashed here since we won't get + * called by reiserfs_inherit_default_acl(). 
+ */ + if (error == -ENODATA) { + error = 0; + if (type == ACL_TYPE_ACCESS) { + inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - error = 0; } } @@ -474,33 +479,22 @@ posix_acl_access_set(struct inode *inode, const char *name, return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); } -static int posix_acl_access_del(struct inode *inode, const char *name) -{ - struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); - if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) - return -EINVAL; - iset_acl(inode, &reiserfs_i->i_acl_access, ERR_PTR(-ENODATA)); - return 0; -} - -static int -posix_acl_access_list(struct inode *inode, const char *name, int namelen, - char *out) +static size_t posix_acl_access_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) { - int len = namelen; + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); if (!reiserfs_posixacl(inode->i_sb)) return 0; - if (out) - memcpy(out, name, len); - - return len; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; } -struct reiserfs_xattr_handler posix_acl_access_handler = { +struct xattr_handler reiserfs_posix_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .get = posix_acl_access_get, .set = posix_acl_access_set, - .del = posix_acl_access_del, .list = posix_acl_access_list, }; @@ -522,32 +516,21 @@ posix_acl_default_set(struct inode *inode, const char *name, return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); } -static int posix_acl_default_del(struct inode *inode, const char *name) +static size_t posix_acl_default_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) { - struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); - if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) - return -EINVAL; - iset_acl(inode, &reiserfs_i->i_acl_default, ERR_PTR(-ENODATA)); - return 0; -} - -static int -posix_acl_default_list(struct inode *inode, const char *name, int namelen, - char *out) -{ - int len = namelen; + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); if (!reiserfs_posixacl(inode->i_sb)) return 0; - if (out) - memcpy(out, name, len); - - return len; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; } -struct reiserfs_xattr_handler posix_acl_default_handler = { +struct xattr_handler reiserfs_posix_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .get = posix_acl_default_get, .set = posix_acl_default_set, - .del = posix_acl_default_del, .list = posix_acl_default_list, }; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 1958b361c35d..2aacf1fe69fd 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -31,35 +31,25 @@ security_set(struct inode *inode, const char *name, const void *buffer, return reiserfs_xattr_set(inode, name, buffer, size, flags); } -static int security_del(struct inode *inode, const char *name) +static size_t security_list(struct inode *inode, char *list, size_t list_len, + const char *name, size_t namelen) { - if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) - return -EINVAL; - - if (IS_PRIVATE(inode)) - return -EPERM; - - return 0; -} - -static int -security_list(struct inode *inode, const char *name, int namelen, char *out) -{ - int len = namelen; + const size_t len = namelen + 1; if (IS_PRIVATE(inode)) return 0; - if (out) - memcpy(out, name, len); + if (list && len <= list_len) { + memcpy(list, name, namelen); + list[namelen] = 
'\0'; + } return len; } -struct reiserfs_xattr_handler security_handler = { +struct xattr_handler reiserfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = security_get, .set = security_set, - .del = security_del, .list = security_list, }; diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 076ad388d489..a865042f75e2 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -13,10 +13,7 @@ trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs(inode->i_sb)) - return -EOPNOTSUPP; - - if (!(capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) return -EPERM; return reiserfs_xattr_get(inode, name, buffer, size); @@ -29,50 +26,30 @@ trusted_set(struct inode *inode, const char *name, const void *buffer, if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs(inode->i_sb)) - return -EOPNOTSUPP; - - if (!(capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) return -EPERM; return reiserfs_xattr_set(inode, name, buffer, size, flags); } -static int trusted_del(struct inode *inode, const char *name) +static size_t trusted_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) { - if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) - return -EINVAL; + const size_t len = name_len + 1; - if (!reiserfs_xattrs(inode->i_sb)) - return -EOPNOTSUPP; - - if (!(capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))) - return -EPERM; - - return 0; -} - -static int -trusted_list(struct inode *inode, const char *name, int namelen, char *out) -{ - int len = namelen; - - if (!reiserfs_xattrs(inode->i_sb)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) return 0; - if (!(capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))) - return 0; - - if (out) - memcpy(out, name, len); - + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } return len; } -struct reiserfs_xattr_handler trusted_handler = { +struct xattr_handler reiserfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = trusted_get, .set = trusted_set, - .del = trusted_del, .list = trusted_list, }; diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 1384efcb938e..e3238dc4f3db 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -6,10 +6,6 @@ #include #include -#ifdef CONFIG_REISERFS_FS_POSIX_ACL -# include -#endif - static int user_get(struct inode *inode, const char *name, void *buffer, size_t size) { @@ -25,7 +21,6 @@ static int user_set(struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; @@ -34,33 +29,23 @@ user_set(struct inode *inode, const char *name, const void *buffer, return reiserfs_xattr_set(inode, name, buffer, size, flags); } -static int user_del(struct inode *inode, const char *name) +static size_t user_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) { - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) - return -EINVAL; - - if (!reiserfs_xattrs_user(inode->i_sb)) - return -EOPNOTSUPP; - return 0; -} + const size_t len = name_len + 1; -static int -user_list(struct inode *inode, const char *name, int namelen, char *out) -{ - int len = namelen; if (!reiserfs_xattrs_user(inode->i_sb)) return 0; - - if (out) - 
memcpy(out, name, len); - + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } return len; } -struct reiserfs_xattr_handler user_handler = { +struct xattr_handler reiserfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = user_get, .set = user_set, - .del = user_del, .list = user_list, }; diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index fe00f781a622..d180446470f2 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -52,10 +52,8 @@ int reiserfs_acl_chmod(struct inode *inode); int reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, struct inode *inode); int reiserfs_cache_default_acl(struct inode *dir); -extern int reiserfs_xattr_posix_acl_init(void) __init; -extern int reiserfs_xattr_posix_acl_exit(void); -extern struct reiserfs_xattr_handler posix_acl_default_handler; -extern struct reiserfs_xattr_handler posix_acl_access_handler; +extern struct xattr_handler reiserfs_posix_acl_default_handler; +extern struct xattr_handler reiserfs_posix_acl_access_handler; static inline void reiserfs_init_acl_access(struct inode *inode) { @@ -75,16 +73,6 @@ static inline struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) return NULL; } -static inline int reiserfs_xattr_posix_acl_init(void) -{ - return 0; -} - -static inline int reiserfs_xattr_posix_acl_exit(void) -{ - return 0; -} - static inline int reiserfs_acl_chmod(struct inode *inode) { return 0; diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index c8aee41ccc23..4686b90886ed 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -451,7 +451,6 @@ enum reiserfs_mount_options { REISERFS_NO_UNHASHED_RELOCATION, REISERFS_HASHED_RELOCATION, REISERFS_ATTRS, - REISERFS_XATTRS, REISERFS_XATTRS_USER, REISERFS_POSIXACL, REISERFS_BARRIER_NONE, @@ -489,7 +488,7 @@ enum reiserfs_mount_options { #define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG)) #define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED)) #define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK)) -#define reiserfs_xattrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS)) +#define reiserfs_xattrs(s) ((s)->s_xattr != NULL) #define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER)) #define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL)) #define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s)) diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 3bd154fd56e9..958fcaca0e8b 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -29,20 +29,6 @@ struct iattr; struct super_block; struct nameidata; -struct reiserfs_xattr_handler { - char *prefix; - int (*init) (void); - void (*exit) (void); - int (*get) (struct inode * inode, const char *name, void *buffer, - size_t size); - int (*set) (struct inode * inode, const char *name, const void *buffer, - size_t size, int flags); - int (*del) (struct inode * inode, const char *name); - int (*list) (struct inode * inode, const char *name, int namelen, - char *out); - struct list_head handlers; -}; - int reiserfs_xattr_register_handlers(void) __init; void reiserfs_xattr_unregister_handlers(void); int reiserfs_xattr_init(struct super_block *sb, int mount_flags); @@ -59,13 +45,14 @@ ssize_t 
reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size); int reiserfs_removexattr(struct dentry *dentry, const char *name); int reiserfs_permission(struct inode *inode, int mask); -int reiserfs_xattr_del(struct inode *, const char *); -int reiserfs_xattr_get(const struct inode *, const char *, void *, size_t); +int reiserfs_xattr_get(struct inode *, const char *, void *, size_t); +int __reiserfs_xattr_set(struct inode *, const char *, const void *, + size_t, int); int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int); -extern struct reiserfs_xattr_handler user_handler; -extern struct reiserfs_xattr_handler trusted_handler; -extern struct reiserfs_xattr_handler security_handler; +extern struct xattr_handler reiserfs_xattr_user_handler; +extern struct xattr_handler reiserfs_xattr_trusted_handler; +extern struct xattr_handler reiserfs_xattr_security_handler; static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { -- cgit v1.2.3-71-gd317 From 0ab2621ebd9a28bf7a524ecd50d492a10579dfcc Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:39 -0400 Subject: reiserfs: journaled xattrs Deadlocks are possible in the xattr code between the journal lock and the xattr sems. This patch implements journalling for xattr operations. The benefit is twofold: * It gets rid of the deadlock possibility by always ensuring that xattr write operations are initiated inside a transaction. * It corrects the problem where xattr backing files aren't considered any differently than normal files, despite the fact they are metadata. I discussed the added journal load with Chris Mason, and we decided that since xattrs (versus other journal activity) is fairly rare, the introduction of larger transactions to support journaled xattrs wouldn't be too big a deal. 
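In short, reiserfs_xattr_set() becomes a thin wrapper that estimates the journal blocks it needs, opens a transaction, and delegates the actual write to the new reiserfs_xattr_set_handle(). A condensed sketch of that pattern (error paths trimmed; the block-count helpers are the ones introduced further down in this patch):

/* Sketch: every xattr write now runs inside its own transaction. */
int reiserfs_xattr_set(struct inode *inode, const char *name,
		       const void *buffer, size_t buffer_size, int flags)
{
	struct reiserfs_transaction_handle th;
	int error, error2;
	/* Blocks for writing the value itself... */
	size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);

	/* ...plus the xattr root/dir/file if they may have to be created. */
	if (!(flags & XATTR_REPLACE))
		jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);

	reiserfs_write_lock(inode->i_sb);
	error = journal_begin(&th, inode->i_sb, jbegin_count);
	if (!error) {
		/* The write itself always runs under the transaction now. */
		error = reiserfs_xattr_set_handle(&th, inode, name,
						  buffer, buffer_size, flags);
		error2 = journal_end(&th, inode->i_sb, jbegin_count);
		if (!error)
			error = error2;
	}
	reiserfs_write_unlock(inode->i_sb);
	return error;
}

Callers that already hold a transaction (the ACL code, for example) call reiserfs_xattr_set_handle() directly, so their writes nest inside the outer transaction instead of starting a new one.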
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 3 +- fs/reiserfs/namei.c | 14 ++---- fs/reiserfs/xattr.c | 39 +++++++++++---- fs/reiserfs/xattr_acl.c | 105 +++++++++++++++++++++++++++++++---------- include/linux/reiserfs_acl.h | 3 +- include/linux/reiserfs_fs.h | 4 ++ include/linux/reiserfs_xattr.h | 40 +++++++++++++++- 7 files changed, 159 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 50a73e7afdc8..995f6975cae1 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1914,9 +1914,8 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_inserted_sd; } - /* XXX CHECK THIS */ if (reiserfs_posixacl(inode->i_sb)) { - retval = reiserfs_inherit_default_acl(dir, dentry, inode); + retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); if (retval) { err = retval; reiserfs_check_path(&path_to_key); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ddf1bcd41c87..d9c1c8bd2950 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -598,15 +598,13 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); struct reiserfs_transaction_handle th; - int locked; if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } new_inode_init(inode, dir, mode); - locked = reiserfs_cache_default_acl(dir); - + jbegin_count += reiserfs_cache_default_acl(dir); reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -662,7 +660,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - int locked; if (!new_valid_dev(rdev)) return -EINVAL; @@ -672,8 +669,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, } new_inode_init(inode, dir, mode); - locked = reiserfs_cache_default_acl(dir); - + jbegin_count += reiserfs_cache_default_acl(dir); reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -732,7 +728,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - int locked; #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ @@ -744,8 +739,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) } new_inode_init(inode, dir, mode); - locked = reiserfs_cache_default_acl(dir); - + jbegin_count += reiserfs_cache_default_acl(dir); reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -1034,8 +1028,6 @@ static int reiserfs_symlink(struct inode *parent_dir, memcpy(name, symname, strlen(symname)); padd_item(name, item_len, strlen(symname)); - /* We would inherit the default ACL here, but symlinks don't get ACLs */ - retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); if (retval) { drop_new_inode(inode); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index d3ce27436605..c2e3a92aaf2b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -632,8 +632,9 @@ out_dput: * inode->i_mutex: down */ int -__reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, - size_t buffer_size, int flags) 
+reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, + struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) { int err = 0; struct dentry *dentry; @@ -723,14 +724,34 @@ out_unlock: return err; } -int -reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, - size_t buffer_size, int flags) +/* We need to start a transaction to maintain lock ordering */ +int reiserfs_xattr_set(struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) { - int err = __reiserfs_xattr_set(inode, name, buffer, buffer_size, flags); - if (err == -ENODATA) - err = 0; - return err; + + struct reiserfs_transaction_handle th; + int error, error2; + size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size); + + if (!(flags & XATTR_REPLACE)) + jbegin_count += reiserfs_xattr_jcreate_nblocks(inode); + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jbegin_count); + if (error) { + reiserfs_write_unlock(inode->i_sb); + return error; + } + + error = reiserfs_xattr_set_handle(&th, inode, name, + buffer, buffer_size, flags); + + error2 = journal_end(&th, inode->i_sb, jbegin_count); + if (error == 0) + error = error2; + reiserfs_write_unlock(inode->i_sb); + + return error; } /* diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index bfecf7553002..d423416d93d1 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -10,15 +10,17 @@ #include #include -static int reiserfs_set_acl(struct inode *inode, int type, +static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, + struct inode *inode, int type, struct posix_acl *acl); static int xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) { struct posix_acl *acl; - int error; - + int error, error2; + struct reiserfs_transaction_handle th; + size_t jcreate_blocks; if (!reiserfs_posixacl(inode->i_sb)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -36,7 +38,21 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) } else acl = NULL; - error = reiserfs_set_acl(inode, type, acl); + /* Pessimism: We can't assume that anything from the xattr root up + * has been created. 
*/ + + jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, size) * 2; + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jcreate_blocks); + if (error == 0) { + error = reiserfs_set_acl(&th, inode, type, acl); + error2 = journal_end(&th, inode->i_sb, jcreate_blocks); + if (error2) + error = error2; + } + reiserfs_write_unlock(inode->i_sb); release_and_out: posix_acl_release(acl); @@ -266,7 +282,8 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) * BKL held [before 2.5.x] */ static int -reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, + int type, struct posix_acl *acl) { char *name; void *value = NULL; @@ -310,7 +327,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return (int)PTR_ERR(value); } - error = __reiserfs_xattr_set(inode, name, value, size, 0); + error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0); /* * Ensure that the inode gets dirtied if we're only using @@ -337,7 +354,8 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) /* dir->i_mutex: locked, * inode is new and not released into the wild yet */ int -reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, + struct inode *dir, struct dentry *dentry, struct inode *inode) { struct posix_acl *acl; @@ -374,7 +392,8 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, /* Copy the default ACL to the default ACL of a new directory */ if (S_ISDIR(inode->i_mode)) { - err = reiserfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); + err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, + acl); if (err) goto cleanup; } @@ -395,9 +414,9 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, /* If we need an ACL.. */ if (need_acl > 0) { - err = - reiserfs_set_acl(inode, ACL_TYPE_ACCESS, - acl_copy); + err = reiserfs_set_acl(th, inode, + ACL_TYPE_ACCESS, + acl_copy); if (err) goto cleanup_copy; } @@ -415,21 +434,45 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, return err; } -/* Looks up and caches the result of the default ACL. - * We do this so that we don't need to carry the xattr_sem into - * reiserfs_new_inode if we don't need to */ +/* This is used to cache the default acl before a new object is created. + * The biggest reason for this is to get an idea of how many blocks will + * actually be required for the create operation if we must inherit an ACL. + * An ACL write can add up to 3 object creations and an additional file write + * so we'd prefer not to reserve that many blocks in the journal if we can. + * It also has the advantage of not loading the ACL with a transaction open, + * this may seem silly, but if the owner of the directory is doing the + * creation, the ACL may not be loaded since the permissions wouldn't require + * it. + * We return the number of blocks required for the transaction. 
+ */ int reiserfs_cache_default_acl(struct inode *inode) { - int ret = 0; - if (reiserfs_posixacl(inode->i_sb) && !IS_PRIVATE(inode)) { - struct posix_acl *acl; - acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); - ret = (acl && !IS_ERR(acl)); - if (ret) - posix_acl_release(acl); + struct posix_acl *acl; + int nblocks = 0; + + if (IS_PRIVATE(inode)) + return 0; + + acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); + + if (acl && !IS_ERR(acl)) { + int size = reiserfs_acl_size(acl->a_count); + + /* Other xattrs can be created during inode creation. We don't + * want to claim too many blocks, so we check to see if we + * we need to create the tree to the xattrs, and then we + * just want two files. */ + nblocks = reiserfs_xattr_jcreate_nblocks(inode); + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* We need to account for writes + bitmaps for two files */ + nblocks += reiserfs_xattr_nblocks(inode, size) * 4; + posix_acl_release(acl); } - return ret; + return nblocks; } int reiserfs_acl_chmod(struct inode *inode) @@ -455,8 +498,22 @@ int reiserfs_acl_chmod(struct inode *inode) if (!clone) return -ENOMEM; error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) - error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + if (!error) { + struct reiserfs_transaction_handle th; + size_t size = reiserfs_xattr_nblocks(inode, + reiserfs_acl_size(clone->a_count)); + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, size * 2); + if (!error) { + int error2; + error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, + clone); + error2 = journal_end(&th, inode->i_sb, size * 2); + if (error2) + error = error2; + } + reiserfs_write_unlock(inode->i_sb); + } posix_acl_release(clone); return error; } diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index d180446470f2..52240e02de02 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -49,7 +49,8 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); int reiserfs_acl_chmod(struct inode *inode); -int reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, +int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, + struct inode *dir, struct dentry *dentry, struct inode *inode); int reiserfs_cache_default_acl(struct inode *dir); extern struct xattr_handler reiserfs_posix_acl_default_handler; diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index e00d240314c5..67ad310fa88b 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1615,6 +1615,10 @@ struct reiserfs_journal_header { #define JOURNAL_MAX_COMMIT_AGE 30 #define JOURNAL_MAX_TRANS_AGE 30 #define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9) +#define JOURNAL_BLOCKS_PER_OBJECT(sb) (JOURNAL_PER_BALANCE_CNT * 3 + \ + 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \ + REISERFS_QUOTA_TRANS_BLOCKS(sb))) + #ifdef CONFIG_QUOTA /* We need to update data and inode (atime) */ #define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & (1<i_sb->s_blocksize); + ret >>= inode->i_sb->s_blocksize_bits; + } + return ret; +} + +/* We may have to create up to 3 objects: xattr root, xattr dir, xattr file. + * Let's try to be smart about it. + * xattr root: We cache it. If it's not cached, we may need to create it. + * xattr dir: If anything has been loaded for this inode, we can set a flag + * saying so. 
+ * xattr file: Since we don't cache xattrs, we can't tell. We always include + * blocks for it. + * + * However, since root and dir can be created between calls - YOU MUST SAVE + * THIS VALUE. + */ +static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode) +{ + size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + + if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) { + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + if (REISERFS_SB(inode->i_sb)->xattr_root == NULL) + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + } + + return nblocks; +} + static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { init_rwsem(&REISERFS_I(inode)->i_xattr_sem); -- cgit v1.2.3-71-gd317 From a41f1a4715f26f7bc4d047d0bc7710145c8e69c7 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:40 -0400 Subject: reiserfs: use generic readdir for operations across all xattrs The current reiserfs xattr implementation open codes reiserfs_readdir and frees the path before calling the filldir function. Typically, the filldir function is something that modifies the file system, such as a chown or an inode deletion that also require reading of an inode associated with each direntry. Since the file system is modified, the path retained becomes invalid for the next run. In addition, it runs backwards in attempt to minimize activity. This is clearly suboptimal from a code cleanliness perspective as well as performance-wise. This patch implements a generic reiserfs_for_each_xattr that uses the generic readdir and a specific filldir routine that simply populates an array of dentries and then performs a specific operation on them. When all files have been operated on, it then calls the operation on the directory itself. The result is a noticable code reduction and better performance. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/dir.c | 28 +-- fs/reiserfs/xattr.c | 402 +++++++++++++------------------------------- include/linux/reiserfs_fs.h | 1 + 3 files changed, 131 insertions(+), 300 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index e6b03d2020c1..67a80d7e59e2 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -41,10 +41,10 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, #define store_ih(where,what) copy_item_head (where, what) -// -static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, + filldir_t filldir, loff_t *pos) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = dentry->d_inode; struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ INITIALIZE_PATH(path_to_entry); struct buffer_head *bh; @@ -64,13 +64,9 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* form key for search the next directory entry using f_pos field of file structure */ - make_cpu_key(&pos_key, inode, - (filp->f_pos) ? 
(filp->f_pos) : DOT_OFFSET, TYPE_DIRENTRY, - 3); + make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); next_pos = cpu_key_k_offset(&pos_key); - /* reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos); */ - path_to_entry.reada = PATH_READA; while (1) { research: @@ -144,7 +140,7 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* Ignore the .reiserfs_priv entry */ if (reiserfs_xattrs(inode->i_sb) && !old_format_only(inode->i_sb) && - filp->f_path.dentry == inode->i_sb->s_root && + dentry == inode->i_sb->s_root && REISERFS_SB(inode->i_sb)->priv_root && REISERFS_SB(inode->i_sb)->priv_root->d_inode && deh_objectid(deh) == @@ -156,7 +152,7 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } d_off = deh_offset(deh); - filp->f_pos = d_off; + *pos = d_off; d_ino = deh_objectid(deh); if (d_reclen <= 32) { local_buf = small_buf; @@ -223,15 +219,21 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } /* while */ - end: - filp->f_pos = next_pos; +end: + *pos = next_pos; pathrelse(&path_to_entry); reiserfs_check_path(&path_to_entry); - out: +out: reiserfs_write_unlock(inode->i_sb); return ret; } +static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = file->f_path.dentry; + return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos); +} + /* compose directory item containing "." and ".." entries (entries are not aligned to 4 byte boundary) */ /* the last four params are LE */ diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index c2e3a92aaf2b..1baafec64331 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -167,218 +167,65 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) } -/* - * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but - * we need to drop the path before calling the filldir struct. That - * would be a big performance hit to the non-xattr case, so I've copied - * the whole thing for now. --clm - * - * the big difference is that I go backwards through the directory, - * and don't mess with f->f_pos, but the idea is the same. Do some - * action on each and every entry in the directory. - * - * we're called with i_mutex held, so there are no worries about the directory - * changing underneath us. 
- */ -static int __xattr_readdir(struct inode *inode, void *dirent, filldir_t filldir) -{ - struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ - INITIALIZE_PATH(path_to_entry); - struct buffer_head *bh; - int entry_num; - struct item_head *ih, tmp_ih; - int search_res; - char *local_buf; - loff_t next_pos; - char small_buf[32]; /* avoid kmalloc if we can */ - struct reiserfs_de_head *deh; - int d_reclen; - char *d_name; - off_t d_off; - ino_t d_ino; - struct reiserfs_dir_entry de; - - /* form key for search the next directory entry using f_pos field of - file structure */ - next_pos = max_reiserfs_offset(inode); - - while (1) { - research: - if (next_pos <= DOT_DOT_OFFSET) - break; - make_cpu_key(&pos_key, inode, next_pos, TYPE_DIRENTRY, 3); - - search_res = - search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, - &de); - if (search_res == IO_ERROR) { - // FIXME: we could just skip part of directory which could - // not be read - pathrelse(&path_to_entry); - return -EIO; - } - - if (search_res == NAME_NOT_FOUND) - de.de_entry_num--; - - set_de_name_and_namelen(&de); - entry_num = de.de_entry_num; - deh = &(de.de_deh[entry_num]); - - bh = de.de_bh; - ih = de.de_ih; - - if (!is_direntry_le_ih(ih)) { - reiserfs_error(inode->i_sb, "jdm-20000", - "not direntry %h", ih); - break; - } - copy_item_head(&tmp_ih, ih); - - /* we must have found item, that is item of this directory, */ - RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key), - "vs-9000: found item %h does not match to dir we readdir %K", - ih, &pos_key); - - if (deh_offset(deh) <= DOT_DOT_OFFSET) { - break; - } - - /* look for the previous entry in the directory */ - next_pos = deh_offset(deh) - 1; - - if (!de_visible(deh)) - /* it is hidden entry */ - continue; - - d_reclen = entry_length(bh, ih, entry_num); - d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); - d_off = deh_offset(deh); - d_ino = deh_objectid(deh); - - if (!d_name[d_reclen - 1]) - d_reclen = strlen(d_name); - - if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)) { - /* too big to send back to VFS */ - continue; - } - - /* Ignore the .reiserfs_priv entry */ - if (reiserfs_xattrs(inode->i_sb) && - !old_format_only(inode->i_sb) && - deh_objectid(deh) == - le32_to_cpu(INODE_PKEY - (REISERFS_SB(inode->i_sb)->priv_root->d_inode)-> - k_objectid)) - continue; - - if (d_reclen <= 32) { - local_buf = small_buf; - } else { - local_buf = kmalloc(d_reclen, GFP_NOFS); - if (!local_buf) { - pathrelse(&path_to_entry); - return -ENOMEM; - } - if (item_moved(&tmp_ih, &path_to_entry)) { - kfree(local_buf); - - /* sigh, must retry. Do this same offset again */ - next_pos = d_off; - goto research; - } - } - - // Note, that we copy name to user space via temporary - // buffer (local_buf) because filldir will block if - // user space buffer is swapped out. At that time - // entry can move to somewhere else - memcpy(local_buf, d_name, d_reclen); - - /* the filldir function might need to start transactions, - * or do who knows what. 
Release the path now that we've - * copied all the important stuff out of the deh - */ - pathrelse(&path_to_entry); - - if (filldir(dirent, local_buf, d_reclen, d_off, d_ino, - DT_UNKNOWN) < 0) { - if (local_buf != small_buf) { - kfree(local_buf); - } - goto end; - } - if (local_buf != small_buf) { - kfree(local_buf); - } - } /* while */ - - end: - pathrelse(&path_to_entry); - return 0; -} - -/* - * this could be done with dedicated readdir ops for the xattr files, - * but I want to get something working asap - * this is stolen from vfs_readdir - * - */ -static -int xattr_readdir(struct inode *inode, filldir_t filler, void *buf) -{ - int res = -ENOENT; - if (!IS_DEADDIR(inode)) { - lock_kernel(); - res = __xattr_readdir(inode, buf, filler); - unlock_kernel(); - } - return res; -} - /* The following are side effects of other operations that aren't explicitly * modifying extended attributes. This includes operations such as permissions * or ownership changes, object deletions, etc. */ +struct reiserfs_dentry_buf { + struct dentry *xadir; + int count; + struct dentry *dentries[8]; +}; static int -reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, + u64 ino, unsigned int d_type) { - struct dentry *xadir = (struct dentry *)buf; + struct reiserfs_dentry_buf *dbuf = buf; struct dentry *dentry; - int err = 0; - dentry = lookup_one_len(name, xadir, namelen); + if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) + return -ENOSPC; + + if (name[0] == '.' && (name[1] == '\0' || + (name[1] == '.' && name[2] == '\0'))) + return 0; + + dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); - goto out; + return PTR_ERR(dentry); } else if (!dentry->d_inode) { - err = -ENODATA; - goto out_file; + /* A directory entry exists, but no file? */ + reiserfs_error(dentry->d_sb, "xattr-20003", + "Corrupted directory: xattr %s listed but " + "not found for file %s.\n", + dentry->d_name.name, dbuf->xadir->d_name.name); + dput(dentry); + return -EIO; } - /* Skip directories.. 
*/ - if (S_ISDIR(dentry->d_inode->i_mode)) - goto out_file; - - err = xattr_unlink(xadir->d_inode, dentry); - -out_file: - dput(dentry); + dbuf->dentries[dbuf->count++] = dentry; + return 0; +} -out: - return err; +static void +cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) +{ + int i; + for (i = 0; i < buf->count; i++) + if (buf->dentries[i]) + dput(buf->dentries[i]); } -/* This is called w/ inode->i_mutex downed */ -int reiserfs_delete_xattrs(struct inode *inode) +static int reiserfs_for_each_xattr(struct inode *inode, + int (*action)(struct dentry *, void *), + void *data) { - int err = -ENODATA; - struct dentry *dir, *root; - struct reiserfs_transaction_handle th; - int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + - 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + struct dentry *dir; + int i, err = 0; + loff_t pos = 0; + struct reiserfs_dentry_buf buf = { + .count = 0, + }; /* Skip out, an xattr has no xattrs associated with it */ if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) @@ -389,117 +236,97 @@ int reiserfs_delete_xattrs(struct inode *inode) err = PTR_ERR(dir); goto out; } else if (!dir->d_inode) { - dput(dir); - goto out; + err = 0; + goto out_dir; } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = xattr_readdir(dir->d_inode, reiserfs_delete_xattrs_filler, dir); - mutex_unlock(&dir->d_inode->i_mutex); - if (err) { - dput(dir); - goto out; + buf.xadir = dir; + err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); + while ((err == 0 || err == -ENOSPC) && buf.count) { + err = 0; + + for (i = 0; i < buf.count && buf.dentries[i]; i++) { + int lerr = 0; + struct dentry *dentry = buf.dentries[i]; + + if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode)) + lerr = action(dentry, data); + + dput(dentry); + buf.dentries[i] = NULL; + err = lerr ?: err; + } + buf.count = 0; + if (!err) + err = reiserfs_readdir_dentry(dir, &buf, + fill_with_dentries, &pos); } + mutex_unlock(&dir->d_inode->i_mutex); - root = dget(dir->d_parent); - dput(dir); + /* Clean up after a failed readdir */ + cleanup_dentry_buf(&buf); - /* We start a transaction here to avoid a ABBA situation - * between the xattr root's i_mutex and the journal lock. - * Inode creation will inherit an ACL, which requires a - * lookup. The lookup locks the xattr root i_mutex with a - * transaction open. Inode deletion takes teh xattr root - * i_mutex to delete the directory and then starts a - * transaction inside it. Boom. This doesn't incur much - * additional overhead since the reiserfs_rmdir transaction - * will just nest inside the outer transaction. */ - err = journal_begin(&th, inode->i_sb, blocks); if (!err) { - int jerror; - mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_XATTR); - err = xattr_rmdir(root->d_inode, dir); - jerror = journal_end(&th, inode->i_sb, blocks); - mutex_unlock(&root->d_inode->i_mutex); - err = jerror ?: err; + /* We start a transaction here to avoid a ABBA situation + * between the xattr root's i_mutex and the journal lock. + * This doesn't incur much additional overhead since the + * new transaction will just nest inside the + * outer transaction. 
*/ + int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; + err = journal_begin(&th, inode->i_sb, blocks); + if (!err) { + int jerror; + mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, + I_MUTEX_XATTR); + err = action(dir, data); + jerror = journal_end(&th, inode->i_sb, blocks); + mutex_unlock(&dir->d_parent->d_inode->i_mutex); + err = jerror ?: err; + } } - - dput(root); +out_dir: + dput(dir); out: - if (err) - reiserfs_warning(inode->i_sb, "jdm-20004", - "Couldn't remove all xattrs (%d)\n", err); + /* -ENODATA isn't an error */ + if (err == -ENODATA) + err = 0; return err; } -struct reiserfs_chown_buf { - struct inode *inode; - struct dentry *xadir; - struct iattr *attrs; -}; - -/* XXX: If there is a better way to do this, I'd love to hear about it */ -static int -reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +static int delete_one_xattr(struct dentry *dentry, void *data) { - struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; - struct dentry *xafile, *xadir = chown_buf->xadir; - struct iattr *attrs = chown_buf->attrs; - int err = 0; + struct inode *dir = dentry->d_parent->d_inode; - xafile = lookup_one_len(name, xadir, namelen); - if (IS_ERR(xafile)) - return PTR_ERR(xafile); - else if (!xafile->d_inode) { - dput(xafile); - return -ENODATA; - } + /* This is the xattr dir, handle specially. */ + if (S_ISDIR(dentry->d_inode->i_mode)) + return xattr_rmdir(dir, dentry); - if (!S_ISDIR(xafile->d_inode->i_mode)) { - mutex_lock_nested(&xafile->d_inode->i_mutex, I_MUTEX_CHILD); - err = reiserfs_setattr(xafile, attrs); - mutex_unlock(&xafile->d_inode->i_mutex); - } - dput(xafile); + return xattr_unlink(dir, dentry); +} + +static int chown_one_xattr(struct dentry *dentry, void *data) +{ + struct iattr *attrs = data; + return reiserfs_setattr(dentry, attrs); +} +/* No i_mutex, but the inode is unconnected. 
*/ +int reiserfs_delete_xattrs(struct inode *inode) +{ + int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); + if (err) + reiserfs_warning(inode->i_sb, "jdm-20004", + "Couldn't delete all xattrs (%d)\n", err); return err; } +/* inode->i_mutex: down */ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) { - struct dentry *dir; - int err = 0; - struct reiserfs_chown_buf buf; - unsigned int ia_valid = attrs->ia_valid; - - /* Skip out, an xattr has no xattrs associated with it */ - if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) - return 0; - - dir = open_xa_dir(inode, XATTR_REPLACE); - if (IS_ERR(dir)) { - if (PTR_ERR(dir) != -ENODATA) - err = PTR_ERR(dir); - goto out; - } else if (!dir->d_inode) - goto out_dir; - - attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); - buf.xadir = dir; - buf.attrs = attrs; - buf.inode = inode; - - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = xattr_readdir(dir->d_inode, reiserfs_chown_xattrs_filler, &buf); - - if (!err) - err = reiserfs_setattr(dir, attrs); - mutex_unlock(&dir->d_inode->i_mutex); - - attrs->ia_valid = ia_valid; -out_dir: - dput(dir); -out: + int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); if (err) reiserfs_warning(inode->i_sb, "jdm-20007", "Couldn't chown all xattrs (%d)\n", err); @@ -1004,6 +831,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) { struct dentry *dir; int err = 0; + loff_t pos = 0; struct listxattr_buf buf = { .inode = dentry->d_inode, .buf = buffer, @@ -1026,7 +854,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = xattr_readdir(dir->d_inode, listxattr_filler, &buf); + err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos); mutex_unlock(&dir->d_inode->i_mutex); if (!err) diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 67ad310fa88b..c0365e07fce6 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1984,6 +1984,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations; extern const struct inode_operations reiserfs_symlink_inode_operations; extern const struct inode_operations reiserfs_special_inode_operations; extern const struct file_operations reiserfs_dir_operations; +int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *); /* tail_conversion.c */ int direct2indirect(struct reiserfs_transaction_handle *, struct inode *, -- cgit v1.2.3-71-gd317 From 57fe60df62410f949da094d06ced1dda9575b69c Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:41 -0400 Subject: reiserfs: add atomic addition of selinux attributes during inode creation Some time ago, some changes were made to make security inode attributes be atomically written during inode creation. ReiserFS fell behind in this area, but with the reworking of the xattr code, it's now fairly easy to add. The following patch adds the ability for security attributes to be added automatically during inode creation. 
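To make the new calling convention easier to follow, here is a condensed, hypothetical sketch of a create path using the security handle added by this patch. It is pieced together from the reiserfs_create() and reiserfs_new_inode() hunks that follow; create_with_security() and BASE_CREATE_CREDITS are illustrative names only, the reiserfs_write_lock() locking, journal_end() and error unwinding of the real namei.c callers are elided, and the release step follows the contract documented above the helpers in xattr_security.c rather than mirroring any particular caller.

static int create_with_security(struct inode *dir, struct dentry *dentry,
				int mode, struct inode *inode)
{
	struct reiserfs_transaction_handle th;
	struct reiserfs_security_handle security;
	int jbegin_count = BASE_CREATE_CREDITS;	/* stand-in, not a real macro */
	int retval;

	/* Ask the security module for the initial context before any
	 * transaction is open.  A negative return is an error; a
	 * non-negative return is the number of extra journal blocks
	 * needed to store the xattr, folded into the transaction size. */
	retval = reiserfs_security_init(dir, inode, &security);
	if (retval < 0) {
		drop_new_inode(inode);
		return retval;
	}
	jbegin_count += retval;

	retval = journal_begin(&th, dir->i_sb, jbegin_count);
	if (retval == 0)
		/* reiserfs_new_inode() now takes the handle and calls
		 * reiserfs_security_write() inside this same transaction,
		 * so the security xattr becomes visible together with the
		 * new stat data. */
		retval = reiserfs_new_inode(&th, dir, mode, NULL,
					    0 /* i_size */, dentry, inode,
					    &security);

	/* Release whatever security_inode_init_security() allocated. */
	reiserfs_security_free(&security);
	return retval;
}

The upshot is that a crash between creating an inode and writing its security xattr can no longer leave a file without its label, since both updates land in a single transaction.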
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 16 ++++++++++++- fs/reiserfs/namei.c | 37 +++++++++++++++++++++++++---- fs/reiserfs/xattr_security.c | 54 ++++++++++++++++++++++++++++++++++++++++++ include/linux/reiserfs_fs.h | 4 +++- include/linux/reiserfs_xattr.h | 32 +++++++++++++++++++++++++ 5 files changed, 137 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 995f6975cae1..fcd302d81447 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1747,7 +1747,8 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, /* 0 for regular, EMTRY_DIR_SIZE for dirs, strlen (symname) for symlinks) */ loff_t i_size, struct dentry *dentry, - struct inode *inode) + struct inode *inode, + struct reiserfs_security_handle *security) { struct super_block *sb; struct reiserfs_iget_args args; @@ -1929,6 +1930,19 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } else if (IS_PRIVATE(dir)) inode->i_flags |= S_PRIVATE; + if (security->name) { + retval = reiserfs_security_write(th, inode, security); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + retval = journal_end(th, th->t_super, + th->t_blocks_allocated); + if (retval) + err = retval; + goto out_inserted_sd; + } + } + reiserfs_update_sd(th, inode); reiserfs_check_path(&path_to_key); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index d9c1c8bd2950..cb1a9e977907 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -598,6 +598,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; @@ -605,6 +606,12 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, new_inode_init(inode, dir, mode); jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -615,7 +622,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, retval = reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, - inode); + inode, &security); if (retval) goto out_failed; @@ -655,6 +662,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, int retval; struct inode *inode; struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + @@ -670,6 +678,12 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, new_inode_init(inode, dir, mode); jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -680,7 +694,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, retval = reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, - inode); + inode, &security); if (retval) { goto out_failed; } @@ 
-723,6 +737,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) int retval; struct inode *inode; struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + @@ -740,6 +755,12 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) new_inode_init(inode, dir, mode); jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -756,7 +777,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , old_format_only(dir->i_sb) ? EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, - dentry, inode); + dentry, inode, &security); if (retval) { dir->i_nlink--; goto out_failed; @@ -999,6 +1020,7 @@ static int reiserfs_symlink(struct inode *parent_dir, char *name; int item_len; struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; int mode = S_IFLNK | S_IRWXUGO; /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ int jbegin_count = @@ -1011,6 +1033,13 @@ static int reiserfs_symlink(struct inode *parent_dir, } new_inode_init(inode, parent_dir, mode); + retval = reiserfs_security_init(parent_dir, inode, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + reiserfs_write_lock(parent_dir->i_sb); item_len = ROUND_UP(strlen(symname)); if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) { @@ -1037,7 +1066,7 @@ static int reiserfs_symlink(struct inode *parent_dir, retval = reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), - dentry, inode); + dentry, inode, &security); kfree(name); if (retval) { /* reiserfs_new_inode iputs for us */ goto out_failed; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 2aacf1fe69fd..4d3c20e787c3 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -4,6 +4,7 @@ #include #include #include +#include #include static int @@ -47,6 +48,59 @@ static size_t security_list(struct inode *inode, char *list, size_t list_len, return len; } +/* Initializes the security context for a new inode and returns the number + * of blocks needed for the transaction. If successful, reiserfs_security + * must be released using reiserfs_security_free when the caller is done. */ +int reiserfs_security_init(struct inode *dir, struct inode *inode, + struct reiserfs_security_handle *sec) +{ + int blocks = 0; + int error = security_inode_init_security(inode, dir, &sec->name, + &sec->value, &sec->length); + if (error) { + if (error == -EOPNOTSUPP) + error = 0; + + sec->name = NULL; + sec->value = NULL; + sec->length = 0; + return error; + } + + if (sec->length) { + blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, sec->length); + /* We don't want to count the directories twice if we have + * a default ACL. 
*/ + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + } + return blocks; +} + +int reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec) +{ + int error; + if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value, + sec->length, XATTR_CREATE); + if (error == -ENODATA || error == -EOPNOTSUPP) + error = 0; + + return error; +} + +void reiserfs_security_free(struct reiserfs_security_handle *sec) +{ + kfree(sec->name); + kfree(sec->value); + sec->name = NULL; + sec->value = NULL; +} + struct xattr_handler reiserfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = security_get, diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index c0365e07fce6..eb4e912e6bd3 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1915,10 +1915,12 @@ void make_le_item_head(struct item_head *ih, const struct cpu_key *key, loff_t offset, int type, int length, int entry_count); struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key); +struct reiserfs_security_handle; int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct inode *dir, int mode, const char *symname, loff_t i_size, - struct dentry *dentry, struct inode *inode); + struct dentry *dentry, struct inode *inode, + struct reiserfs_security_handle *security); void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t size); diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 20eca09729a2..dcae01e63e40 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -15,6 +15,12 @@ struct reiserfs_xattr_header { __le32 h_hash; /* hash of the value */ }; +struct reiserfs_security_handle { + char *name; + void *value; + size_t length; +}; + #ifdef __KERNEL__ #include @@ -54,6 +60,14 @@ int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *, extern struct xattr_handler reiserfs_xattr_user_handler; extern struct xattr_handler reiserfs_xattr_trusted_handler; extern struct xattr_handler reiserfs_xattr_security_handler; +#ifdef CONFIG_REISERFS_FS_SECURITY +int reiserfs_security_init(struct inode *dir, struct inode *inode, + struct reiserfs_security_handle *sec); +int reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec); +void reiserfs_security_free(struct reiserfs_security_handle *sec); +#endif #define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) @@ -109,6 +123,24 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode) } #endif /* CONFIG_REISERFS_FS_XATTR */ +#ifndef CONFIG_REISERFS_FS_SECURITY +static inline int reiserfs_security_init(struct inode *dir, + struct inode *inode, + struct reiserfs_security_handle *sec) +{ + return 0; +} +static inline int +reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec) +{ + return 0; +} +static inline void reiserfs_security_free(struct reiserfs_security_handle *sec) +{} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_REISERFS_XATTR_H */ -- cgit v1.2.3-71-gd317 From 0222e6571c332563a48d4cf5487b67feabe60b5e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:44 -0400 Subject: reiserfs: strip trailing 
whitespace This patch strips trailing whitespace from the reiserfs code. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/README | 4 +- fs/reiserfs/do_balan.c | 14 ++--- fs/reiserfs/file.c | 8 +-- fs/reiserfs/fix_node.c | 38 ++++++------- fs/reiserfs/hashes.c | 2 +- fs/reiserfs/ibalance.c | 10 ++-- fs/reiserfs/inode.c | 52 +++++++++--------- fs/reiserfs/ioctl.c | 2 +- fs/reiserfs/journal.c | 120 ++++++++++++++++++++--------------------- fs/reiserfs/lbalance.c | 18 +++---- fs/reiserfs/namei.c | 30 +++++------ fs/reiserfs/objectid.c | 2 +- fs/reiserfs/prints.c | 26 ++++----- fs/reiserfs/procfs.c | 2 +- fs/reiserfs/resize.c | 6 +-- fs/reiserfs/stree.c | 8 +-- fs/reiserfs/super.c | 10 ++-- fs/reiserfs/tail_conversion.c | 2 +- include/linux/reiserfs_fs_sb.h | 14 ++--- 19 files changed, 184 insertions(+), 184 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/README b/fs/reiserfs/README index 90e1670e4e6f..14e8c9d460e5 100644 --- a/fs/reiserfs/README +++ b/fs/reiserfs/README @@ -1,4 +1,4 @@ -[LICENSING] +[LICENSING] ReiserFS is hereby licensed under the GNU General Public License version 2. @@ -31,7 +31,7 @@ the GPL as not allowing those additional licensing options, you read it wrongly, and Richard Stallman agrees with me, when carefully read you can see that those restrictions on additional terms do not apply to the owner of the copyright, and my interpretation of this shall -govern for this license. +govern for this license. Finally, nothing in this license shall be interpreted to allow you to fail to fairly credit me, or to remove my credits, without my diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 723a7f4011d0..4beb964a2a3e 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -76,21 +76,21 @@ inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty #define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty -/* summary: +/* summary: if deleting something ( tb->insert_size[0] < 0 ) return(balance_leaf_when_delete()); (flag d handled here) else if lnum is larger than 0 we put items into the left node if rnum is larger than 0 we put items into the right node if snum1 is larger than 0 we put items into the new node s1 - if snum2 is larger than 0 we put items into the new node s2 + if snum2 is larger than 0 we put items into the new node s2 Note that all *num* count new items being created. It would be easier to read balance_leaf() if each of these summary lines was a separate procedure rather than being inlined. I think that there are many passages here and in balance_leaf_when_delete() in which two calls to one procedure can replace two passages, and it -might save cache space and improve software maintenance costs to do so. +might save cache space and improve software maintenance costs to do so. 
Vladimir made the perceptive comment that we should offload most of the decision making in this function into fix_nodes/check_balance, and @@ -288,15 +288,15 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h ) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0] + int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0] of the affected item */ struct buffer_info bi; struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ int snum[2]; /* number of items that will be placed into S_new (includes partially shifted items) */ - int sbytes[2]; /* if an item is partially shifted into S_new then - if it is a directory item + int sbytes[2]; /* if an item is partially shifted into S_new then + if it is a directory item it is the number of entries from the item that are shifted into S_new else it is the number of bytes from the item that are shifted into S_new @@ -1983,7 +1983,7 @@ static inline void do_balance_starts(struct tree_balance *tb) /* store_print_tb (tb); */ /* do not delete, just comment it out */ -/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, +/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, "check");*/ RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); #ifdef CONFIG_REISERFS_CHECK diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 47bab8978be1..f0160ee03e17 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -20,14 +20,14 @@ ** insertion/balancing, for files that are written in one write. ** It avoids unnecessary tail packings (balances) for files that are written in ** multiple writes and are small enough to have tails. -** +** ** file_release is called by the VFS layer when the file is closed. If ** this is the last open file descriptor, and the file ** small enough to have a tail, and the tail is currently in an ** unformatted node, the tail is converted back into a direct item. -** +** ** We use reiserfs_truncate_file to pack the tail, since it already has -** all the conditions coded. +** all the conditions coded. */ static int reiserfs_file_release(struct inode *inode, struct file *filp) { @@ -223,7 +223,7 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, } /* Write @count bytes at position @ppos in a file indicated by @file - from the buffer @buf. + from the buffer @buf. generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want something simple that works. 
It is not for serious use by general purpose filesystems, excepting the one that it was diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index aee50c97988d..a3be7da3e2b9 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -30,8 +30,8 @@ ** get_direct_parent ** get_neighbors ** fix_nodes - ** - ** + ** + ** **/ #include @@ -377,9 +377,9 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, int needed_nodes; int start_item, /* position of item we start filling node from */ end_item, /* position of item we finish filling node by */ - start_bytes, /* number of first bytes (entries for directory) of start_item-th item + start_bytes, /* number of first bytes (entries for directory) of start_item-th item we do not include into node that is being filled */ - end_bytes; /* number of last bytes (entries for directory) of end_item-th item + end_bytes; /* number of last bytes (entries for directory) of end_item-th item we do node include into node that is being filled */ int split_item_positions[2]; /* these are positions in virtual item of items, that are split between S[0] and @@ -569,7 +569,7 @@ extern struct tree_balance *cur_tb; /* Set parameters for balancing. * Performs write of results of analysis of balancing into structure tb, - * where it will later be used by the functions that actually do the balancing. + * where it will later be used by the functions that actually do the balancing. * Parameters: * tb tree_balance structure; * h current level of the node; @@ -1204,7 +1204,7 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, * h current level of the node; * inum item number in S[h]; * mode i - insert, p - paste; - * Returns: 1 - schedule occurred; + * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. @@ -1239,7 +1239,7 @@ static int ip_check_balance(struct tree_balance *tb, int h) /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. 
where 4th parameter is s1bytes and 5th - s2bytes */ - short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases + short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases 0,1 - do not shift and do not shift but bottle 2 - shift only whole item to left 3 - shift to left and bottle as much as possible @@ -1288,7 +1288,7 @@ static int ip_check_balance(struct tree_balance *tb, int h) create_virtual_node(tb, h); - /* + /* determine maximal number of items we can shift to the left neighbor (in tb structure) and the maximal number of bytes that can flow to the left neighbor from the left most liquid item that cannot be shifted from S[0] entirely (returned value) @@ -1349,13 +1349,13 @@ static int ip_check_balance(struct tree_balance *tb, int h) { int lpar, rpar, nset, lset, rset, lrset; - /* + /* * regular overflowing of the node */ - /* get_num_ver works in 2 modes (FLOW & NO_FLOW) + /* get_num_ver works in 2 modes (FLOW & NO_FLOW) lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) - nset, lset, rset, lrset - shows, whether flowing items give better packing + nset, lset, rset, lrset - shows, whether flowing items give better packing */ #define FLOW 1 #define NO_FLOW 0 /* do not any splitting */ @@ -1545,7 +1545,7 @@ static int ip_check_balance(struct tree_balance *tb, int h) * h current level of the node; * inum item number in S[h]; * mode i - insert, p - paste; - * Returns: 1 - schedule occurred; + * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. @@ -1728,7 +1728,7 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) * h current level of the node; * inum item number in S[h]; * mode i - insert, p - paste; - * Returns: 1 - schedule occurred; + * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. @@ -1822,7 +1822,7 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h) * h current level of the node; * inum item number in S[h]; * mode d - delete, c - cut. - * Returns: 1 - schedule occurred; + * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. @@ -1851,7 +1851,7 @@ static int dc_check_balance(struct tree_balance *tb, int h) * h current level of the node; * inum item number in S[h]; * mode i - insert, p - paste, d - delete, c - cut. - * Returns: 1 - schedule occurred; + * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. @@ -2296,15 +2296,15 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb) * analyze what and where should be moved; * get sufficient number of new nodes; * Balancing will start only after all resources will be collected at a time. - * + * * When ported to SMP kernels, only at the last moment after all needed nodes * are collected in cache, will the resources be locked using the usual * textbook ordered lock acquisition algorithms. Note that ensuring that * this code neither write locks what it does not need to write lock nor locks out of order * will be a pain in the butt that could have been avoided. Grumble grumble. 
-Hans - * + * * fix is meant in the sense of render unchanging - * + * * Latency might be improved by first gathering a list of what buffers are needed * and then getting as many of them in parallel as possible? -Hans * @@ -2316,7 +2316,7 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb) * ins_ih & ins_sd are used when inserting * Returns: 1 - schedule occurred while the function worked; * 0 - schedule didn't occur while the function worked; - * -1 - if no_disk_space + * -1 - if no_disk_space */ int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih, // item head of item being inserted diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c index e664ac16fad9..6471c670743e 100644 --- a/fs/reiserfs/hashes.c +++ b/fs/reiserfs/hashes.c @@ -7,7 +7,7 @@ * (see Applied Cryptography, 2nd edition, p448). * * Jeremy Fitzhardinge 1998 - * + * * Jeremy has agreed to the contents of reiserfs/README. -Hans * Yura's function is added (04/07/2000) */ diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index 063b5514fe29..2074fd95046b 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -278,7 +278,7 @@ static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n) /* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest * last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest - * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest + * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest */ static void internal_copy_pointers_items(struct buffer_info *dest_bi, struct buffer_head *src, @@ -385,7 +385,7 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi, if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ first_pointer = 0; first_item = 0; - /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, + /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, for key - with first_item */ internal_delete_pointers_items(src_bi, first_pointer, first_item, cpy_num - del_par); @@ -453,7 +453,7 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b } } -/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. +/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest. * Replace d_key'th key in buffer cfl. * Delete pointer_amount items and node pointers from buffer src. @@ -518,7 +518,7 @@ static void internal_shift1_left(struct tree_balance *tb, /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */ } -/* Insert d_key'th (delimiting) key from buffer cfr to head of dest. +/* Insert d_key'th (delimiting) key from buffer cfr to head of dest. * Copy n node pointers and n - 1 items from buffer src to buffer dest. * Replace d_key'th key in buffer cfr. * Delete n items and node pointers from buffer src. 
@@ -749,7 +749,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure this means that new pointers and items must be inserted AFTER * child_pos } - else + else { it is the position of the leftmost pointer that must be deleted (together with its corresponding key to the left of the pointer) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index fcd302d81447..d106edaef64f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -52,7 +52,7 @@ void reiserfs_delete_inode(struct inode *inode) /* Do quota update inside a transaction for journaled quotas. We must do that * after delete_object so that quota updates go into the same transaction as * stat data deletion */ - if (!err) + if (!err) DQUOT_FREE_INODE(inode); if (journal_end(&th, inode->i_sb, jbegin_count)) @@ -363,7 +363,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, } /* make sure we don't read more bytes than actually exist in ** the file. This can happen in odd cases where i_size isn't - ** correct, and when direct item padding results in a few + ** correct, and when direct item padding results in a few ** extra bytes at the end of the direct item */ if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) @@ -438,15 +438,15 @@ static int reiserfs_bmap(struct inode *inode, sector_t block, ** -ENOENT instead of a valid buffer. block_prepare_write expects to ** be able to do i/o on the buffers returned, unless an error value ** is also returned. -** +** ** So, this allows block_prepare_write to be used for reading a single block ** in a page. Where it does not produce a valid page for holes, or past the ** end of the file. This turns out to be exactly what we need for reading ** tails for conversion. ** ** The point of the wrapper is forcing a certain value for create, even -** though the VFS layer is calling this function with create==1. If you -** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, +** though the VFS layer is calling this function with create==1. If you +** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, ** don't use this function. */ static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, @@ -602,7 +602,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, int done; int fs_gen; struct reiserfs_transaction_handle *th = NULL; - /* space reserved in transaction batch: + /* space reserved in transaction batch: . 3 balancings in direct->indirect conversion . 1 block involved into reiserfs_update_sd() XXX in practically impossible worst case direct2indirect() @@ -754,7 +754,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_write_unlock(inode->i_sb); /* the item was found, so new blocks were not added to the file - ** there is no need to make sure the inode is updated with this + ** there is no need to make sure the inode is updated with this ** transaction */ return retval; @@ -986,7 +986,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, /* this loop could log more blocks than we had originally asked ** for. So, we have to allow the transaction to end if it is - ** too big or too full. Update the inode so things are + ** too big or too full. 
Update the inode so things are ** consistent if we crash before the function returns ** ** release the path so that anybody waiting on the path before @@ -997,7 +997,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (retval) goto failure; } - /* inserting indirect pointers for a hole can take a + /* inserting indirect pointers for a hole can take a ** long time. reschedule if needed */ cond_resched(); @@ -1444,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode, update sd on unlink all that is required is to check for nlink here. This bug was first found by Sizif when debugging SquidNG/Butterfly, forgotten, and found again after Philippe - Gramoulle reproduced it. + Gramoulle reproduced it. More logical fix would require changes in fs/inode.c:iput() to remove inode from hash-table _after_ fs cleaned disk stuff up and @@ -1619,7 +1619,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync) if (inode->i_sb->s_flags & MS_RDONLY) return -EROFS; /* memory pressure can sometimes initiate write_inode calls with sync == 1, - ** these cases are just when the system needs ram, not when the + ** these cases are just when the system needs ram, not when the ** inode needs to reach disk for safety, and they can safely be ** ignored because the altered inode has already been logged. */ @@ -1736,7 +1736,7 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i /* inserts the stat data into the tree, and then calls reiserfs_new_directory (to insert ".", ".." item if new object is directory) or reiserfs_new_symlink (to insert symlink body if new - object is symlink) or nothing (if new object is regular file) + object is symlink) or nothing (if new object is regular file) NOTE! uid and gid must already be set in the inode. If we return non-zero due to an error, we have to drop the quota previously allocated @@ -1744,7 +1744,7 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i if we return non-zero, we also end the transaction. */ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct inode *dir, int mode, const char *symname, - /* 0 for regular, EMTRY_DIR_SIZE for dirs, + /* 0 for regular, EMTRY_DIR_SIZE for dirs, strlen (symname) for symlinks) */ loff_t i_size, struct dentry *dentry, struct inode *inode, @@ -1794,7 +1794,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_bad_inode; } if (old_format_only(sb)) - /* not a perfect generation count, as object ids can be reused, but + /* not a perfect generation count, as object ids can be reused, but ** this is as good as reiserfs can do right now. ** note that the private part of inode isn't filled in yet, we have ** to use the directory. @@ -2081,7 +2081,7 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) if (p_s_inode->i_size > 0) { if ((error = grab_tail_page(p_s_inode, &page, &bh))) { - // -ENOENT means we truncated past the end of the file, + // -ENOENT means we truncated past the end of the file, // and get_block_create_0 could not find a block to read in, // which is ok. if (error != -ENOENT) @@ -2093,11 +2093,11 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) } } - /* so, if page != NULL, we have a buffer head for the offset at - ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, - ** then we have an unformatted node. Otherwise, we have a direct item, - ** and no zeroing is required on disk. 
We zero after the truncate, - ** because the truncate might pack the item anyway + /* so, if page != NULL, we have a buffer head for the offset at + ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + ** then we have an unformatted node. Otherwise, we have a direct item, + ** and no zeroing is required on disk. We zero after the truncate, + ** because the truncate might pack the item anyway ** (it will unmap bh if it packs). */ /* it is enough to reserve space in transaction for 2 balancings: @@ -2306,8 +2306,8 @@ static int map_block_for_writepage(struct inode *inode, return retval; } -/* - * mason@suse.com: updated in 2.5.54 to follow the same general io +/* + * mason@suse.com: updated in 2.5.54 to follow the same general io * start/recovery path as __block_write_full_page, along with special * code to handle reiserfs tails. */ @@ -2447,7 +2447,7 @@ static int reiserfs_write_full_page(struct page *page, unlock_page(page); /* - * since any buffer might be the only dirty buffer on the page, + * since any buffer might be the only dirty buffer on the page, * the first submit_bh can bring the page out of writeback. * be careful with the buffers. */ @@ -2466,8 +2466,8 @@ static int reiserfs_write_full_page(struct page *page, if (nr == 0) { /* * if this page only had a direct item, it is very possible for - * no io to be required without there being an error. Or, - * someone else could have locked them and sent them down the + * no io to be required without there being an error. Or, + * someone else could have locked them and sent them down the * pipe without locking the page */ bh = head; @@ -2486,7 +2486,7 @@ static int reiserfs_write_full_page(struct page *page, fail: /* catches various errors, we need to make sure any valid dirty blocks - * get to the media. The page is currently locked and not marked for + * get to the media. The page is currently locked and not marked for * writeback */ ClearPageUptodate(page); diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 830332021ed4..0ccc3fdda7bf 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -189,7 +189,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) } /* we unpack by finding the page with the tail, and calling - ** reiserfs_prepare_write on that page. This will force a + ** reiserfs_prepare_write on that page. This will force a ** reiserfs_get_block to unpack the tail for us. */ index = inode->i_size >> PAGE_CACHE_SHIFT; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index db91754cfb83..4f787462becc 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1,36 +1,36 @@ /* ** Write ahead logging implementation copyright Chris Mason 2000 ** -** The background commits make this code very interelated, and +** The background commits make this code very interelated, and ** overly complex. I need to rethink things a bit....The major players: ** -** journal_begin -- call with the number of blocks you expect to log. +** journal_begin -- call with the number of blocks you expect to log. ** If the current transaction is too -** old, it will block until the current transaction is +** old, it will block until the current transaction is ** finished, and then start a new one. -** Usually, your transaction will get joined in with +** Usually, your transaction will get joined in with ** previous ones for speed. 
** -** journal_join -- same as journal_begin, but won't block on the current +** journal_join -- same as journal_begin, but won't block on the current ** transaction regardless of age. Don't ever call -** this. Ever. There are only two places it should be +** this. Ever. There are only two places it should be ** called from, and they are both inside this file. ** -** journal_mark_dirty -- adds blocks into this transaction. clears any flags +** journal_mark_dirty -- adds blocks into this transaction. clears any flags ** that might make them get sent to disk -** and then marks them BH_JDirty. Puts the buffer head -** into the current transaction hash. +** and then marks them BH_JDirty. Puts the buffer head +** into the current transaction hash. ** ** journal_end -- if the current transaction is batchable, it does nothing ** otherwise, it could do an async/synchronous commit, or -** a full flush of all log and real blocks in the +** a full flush of all log and real blocks in the ** transaction. ** -** flush_old_commits -- if the current transaction is too old, it is ended and -** commit blocks are sent to disk. Forces commit blocks -** to disk for all backgrounded commits that have been +** flush_old_commits -- if the current transaction is too old, it is ended and +** commit blocks are sent to disk. Forces commit blocks +** to disk for all backgrounded commits that have been ** around too long. -** -- Note, if you call this as an immediate flush from +** -- Note, if you call this as an immediate flush from ** from within kupdate, it will ignore the immediate flag */ @@ -212,7 +212,7 @@ static void allocate_bitmap_nodes(struct super_block *p_s_sb) list_add(&bn->list, &journal->j_bitmap_nodes); journal->j_free_bitmap_nodes++; } else { - break; // this is ok, we'll try again when more are needed + break; /* this is ok, we'll try again when more are needed */ } } } @@ -283,7 +283,7 @@ static int free_bitmap_nodes(struct super_block *p_s_sb) } /* -** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. +** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. ** jb_array is the array to be filled in. */ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, @@ -315,7 +315,7 @@ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, } /* -** find an available list bitmap. If you can't find one, flush a commit list +** find an available list bitmap. If you can't find one, flush a commit list ** and try again */ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb, @@ -348,7 +348,7 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb, return jb; } -/* +/* ** allocates a new chunk of X nodes, and links them all together as a list. ** Uses the cnode->next and cnode->prev pointers ** returns NULL on failure @@ -376,7 +376,7 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) } /* -** pulls a cnode off the free list, or returns NULL on failure +** pulls a cnode off the free list, or returns NULL on failure */ static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) { @@ -403,7 +403,7 @@ static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) } /* -** returns a cnode to the free list +** returns a cnode to the free list */ static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode *cn) @@ -1192,8 +1192,8 @@ static int flush_commit_list(struct super_block *s, } /* -** flush_journal_list frequently needs to find a newer transaction for a given block. 
This does that, or -** returns NULL if it can't find anything +** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or +** returns NULL if it can't find anything */ static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode @@ -1335,8 +1335,8 @@ static int update_journal_header_block(struct super_block *p_s_sb, return _update_journal_header_block(p_s_sb, offset, trans_id); } -/* -** flush any and all journal lists older than you are +/* +** flush any and all journal lists older than you are ** can only be called from flush_journal_list */ static int flush_older_journal_lists(struct super_block *p_s_sb, @@ -1382,8 +1382,8 @@ static void del_from_work_list(struct super_block *s, ** always set flushall to 1, unless you are calling from inside ** flush_journal_list ** -** IMPORTANT. This can only be called while there are no journal writers, -** and the journal is locked. That means it can only be called from +** IMPORTANT. This can only be called while there are no journal writers, +** and the journal is locked. That means it can only be called from ** do_journal_end, or by journal_release */ static int flush_journal_list(struct super_block *s, @@ -1429,7 +1429,7 @@ static int flush_journal_list(struct super_block *s, goto flush_older_and_return; } - /* start by putting the commit list on disk. This will also flush + /* start by putting the commit list on disk. This will also flush ** the commit lists of any olders transactions */ flush_commit_list(s, jl, 1); @@ -1444,8 +1444,8 @@ static int flush_journal_list(struct super_block *s, goto flush_older_and_return; } - /* loop through each cnode, see if we need to write it, - ** or wait on a more recent transaction, or just ignore it + /* loop through each cnode, see if we need to write it, + ** or wait on a more recent transaction, or just ignore it */ if (atomic_read(&(journal->j_wcount)) != 0) { reiserfs_panic(s, "journal-844", "journal list is flushing, " @@ -1473,8 +1473,8 @@ static int flush_journal_list(struct super_block *s, if (!pjl && cn->bh) { saved_bh = cn->bh; - /* we do this to make sure nobody releases the buffer while - ** we are working with it + /* we do this to make sure nobody releases the buffer while + ** we are working with it */ get_bh(saved_bh); @@ -1497,8 +1497,8 @@ static int flush_journal_list(struct super_block *s, goto free_cnode; } - /* bh == NULL when the block got to disk on its own, OR, - ** the block got freed in a future transaction + /* bh == NULL when the block got to disk on its own, OR, + ** the block got freed in a future transaction */ if (saved_bh == NULL) { goto free_cnode; @@ -1586,7 +1586,7 @@ static int flush_journal_list(struct super_block *s, __func__); flush_older_and_return: - /* before we can update the journal header block, we _must_ flush all + /* before we can update the journal header block, we _must_ flush all ** real blocks from all older transactions to disk. 
This is because ** once the header block is updated, this transaction will not be ** replayed after a crash @@ -1596,7 +1596,7 @@ static int flush_journal_list(struct super_block *s, } err = journal->j_errno; - /* before we can remove everything from the hash tables for this + /* before we can remove everything from the hash tables for this ** transaction, we must make sure it can never be replayed ** ** since we are only called from do_journal_end, we know for sure there @@ -2016,9 +2016,9 @@ static int journal_compare_desc_commit(struct super_block *p_s_sb, return 0; } -/* returns 0 if it did not find a description block +/* returns 0 if it did not find a description block ** returns -1 if it found a corrupt commit block -** returns 1 if both desc and commit were valid +** returns 1 if both desc and commit were valid */ static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffer_head *d_bh, @@ -2380,8 +2380,8 @@ static int journal_read(struct super_block *p_s_sb) bdevname(journal->j_dev_bd, b)); start = get_seconds(); - /* step 1, read in the journal header block. Check the transaction it says - ** is the first unflushed, and if that transaction is not valid, + /* step 1, read in the journal header block. Check the transaction it says + ** is the first unflushed, and if that transaction is not valid, ** replay is done */ journal->j_header_bh = journal_bread(p_s_sb, @@ -2406,8 +2406,8 @@ static int journal_read(struct super_block *p_s_sb) le32_to_cpu(jh->j_last_flush_trans_id)); valid_journal_header = 1; - /* now, we try to read the first unflushed offset. If it is not valid, - ** there is nothing more we can do, and it makes no sense to read + /* now, we try to read the first unflushed offset. If it is not valid, + ** there is nothing more we can do, and it makes no sense to read ** through the whole log. */ d_bh = @@ -2919,7 +2919,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, return 0; } -/* this must be called inside a transaction, and requires the +/* this must be called inside a transaction, and requires the ** kernel_lock to be held */ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) @@ -3040,7 +3040,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, now = get_seconds(); /* if there is no room in the journal OR - ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning + ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning ** we don't sleep if there aren't other writers */ @@ -3240,7 +3240,7 @@ int journal_begin(struct reiserfs_transaction_handle *th, ** ** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the ** transaction is committed. -** +** ** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. */ int journal_mark_dirty(struct reiserfs_transaction_handle *th, @@ -3290,7 +3290,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, atomic_read(&(journal->j_wcount))); return 1; } - /* this error means I've screwed up, and we've overflowed the transaction. + /* this error means I've screwed up, and we've overflowed the transaction. ** Nothing can be done here, except make the FS readonly or panic. */ if (journal->j_len >= journal->j_trans_max) { @@ -3380,7 +3380,7 @@ int journal_end(struct reiserfs_transaction_handle *th, } } -/* removes from the current transaction, relsing and descrementing any counters. 
+/* removes from the current transaction, relsing and descrementing any counters. ** also files the removed buffer directly onto the clean list ** ** called by journal_mark_freed when a block has been deleted @@ -3478,7 +3478,7 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) } /* syncs the commit blocks, but does not force the real buffers to disk -** will wait until the current transaction is done/committed before returning +** will wait until the current transaction is done/committed before returning */ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) @@ -3560,13 +3560,13 @@ int reiserfs_flush_old_commits(struct super_block *p_s_sb) /* ** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit -** -** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all +** +** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all ** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just ** flushes the commit list and returns 0. ** ** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait. -** +** ** Note, we can't allow the journal_end to proceed while there are still writers in the log. */ static int check_journal_end(struct reiserfs_transaction_handle *th, @@ -3594,7 +3594,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, atomic_dec(&(journal->j_wcount)); } - /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released + /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released ** will be dealt with by next transaction that actually writes something, but should be taken ** care of in this trans */ @@ -3603,7 +3603,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, /* if wcount > 0, and we are called to with flush or commit_now, ** we wait on j_join_wait. We will wake up when the last writer has ** finished the transaction, and started it on its way to the disk. - ** Then, we flush the commit or journal list, and just return 0 + ** Then, we flush the commit or journal list, and just return 0 ** because the rest of journal end was already done for this transaction. */ if (atomic_read(&(journal->j_wcount)) > 0) { @@ -3674,7 +3674,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, /* ** Does all the work that makes deleting blocks safe. ** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on. -** +** ** otherwise: ** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes ** before this transaction has finished. @@ -3878,7 +3878,7 @@ extern struct tree_balance *cur_tb; ** be written to disk while we are altering it. So, we must: ** clean it ** wait on it. -** +** */ int reiserfs_prepare_for_journal(struct super_block *p_s_sb, struct buffer_head *bh, int wait) @@ -3920,7 +3920,7 @@ static void flush_old_journal_lists(struct super_block *s) } } -/* +/* ** long and ugly. If flush, will not return until all commit ** blocks and all real buffers in the trans are on disk. ** If no_async, won't return until all commit blocks are on disk. 
@@ -3981,7 +3981,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, wait_on_commit = 1; } - /* check_journal_end locks the journal, and unlocks if it does not return 1 + /* check_journal_end locks the journal, and unlocks if it does not return 1 ** it tells us if we should continue with the journal_end, or just return */ if (!check_journal_end(th, p_s_sb, nblocks, flags)) { @@ -4078,7 +4078,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, last_cn->next = jl_cn; } last_cn = jl_cn; - /* make sure the block we are trying to log is not a block + /* make sure the block we are trying to log is not a block of journal or reserved area */ if (is_block_in_log_or_reserved_area @@ -4225,9 +4225,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, } else if (!(jl->j_state & LIST_COMMIT_PENDING)) queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); - /* if the next transaction has any chance of wrapping, flush - ** transactions that might get overwritten. If any journal lists are very - ** old flush them as well. + /* if the next transaction has any chance of wrapping, flush + ** transactions that might get overwritten. If any journal lists are very + ** old flush them as well. */ first_jl: list_for_each_safe(entry, safe, &journal->j_journal_list) { diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 21a171ceba1d..381750a155f6 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -119,8 +119,8 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, DEH_SIZE * copy_count + copy_records_len); } -/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or - part of it or nothing (see the return 0 below) from SOURCE to the end +/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or + part of it or nothing (see the return 0 below) from SOURCE to the end (if last_first) or beginning (!last_first) of the DEST */ /* returns 1 if anything was copied, else 0 */ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, @@ -396,7 +396,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, else { struct item_head n_ih; - /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST + /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST part defined by 'cpy_bytes'; create new item header; change old item_header (????); n_ih = new item_header; */ @@ -426,7 +426,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, else { struct item_head n_ih; - /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST + /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST part defined by 'cpy_bytes'; create new item header; n_ih = new item_header; */ @@ -724,7 +724,7 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) static void leaf_delete_items_entirely(struct buffer_info *bi, int first, int del_num); /* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. - If not. + If not. If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of the first item. Part defined by del_bytes. Don't delete first item header If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. 
Delete part of body of @@ -783,7 +783,7 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, /* len = body len of item */ len = ih_item_len(ih); - /* delete the part of the last item of the bh + /* delete the part of the last item of the bh do not delete item header */ leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, @@ -865,7 +865,7 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before, } } -/* paste paste_size bytes to affected_item_num-th item. +/* paste paste_size bytes to affected_item_num-th item. When item is a directory, this only prepare space for new entries */ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, int pos_in_item, int paste_size, @@ -1022,7 +1022,7 @@ static int leaf_cut_entries(struct buffer_head *bh, /* when cut item is part of regular file pos_in_item - first byte that must be cut cut_size - number of bytes to be cut beginning from pos_in_item - + when cut item is part of directory pos_in_item - number of first deleted entry cut_size - count of deleted entries @@ -1275,7 +1275,7 @@ void leaf_paste_entries(struct buffer_info *bi, /* change item key if necessary (when we paste before 0-th entry */ if (!before) { set_le_ih_k_offset(ih, deh_offset(new_dehs)); -/* memcpy (&ih->ih_key.k_offset, +/* memcpy (&ih->ih_key.k_offset, &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ } #ifdef CONFIG_REISERFS_CHECK diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index cb1a9e977907..9d1070e741fc 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -106,7 +106,7 @@ key of the first directory entry in it. This function first calls search_by_key, then, if item whose first entry matches is not found it looks for the entry inside directory item found by search_by_key. Fills the path to the entry, and to the -entry position in the item +entry position in the item */ @@ -371,7 +371,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -/* +/* ** looks up the dentry of the parent directory for child. ** taken from ext2_get_parent */ @@ -401,7 +401,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child) return d_obtain_alias(inode); } -/* add entry to the directory (entry can be hidden). +/* add entry to the directory (entry can be hidden). insert definition of when hidden directories are used here -Hans @@ -559,7 +559,7 @@ static int drop_new_inode(struct inode *inode) return 0; } -/* utility function that does setup for reiserfs_new_inode. +/* utility function that does setup for reiserfs_new_inode. ** DQUOT_INIT needs lots of credits so it's better to have it ** outside of a transaction, so we had to pull some bits of ** reiserfs_new_inode out into this func. @@ -820,7 +820,7 @@ static inline int reiserfs_empty_dir(struct inode *inode) { /* we can cheat because an old format dir cannot have ** EMPTY_DIR_SIZE, and a new format dir cannot have - ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, + ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, ** regardless of disk format version, the directory is empty. 
*/ if (inode->i_size != EMPTY_DIR_SIZE && @@ -1162,7 +1162,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, return retval; } -// de contains information pointing to an entry which +/* de contains information pointing to an entry which */ static int de_still_valid(const char *name, int len, struct reiserfs_dir_entry *de) { @@ -1206,10 +1206,10 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; } -/* +/* * process, that is going to call fix_nodes/do_balance must hold only * one path. If it holds 2 or more, it can get into endless waiting in - * get_empty_nodes or its clones + * get_empty_nodes or its clones */ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) @@ -1263,7 +1263,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode_mode = old_inode->i_mode; if (S_ISDIR(old_inode_mode)) { - // make sure, that directory being renamed has correct ".." + // make sure, that directory being renamed has correct ".." // and that its new parent directory has not too many links // already @@ -1274,8 +1274,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - /* directory is renamed, its parent directory will be changed, - ** so find ".." entry + /* directory is renamed, its parent directory will be changed, + ** so find ".." entry */ dot_dot_de.de_gen_number_bit_string = NULL; retval = @@ -1385,9 +1385,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, this stuff, yes? Then, having gathered everything into RAM we should lock the buffers, yes? -Hans */ - /* probably. our rename needs to hold more - ** than one path at once. The seals would - ** have to be written to deal with multi-path + /* probably. our rename needs to hold more + ** than one path at once. The seals would + ** have to be written to deal with multi-path ** issues -chris */ /* sanity checking before doing the rename - avoid races many @@ -1465,7 +1465,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (S_ISDIR(old_inode_mode)) { - // adjust ".." of renamed directory + /* adjust ".." of renamed directory */ set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh); diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index d2d6b5650188..3a6de810bd61 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -180,7 +180,7 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s) if (cur_size > new_size) { /* mark everyone used that was listed as free at the end of the objectid - ** map + ** map */ objectid_map[new_size - 1] = objectid_map[cur_size - 1]; set_sb_oid_cursize(disk_sb, new_size); diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 8e826c07cd21..536eacaeb710 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -178,11 +178,11 @@ static char *is_there_reiserfs_struct(char *fmt, int *what) appropriative printk. With this reiserfs_warning you can use format specification for complex structures like you used to do with printfs for integers, doubles and pointers. 
For instance, to print - out key structure you have to write just: - reiserfs_warning ("bad key %k", key); - instead of - printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, - key->k_offset, key->k_uniqueness); + out key structure you have to write just: + reiserfs_warning ("bad key %k", key); + instead of + printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, + key->k_offset, key->k_uniqueness); */ static DEFINE_SPINLOCK(error_lock); static void prepare_error_buf(const char *fmt, va_list args) @@ -244,11 +244,11 @@ static void prepare_error_buf(const char *fmt, va_list args) } /* in addition to usual conversion specifiers this accepts reiserfs - specific conversion specifiers: - %k to print little endian key, - %K to print cpu key, + specific conversion specifiers: + %k to print little endian key, + %K to print cpu key, %h to print item_head, - %t to print directory entry + %t to print directory entry %z to print block head (arg must be struct buffer_head * %b to print buffer_head */ @@ -314,17 +314,17 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) maintainer-errorid. Don't bother with reusing errorids, there are lots of numbers out there. - Example: - + Example: + reiserfs_panic( p_sb, "reiser-29: reiserfs_new_blocknrs: " "one of search_start or rn(%d) is equal to MAX_B_NUM," - "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", + "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", rn, bh ); Regular panic()s sometimes clear the screen before the message can - be read, thus the need for the while loop. + be read, thus the need for the while loop. Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it pointless complexity): diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index d4d7f1433ed0..d5066400638a 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -633,7 +633,7 @@ int reiserfs_global_version_in_proc(char *buffer, char **start, * */ -/* +/* * Make Linus happy. * Local variables: * c-indentation-style: "K&R" diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index f71c3948edef..238e9d9b31e0 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -1,8 +1,8 @@ -/* +/* * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -/* +/* * Written by Alexander Zarochentcev. * * The kernel part of the (on-line) reiserfs resizer. @@ -101,7 +101,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); /* just in case vfree schedules on us, copy the new - ** pointer into the journal struct before freeing the + ** pointer into the journal struct before freeing the ** old one */ node_tmp = jb->bitmaps; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index b2eaa0c6b7b7..a65bfee28bb8 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -77,7 +77,7 @@ inline void copy_item_head(struct item_head *p_v_to, /* k1 is pointer to on-disk structure which is stored in little-endian form. k2 is pointer to cpu variable. For key of items of the same object this returns 0. 
- Returns: -1 if key1 < key2 + Returns: -1 if key1 < key2 0 if key1 == key2 1 if key1 > key2 */ inline int comp_short_keys(const struct reiserfs_key *le_key, @@ -890,7 +890,7 @@ static inline int prepare_for_direct_item(struct treepath *path, } // new file gets truncated if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { - // + // round_len = ROUND_UP(new_file_length); /* this was n_new_file_length < le_ih ... */ if (round_len < le_ih_k_offset(le_ih)) { @@ -1443,7 +1443,7 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, if (atomic_read(&p_s_inode->i_count) > 1 || !tail_has_to_be_packed(p_s_inode) || !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) { - // leave tail in an unformatted node + /* leave tail in an unformatted node */ *p_c_mode = M_SKIP_BALANCING; cut_bytes = n_block_size - (n_new_file_size & (n_block_size - 1)); @@ -1826,7 +1826,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p /* While there are bytes to truncate and previous file item is presented in the tree. */ /* - ** This loop could take a really long time, and could log + ** This loop could take a really long time, and could log ** many more blocks than a transaction can hold. So, we do a polite ** journal end here, and if the transaction needs ending, we make ** sure the file is consistent before ending the current trans diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 4a1e16362ebd..d7519b951500 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -758,7 +758,7 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, char **opt_arg, unsigned long *bit_flags) { char *p; - /* foo=bar, + /* foo=bar, ^ ^ ^ | | +-- option_end | +-- arg_start @@ -1348,7 +1348,7 @@ static int read_super_block(struct super_block *s, int offset) } // // ok, reiserfs signature (old or new) found in at the given offset - // + // fs_blocksize = sb_blocksize(rs); brelse(bh); sb_set_blocksize(s, fs_blocksize); @@ -1534,8 +1534,8 @@ static int what_hash(struct super_block *s) code = find_hash_out(s); if (code != UNSET_HASH && reiserfs_hash_detect(s)) { - /* detection has found the hash, and we must check against the - ** mount options + /* detection has found the hash, and we must check against the + ** mount options */ if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { reiserfs_warning(s, "reiserfs-2507", @@ -1567,7 +1567,7 @@ static int what_hash(struct super_block *s) } } - /* if we are mounted RW, and we have a new valid hash code, update + /* if we are mounted RW, and we have a new valid hash code, update ** the super */ if (code != UNSET_HASH && diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index 083f74435f65..0635cfe0f0b7 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -46,7 +46,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, /* Set the key to search for the place for new unfm pointer */ make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4); - // FIXME: we could avoid this + /* FIXME: we could avoid this */ if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { reiserfs_error(sb, "PAP-14030", "pasted or inserted byte exists in " diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 4686b90886ed..5621d87c4479 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -14,7 +14,7 @@ typedef enum { } reiserfs_super_block_flags; /* struct 
reiserfs_super_block accessors/mutators - * since this is a disk structure, it will always be in + * since this is a disk structure, it will always be in * little endian format. */ #define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count)) #define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v)) @@ -83,16 +83,16 @@ typedef enum { /* LOGGING -- */ -/* These all interelate for performance. +/* These all interelate for performance. ** -** If the journal block count is smaller than n transactions, you lose speed. +** If the journal block count is smaller than n transactions, you lose speed. ** I don't know what n is yet, I'm guessing 8-16. ** ** typical transaction size depends on the application, how often fsync is -** called, and how many metadata blocks you dirty in a 30 second period. +** called, and how many metadata blocks you dirty in a 30 second period. ** The more small files (<16k) you use, the larger your transactions will ** be. -** +** ** If your journal fills faster than dirty buffers get flushed to disk, it must flush them before allowing the journal ** to wrap, which slows things down. If you need high speed meta data updates, the journal should be big enough ** to prevent wrapping before dirty meta blocks get to disk. @@ -242,7 +242,7 @@ struct reiserfs_journal { struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; /* array of bitmaps to record the deleted blocks */ struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; /* hash table for real buffer heads in current trans */ - struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; /* hash table for all the real buffer heads in all + struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; /* hash table for all the real buffer heads in all the transactions */ struct list_head j_prealloc_list; /* list of inodes which have preallocated blocks */ int j_persistent_trans; @@ -426,7 +426,7 @@ enum reiserfs_mount_options { partition will be dealt with in a manner of 3.5.x */ -/* -o hash={tea, rupasov, r5, detect} is meant for properly mounting +/* -o hash={tea, rupasov, r5, detect} is meant for properly mounting ** reiserfs disks from 3.5.19 or earlier. 99% of the time, this option ** is not required. If the normal autodection code can't determine which ** hash to use (because both hashes had the same value for a file) -- cgit v1.2.3-71-gd317 From a9dd364358fbdc68faee5d20c2d648c320dc3cf0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:45 -0400 Subject: reiserfs: rename p_s_sb to sb This patch is a simple s/p_s_sb/sb/g to the reiserfs code. This is the first in a series of patches to rip out some of the awful variable naming in reiserfs. 
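The substitution is mechanical; as a rough, hedged sketch (illustrative only, not necessarily the exact command used to produce this patch), a word-boundary-safe rename over the files listed in the diffstat below would look like:

    # hypothetical reproduction of the s/p_s_sb/sb/g rename; illustrative only
    sed -i 's/\bp_s_sb\b/sb/g' \
        fs/reiserfs/fix_node.c fs/reiserfs/journal.c fs/reiserfs/stree.c \
        fs/reiserfs/tail_conversion.c include/linux/reiserfs_fs.h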
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/fix_node.c | 46 +-- fs/reiserfs/journal.c | 735 +++++++++++++++++++++--------------------- fs/reiserfs/stree.c | 126 ++++---- fs/reiserfs/tail_conversion.c | 16 +- include/linux/reiserfs_fs.h | 14 +- 5 files changed, 468 insertions(+), 469 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index a3be7da3e2b9..799c0ce24291 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -785,7 +785,7 @@ static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h) b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; int n_counter, n_number_of_freeblk, n_amount_needed, /* number of needed empty blocks */ n_retval = CARRY_ON; - struct super_block *p_s_sb = p_s_tb->tb_sb; + struct super_block *sb = p_s_tb->tb_sb; /* number_of_freeblk is the number of empty blocks which have been acquired for use by the balancing algorithm minus the number of @@ -830,7 +830,7 @@ static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h) RFALSE(!*p_n_blocknr, "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); - p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr); + p_s_new_bh = sb_getblk(sb, *p_n_blocknr); RFALSE(buffer_dirty(p_s_new_bh) || buffer_journaled(p_s_new_bh) || buffer_journal_dirty(p_s_new_bh), @@ -899,7 +899,7 @@ static int get_rfree(struct tree_balance *tb, int h) static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h) { struct buffer_head *p_s_father, *left; - struct super_block *p_s_sb = p_s_tb->tb_sb; + struct super_block *sb = p_s_tb->tb_sb; b_blocknr_t n_left_neighbor_blocknr; int n_left_neighbor_position; @@ -924,7 +924,7 @@ static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h) n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position); /* Look for the left neighbor in the cache. */ - if ((left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr))) { + if ((left = sb_find_get_block(sb, n_left_neighbor_blocknr))) { RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left), "vs-8170: left neighbor (%b %z) is not in the tree", @@ -1942,14 +1942,14 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) int n_child_position, n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1); unsigned long n_son_number; - struct super_block *p_s_sb = p_s_tb->tb_sb; + struct super_block *sb = p_s_tb->tb_sb; struct buffer_head *p_s_bh; - PROC_INFO_INC(p_s_sb, get_neighbors[n_h]); + PROC_INFO_INC(sb, get_neighbors[n_h]); if (p_s_tb->lnum[n_h]) { /* We need left neighbor to balance S[n_h]. */ - PROC_INFO_INC(p_s_sb, need_l_neighbor[n_h]); + PROC_INFO_INC(sb, need_l_neighbor[n_h]); p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); RFALSE(p_s_bh == p_s_tb->FL[n_h] && @@ -1961,12 +1961,12 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb-> FL[n_h]); n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position); - p_s_bh = sb_bread(p_s_sb, n_son_number); + p_s_bh = sb_bread(sb, n_son_number); if (!p_s_bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(p_s_tb)) { brelse(p_s_bh); - PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]); + PROC_INFO_INC(sb, get_neighbors_restart[n_h]); return REPEAT_SEARCH; } @@ -1986,7 +1986,7 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) } if (p_s_tb->rnum[n_h]) { /* We need right neighbor to balance S[n_path_offset]. 
*/ - PROC_INFO_INC(p_s_sb, need_r_neighbor[n_h]); + PROC_INFO_INC(sb, need_r_neighbor[n_h]); p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); RFALSE(p_s_bh == p_s_tb->FR[n_h] && @@ -1998,12 +1998,12 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) n_child_position = (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0; n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position); - p_s_bh = sb_bread(p_s_sb, n_son_number); + p_s_bh = sb_bread(sb, n_son_number); if (!p_s_bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(p_s_tb)) { brelse(p_s_bh); - PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]); + PROC_INFO_INC(sb, get_neighbors_restart[n_h]); return REPEAT_SEARCH; } brelse(p_s_tb->R[n_h]); @@ -2089,51 +2089,51 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) } #ifdef CONFIG_REISERFS_CHECK -static void tb_buffer_sanity_check(struct super_block *p_s_sb, +static void tb_buffer_sanity_check(struct super_block *sb, struct buffer_head *p_s_bh, const char *descr, int level) { if (p_s_bh) { if (atomic_read(&(p_s_bh->b_count)) <= 0) { - reiserfs_panic(p_s_sb, "jmacd-1", "negative or zero " + reiserfs_panic(sb, "jmacd-1", "negative or zero " "reference counter for buffer %s[%d] " "(%b)", descr, level, p_s_bh); } if (!buffer_uptodate(p_s_bh)) { - reiserfs_panic(p_s_sb, "jmacd-2", "buffer is not up " + reiserfs_panic(sb, "jmacd-2", "buffer is not up " "to date %s[%d] (%b)", descr, level, p_s_bh); } if (!B_IS_IN_TREE(p_s_bh)) { - reiserfs_panic(p_s_sb, "jmacd-3", "buffer is not " + reiserfs_panic(sb, "jmacd-3", "buffer is not " "in tree %s[%d] (%b)", descr, level, p_s_bh); } - if (p_s_bh->b_bdev != p_s_sb->s_bdev) { - reiserfs_panic(p_s_sb, "jmacd-4", "buffer has wrong " + if (p_s_bh->b_bdev != sb->s_bdev) { + reiserfs_panic(sb, "jmacd-4", "buffer has wrong " "device %s[%d] (%b)", descr, level, p_s_bh); } - if (p_s_bh->b_size != p_s_sb->s_blocksize) { - reiserfs_panic(p_s_sb, "jmacd-5", "buffer has wrong " + if (p_s_bh->b_size != sb->s_blocksize) { + reiserfs_panic(sb, "jmacd-5", "buffer has wrong " "blocksize %s[%d] (%b)", descr, level, p_s_bh); } - if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { - reiserfs_panic(p_s_sb, "jmacd-6", "buffer block " + if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(sb)) { + reiserfs_panic(sb, "jmacd-6", "buffer block " "number too high %s[%d] (%b)", descr, level, p_s_bh); } } } #else -static void tb_buffer_sanity_check(struct super_block *p_s_sb, +static void tb_buffer_sanity_check(struct super_block *sb, struct buffer_head *p_s_bh, const char *descr, int level) {; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4f787462becc..77f5bb746bf0 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -97,7 +97,7 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall); static int can_dirty(struct reiserfs_journal_cnode *cn); static int journal_join(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, unsigned long nblocks); + struct super_block *sb, unsigned long nblocks); static int release_journal_dev(struct super_block *super, struct reiserfs_journal *journal); static int dirty_one_transaction(struct super_block *s, @@ -113,12 +113,12 @@ enum { }; static int do_journal_begin_r(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, + struct super_block *sb, unsigned long nblocks, int join); -static void init_journal_hash(struct super_block *p_s_sb) +static void init_journal_hash(struct super_block *sb) { - struct reiserfs_journal 
*journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); memset(journal->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); } @@ -145,7 +145,7 @@ static void disable_barrier(struct super_block *s) } static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block - *p_s_sb) + *sb) { struct reiserfs_bitmap_node *bn; static int id; @@ -154,7 +154,7 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block if (!bn) { return NULL; } - bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS); + bn->data = kzalloc(sb->s_blocksize, GFP_NOFS); if (!bn->data) { kfree(bn); return NULL; @@ -164,9 +164,9 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block return bn; } -static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *p_s_sb) +static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_bitmap_node *bn = NULL; struct list_head *entry = journal->j_bitmap_nodes.next; @@ -176,21 +176,21 @@ static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *p_s_sb) if (entry != &journal->j_bitmap_nodes) { bn = list_entry(entry, struct reiserfs_bitmap_node, list); list_del(entry); - memset(bn->data, 0, p_s_sb->s_blocksize); + memset(bn->data, 0, sb->s_blocksize); journal->j_free_bitmap_nodes--; return bn; } - bn = allocate_bitmap_node(p_s_sb); + bn = allocate_bitmap_node(sb); if (!bn) { yield(); goto repeat; } return bn; } -static inline void free_bitmap_node(struct super_block *p_s_sb, +static inline void free_bitmap_node(struct super_block *sb, struct reiserfs_bitmap_node *bn) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); journal->j_used_bitmap_nodes--; if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { kfree(bn->data); @@ -201,13 +201,13 @@ static inline void free_bitmap_node(struct super_block *p_s_sb, } } -static void allocate_bitmap_nodes(struct super_block *p_s_sb) +static void allocate_bitmap_nodes(struct super_block *sb) { int i; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_bitmap_node *bn = NULL; for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { - bn = allocate_bitmap_node(p_s_sb); + bn = allocate_bitmap_node(sb); if (bn) { list_add(&bn->list, &journal->j_bitmap_nodes); journal->j_free_bitmap_nodes++; @@ -217,30 +217,30 @@ static void allocate_bitmap_nodes(struct super_block *p_s_sb) } } -static int set_bit_in_list_bitmap(struct super_block *p_s_sb, +static int set_bit_in_list_bitmap(struct super_block *sb, b_blocknr_t block, struct reiserfs_list_bitmap *jb) { - unsigned int bmap_nr = block / (p_s_sb->s_blocksize << 3); - unsigned int bit_nr = block % (p_s_sb->s_blocksize << 3); + unsigned int bmap_nr = block / (sb->s_blocksize << 3); + unsigned int bit_nr = block % (sb->s_blocksize << 3); if (!jb->bitmaps[bmap_nr]) { - jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb); + jb->bitmaps[bmap_nr] = get_bitmap_node(sb); } set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); return 0; } -static void cleanup_bitmap_list(struct super_block *p_s_sb, +static void cleanup_bitmap_list(struct super_block *sb, struct reiserfs_list_bitmap *jb) { int i; if (jb->bitmaps == NULL) return; - for (i = 0; i < reiserfs_bmap_count(p_s_sb); i++) { + for (i = 0; i < 
reiserfs_bmap_count(sb); i++) { if (jb->bitmaps[i]) { - free_bitmap_node(p_s_sb, jb->bitmaps[i]); + free_bitmap_node(sb, jb->bitmaps[i]); jb->bitmaps[i] = NULL; } } @@ -249,7 +249,7 @@ static void cleanup_bitmap_list(struct super_block *p_s_sb, /* ** only call this on FS unmount. */ -static int free_list_bitmaps(struct super_block *p_s_sb, +static int free_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array) { int i; @@ -257,16 +257,16 @@ static int free_list_bitmaps(struct super_block *p_s_sb, for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { jb = jb_array + i; jb->journal_list = NULL; - cleanup_bitmap_list(p_s_sb, jb); + cleanup_bitmap_list(sb, jb); vfree(jb->bitmaps); jb->bitmaps = NULL; } return 0; } -static int free_bitmap_nodes(struct super_block *p_s_sb) +static int free_bitmap_nodes(struct super_block *sb) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct list_head *next = journal->j_bitmap_nodes.next; struct reiserfs_bitmap_node *bn; @@ -286,7 +286,7 @@ static int free_bitmap_nodes(struct super_block *p_s_sb) ** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. ** jb_array is the array to be filled in. */ -int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, +int reiserfs_allocate_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array, unsigned int bmap_nr) { @@ -300,7 +300,7 @@ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, jb->journal_list = NULL; jb->bitmaps = vmalloc(mem); if (!jb->bitmaps) { - reiserfs_warning(p_s_sb, "clm-2000", "unable to " + reiserfs_warning(sb, "clm-2000", "unable to " "allocate bitmaps for journal lists"); failed = 1; break; @@ -308,7 +308,7 @@ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, memset(jb->bitmaps, 0, mem); } if (failed) { - free_list_bitmaps(p_s_sb, jb_array); + free_list_bitmaps(sb, jb_array); return -1; } return 0; @@ -318,12 +318,12 @@ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, ** find an available list bitmap. If you can't find one, flush a commit list ** and try again */ -static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb, +static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, struct reiserfs_journal_list *jl) { int i, j; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_list_bitmap *jb = NULL; for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) { @@ -331,7 +331,7 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb, journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS; jb = journal->j_list_bitmap + i; if (journal->j_list_bitmap[i].journal_list) { - flush_commit_list(p_s_sb, + flush_commit_list(sb, journal->j_list_bitmap[i]. 
journal_list, 1); if (!journal->j_list_bitmap[i].journal_list) { @@ -378,12 +378,12 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) /* ** pulls a cnode off the free list, or returns NULL on failure */ -static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) +static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) { struct reiserfs_journal_cnode *cn; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); - reiserfs_check_lock_depth(p_s_sb, "get_cnode"); + reiserfs_check_lock_depth(sb, "get_cnode"); if (journal->j_cnode_free <= 0) { return NULL; @@ -405,12 +405,12 @@ static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) /* ** returns a cnode to the free list */ -static void free_cnode(struct super_block *p_s_sb, +static void free_cnode(struct super_block *sb, struct reiserfs_journal_cnode *cn) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); - reiserfs_check_lock_depth(p_s_sb, "free_cnode"); + reiserfs_check_lock_depth(sb, "free_cnode"); journal->j_cnode_used--; journal->j_cnode_free++; @@ -481,11 +481,11 @@ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct ** reject it on the next call to reiserfs_in_journal ** */ -int reiserfs_in_journal(struct super_block *p_s_sb, +int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, int bit_nr, int search_all, b_blocknr_t * next_zero_bit) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn; struct reiserfs_list_bitmap *jb; int i; @@ -493,14 +493,14 @@ int reiserfs_in_journal(struct super_block *p_s_sb, *next_zero_bit = 0; /* always start this at zero. */ - PROC_INFO_INC(p_s_sb, journal.in_journal); + PROC_INFO_INC(sb, journal.in_journal); /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. ** if we crash before the transaction that freed it commits, this transaction won't ** have committed either, and the block will never be written */ if (search_all) { for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { - PROC_INFO_INC(p_s_sb, journal.in_journal_bitmap); + PROC_INFO_INC(sb, journal.in_journal_bitmap); jb = journal->j_list_bitmap + i; if (jb->journal_list && jb->bitmaps[bmap_nr] && test_bit(bit_nr, @@ -510,28 +510,28 @@ int reiserfs_in_journal(struct super_block *p_s_sb, find_next_zero_bit((unsigned long *) (jb->bitmaps[bmap_nr]-> data), - p_s_sb->s_blocksize << 3, + sb->s_blocksize << 3, bit_nr + 1); return 1; } } } - bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr; + bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr; /* is it in any old transactions? */ if (search_all && (cn = - get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) { + get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) { return 1; } /* is it in the current transaction. 
This should never happen */ - if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) { + if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) { BUG(); return 1; } - PROC_INFO_INC(p_s_sb, journal.in_journal_reusable); + PROC_INFO_INC(sb, journal.in_journal_reusable); /* safe for reuse */ return 0; } @@ -553,16 +553,16 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, } /* lock the current transaction */ -static inline void lock_journal(struct super_block *p_s_sb) +static inline void lock_journal(struct super_block *sb) { - PROC_INFO_INC(p_s_sb, journal.lock_journal); - mutex_lock(&SB_JOURNAL(p_s_sb)->j_mutex); + PROC_INFO_INC(sb, journal.lock_journal); + mutex_lock(&SB_JOURNAL(sb)->j_mutex); } /* unlock the current transaction */ -static inline void unlock_journal(struct super_block *p_s_sb) +static inline void unlock_journal(struct super_block *sb) { - mutex_unlock(&SB_JOURNAL(p_s_sb)->j_mutex); + mutex_unlock(&SB_JOURNAL(sb)->j_mutex); } static inline void get_journal_list(struct reiserfs_journal_list *jl) @@ -586,13 +586,13 @@ static inline void put_journal_list(struct super_block *s, ** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a ** transaction. */ -static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, +static void cleanup_freed_for_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl) { struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; if (jb) { - cleanup_bitmap_list(p_s_sb, jb); + cleanup_bitmap_list(sb, jb); } jl->j_list_bitmap->journal_list = NULL; jl->j_list_bitmap = NULL; @@ -1237,11 +1237,11 @@ static void remove_journal_hash(struct super_block *, ** journal list for this transaction. Aside from freeing the cnode, this also allows the ** block to be reallocated for data blocks if it had been deleted. */ -static void remove_all_from_journal_list(struct super_block *p_s_sb, +static void remove_all_from_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl, int debug) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn, *last; cn = jl->j_realblock; @@ -1251,18 +1251,18 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb, while (cn) { if (cn->blocknr != 0) { if (debug) { - reiserfs_warning(p_s_sb, "reiserfs-2201", + reiserfs_warning(sb, "reiserfs-2201", "block %u, bh is %d, state %ld", cn->blocknr, cn->bh ? 
1 : 0, cn->state); } cn->state = 0; - remove_journal_hash(p_s_sb, journal->j_list_hash_table, + remove_journal_hash(sb, journal->j_list_hash_table, jl, cn->blocknr, 1); } last = cn; cn = cn->next; - free_cnode(p_s_sb, last); + free_cnode(sb, last); } jl->j_realblock = NULL; } @@ -1274,12 +1274,12 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb, ** called by flush_journal_list, before it calls remove_all_from_journal_list ** */ -static int _update_journal_header_block(struct super_block *p_s_sb, +static int _update_journal_header_block(struct super_block *sb, unsigned long offset, unsigned int trans_id) { struct reiserfs_journal_header *jh; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); if (reiserfs_is_journal_aborted(journal)) return -EIO; @@ -1289,7 +1289,7 @@ static int _update_journal_header_block(struct super_block *p_s_sb, wait_on_buffer((journal->j_header_bh)); if (unlikely(!buffer_uptodate(journal->j_header_bh))) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(p_s_sb, "journal-699", + reiserfs_warning(sb, "journal-699", "buffer write failed"); #endif return -EIO; @@ -1303,24 +1303,24 @@ static int _update_journal_header_block(struct super_block *p_s_sb, jh->j_first_unflushed_offset = cpu_to_le32(offset); jh->j_mount_id = cpu_to_le32(journal->j_mount_id); - if (reiserfs_barrier_flush(p_s_sb)) { + if (reiserfs_barrier_flush(sb)) { int ret; lock_buffer(journal->j_header_bh); ret = submit_barrier_buffer(journal->j_header_bh); if (ret == -EOPNOTSUPP) { set_buffer_uptodate(journal->j_header_bh); - disable_barrier(p_s_sb); + disable_barrier(sb); goto sync; } wait_on_buffer(journal->j_header_bh); - check_barrier_completion(p_s_sb, journal->j_header_bh); + check_barrier_completion(sb, journal->j_header_bh); } else { sync: set_buffer_dirty(journal->j_header_bh); sync_dirty_buffer(journal->j_header_bh); } if (!buffer_uptodate(journal->j_header_bh)) { - reiserfs_warning(p_s_sb, "journal-837", + reiserfs_warning(sb, "journal-837", "IO error during journal replay"); return -EIO; } @@ -1328,23 +1328,23 @@ static int _update_journal_header_block(struct super_block *p_s_sb, return 0; } -static int update_journal_header_block(struct super_block *p_s_sb, +static int update_journal_header_block(struct super_block *sb, unsigned long offset, unsigned int trans_id) { - return _update_journal_header_block(p_s_sb, offset, trans_id); + return _update_journal_header_block(sb, offset, trans_id); } /* ** flush any and all journal lists older than you are ** can only be called from flush_journal_list */ -static int flush_older_journal_lists(struct super_block *p_s_sb, +static int flush_older_journal_lists(struct super_block *sb, struct reiserfs_journal_list *jl) { struct list_head *entry; struct reiserfs_journal_list *other_jl; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned int trans_id = jl->j_trans_id; /* we know we are the only ones flushing things, no extra race @@ -1359,7 +1359,7 @@ static int flush_older_journal_lists(struct super_block *p_s_sb, if (other_jl->j_trans_id < trans_id) { BUG_ON(other_jl->j_refcount <= 0); /* do not flush all */ - flush_journal_list(p_s_sb, other_jl, 0); + flush_journal_list(sb, other_jl, 0); /* other_jl is now deleted from the list */ goto restart; @@ -1908,22 +1908,22 @@ void remove_journal_hash(struct super_block *sb, } } -static void free_journal_ram(struct super_block *p_s_sb) +static void free_journal_ram(struct 
super_block *sb) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); kfree(journal->j_current_jl); journal->j_num_lists--; vfree(journal->j_cnode_free_orig); - free_list_bitmaps(p_s_sb, journal->j_list_bitmap); - free_bitmap_nodes(p_s_sb); /* must be after free_list_bitmaps */ + free_list_bitmaps(sb, journal->j_list_bitmap); + free_bitmap_nodes(sb); /* must be after free_list_bitmaps */ if (journal->j_header_bh) { brelse(journal->j_header_bh); } /* j_header_bh is on the journal dev, make sure not to release the journal * dev until we brelse j_header_bh */ - release_journal_dev(p_s_sb, journal); + release_journal_dev(sb, journal); vfree(journal); } @@ -1932,27 +1932,27 @@ static void free_journal_ram(struct super_block *p_s_sb) ** of read_super() yet. Any other caller must keep error at 0. */ static int do_journal_release(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, int error) + struct super_block *sb, int error) { struct reiserfs_transaction_handle myth; int flushed = 0; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); /* we only want to flush out transactions if we were called with error == 0 */ - if (!error && !(p_s_sb->s_flags & MS_RDONLY)) { + if (!error && !(sb->s_flags & MS_RDONLY)) { /* end the current trans */ BUG_ON(!th->t_trans_id); - do_journal_end(th, p_s_sb, 10, FLUSH_ALL); + do_journal_end(th, sb, 10, FLUSH_ALL); /* make sure something gets logged to force our way into the flush code */ - if (!journal_join(&myth, p_s_sb, 1)) { - reiserfs_prepare_for_journal(p_s_sb, - SB_BUFFER_WITH_SB(p_s_sb), + if (!journal_join(&myth, sb, 1)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&myth, p_s_sb, - SB_BUFFER_WITH_SB(p_s_sb)); - do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL); + journal_mark_dirty(&myth, sb, + SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, sb, 1, FLUSH_ALL); flushed = 1; } } @@ -1960,26 +1960,26 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, /* this also catches errors during the do_journal_end above */ if (!error && reiserfs_is_journal_aborted(journal)) { memset(&myth, 0, sizeof(myth)); - if (!journal_join_abort(&myth, p_s_sb, 1)) { - reiserfs_prepare_for_journal(p_s_sb, - SB_BUFFER_WITH_SB(p_s_sb), + if (!journal_join_abort(&myth, sb, 1)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&myth, p_s_sb, - SB_BUFFER_WITH_SB(p_s_sb)); - do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL); + journal_mark_dirty(&myth, sb, + SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, sb, 1, FLUSH_ALL); } } reiserfs_mounted_fs_count--; /* wait for all commits to finish */ - cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work); + cancel_delayed_work(&SB_JOURNAL(sb)->j_work); flush_workqueue(commit_wq); if (!reiserfs_mounted_fs_count) { destroy_workqueue(commit_wq); commit_wq = NULL; } - free_journal_ram(p_s_sb); + free_journal_ram(sb); return 0; } @@ -1988,28 +1988,28 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, ** call on unmount. flush all journal trans, release all alloc'd ram */ int journal_release(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb) + struct super_block *sb) { - return do_journal_release(th, p_s_sb, 0); + return do_journal_release(th, sb, 0); } /* ** only call from an error condition inside reiserfs_read_super! 
*/ int journal_release_error(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb) + struct super_block *sb) { - return do_journal_release(th, p_s_sb, 1); + return do_journal_release(th, sb, 1); } /* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ -static int journal_compare_desc_commit(struct super_block *p_s_sb, +static int journal_compare_desc_commit(struct super_block *sb, struct reiserfs_journal_desc *desc, struct reiserfs_journal_commit *commit) { if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || get_commit_trans_len(commit) != get_desc_trans_len(desc) || - get_commit_trans_len(commit) > SB_JOURNAL(p_s_sb)->j_trans_max || + get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max || get_commit_trans_len(commit) <= 0) { return 1; } @@ -2020,7 +2020,7 @@ static int journal_compare_desc_commit(struct super_block *p_s_sb, ** returns -1 if it found a corrupt commit block ** returns 1 if both desc and commit were valid */ -static int journal_transaction_is_valid(struct super_block *p_s_sb, +static int journal_transaction_is_valid(struct super_block *sb, struct buffer_head *d_bh, unsigned int *oldest_invalid_trans_id, unsigned long *newest_mount_id) @@ -2038,7 +2038,7 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb, && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { if (oldest_invalid_trans_id && *oldest_invalid_trans_id && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-986: transaction " "is valid returning because trans_id %d is greater than " "oldest_invalid %lu", @@ -2048,7 +2048,7 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb, } if (newest_mount_id && *newest_mount_id > get_desc_mount_id(desc)) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1087: transaction " "is valid returning because mount_id %d is less than " "newest_mount_id %lu", @@ -2056,37 +2056,37 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb, *newest_mount_id); return -1; } - if (get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max) { - reiserfs_warning(p_s_sb, "journal-2018", + if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) { + reiserfs_warning(sb, "journal-2018", "Bad transaction length %d " "encountered, ignoring transaction", get_desc_trans_len(desc)); return -1; } - offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); + offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); /* ok, we have a journal description block, lets see if the transaction was valid */ c_bh = - journal_bread(p_s_sb, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((offset + get_desc_trans_len(desc) + - 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); if (!c_bh) return 0; commit = (struct reiserfs_journal_commit *)c_bh->b_data; - if (journal_compare_desc_commit(p_s_sb, desc, commit)) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + if (journal_compare_desc_commit(sb, desc, commit)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal_transaction_is_valid, commit offset %ld had bad " "time %d or length %d", c_bh->b_blocknr - - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + SB_ONDISK_JOURNAL_1st_BLOCK(sb), get_commit_trans_id(commit), get_commit_trans_len(commit)); brelse(c_bh); if (oldest_invalid_trans_id) { *oldest_invalid_trans_id 
= get_desc_trans_id(desc); - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1004: " "transaction_is_valid setting oldest invalid trans_id " "to %d", @@ -2095,11 +2095,11 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb, return -1; } brelse(c_bh); - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1006: found valid " "transaction start offset %llu, len %d id %d", d_bh->b_blocknr - - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + SB_ONDISK_JOURNAL_1st_BLOCK(sb), get_desc_trans_len(desc), get_desc_trans_id(desc)); return 1; @@ -2121,13 +2121,13 @@ static void brelse_array(struct buffer_head **heads, int num) ** this either reads in a replays a transaction, or returns because the transaction ** is invalid, or too old. */ -static int journal_read_transaction(struct super_block *p_s_sb, +static int journal_read_transaction(struct super_block *sb, unsigned long cur_dblock, unsigned long oldest_start, unsigned int oldest_trans_id, unsigned long newest_mount_id) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_desc *desc; struct reiserfs_journal_commit *commit; unsigned int trans_id = 0; @@ -2139,45 +2139,45 @@ static int journal_read_transaction(struct super_block *p_s_sb, int i; int trans_half; - d_bh = journal_bread(p_s_sb, cur_dblock); + d_bh = journal_bread(sb, cur_dblock); if (!d_bh) return 1; desc = (struct reiserfs_journal_desc *)d_bh->b_data; - trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: " + trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: " "journal_read_transaction, offset %llu, len %d mount_id %d", - d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb), get_desc_trans_len(desc), get_desc_mount_id(desc)); if (get_desc_trans_id(desc) < oldest_trans_id) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: " + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: " "journal_read_trans skipping because %lu is too old", cur_dblock - - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)); + SB_ONDISK_JOURNAL_1st_BLOCK(sb)); brelse(d_bh); return 1; } if (get_desc_mount_id(desc) != newest_mount_id) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: " + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: " "journal_read_trans skipping because %d is != " "newest_mount_id %lu", get_desc_mount_id(desc), newest_mount_id); brelse(d_bh); return 1; } - c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((trans_offset + get_desc_trans_len(desc) + 1) % - SB_ONDISK_JOURNAL_SIZE(p_s_sb))); + SB_ONDISK_JOURNAL_SIZE(sb))); if (!c_bh) { brelse(d_bh); return 1; } commit = (struct reiserfs_journal_commit *)c_bh->b_data; - if (journal_compare_desc_commit(p_s_sb, desc, commit)) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + if (journal_compare_desc_commit(sb, desc, commit)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal_read_transaction, " "commit offset %llu had bad time %d or length %d", c_bh->b_blocknr - - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + SB_ONDISK_JOURNAL_1st_BLOCK(sb), get_commit_trans_id(commit), get_commit_trans_len(commit)); brelse(c_bh); @@ -2195,30 +2195,30 @@ static int journal_read_transaction(struct 
super_block *p_s_sb, brelse(d_bh); kfree(log_blocks); kfree(real_blocks); - reiserfs_warning(p_s_sb, "journal-1169", + reiserfs_warning(sb, "journal-1169", "kmalloc failed, unable to mount FS"); return -1; } /* get all the buffer heads */ - trans_half = journal_trans_half(p_s_sb->s_blocksize); + trans_half = journal_trans_half(sb->s_blocksize); for (i = 0; i < get_desc_trans_len(desc); i++) { log_blocks[i] = - journal_getblk(p_s_sb, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + (trans_offset + 1 + - i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + i) % SB_ONDISK_JOURNAL_SIZE(sb)); if (i < trans_half) { real_blocks[i] = - sb_getblk(p_s_sb, + sb_getblk(sb, le32_to_cpu(desc->j_realblock[i])); } else { real_blocks[i] = - sb_getblk(p_s_sb, + sb_getblk(sb, le32_to_cpu(commit-> j_realblock[i - trans_half])); } - if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { - reiserfs_warning(p_s_sb, "journal-1207", + if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) { + reiserfs_warning(sb, "journal-1207", "REPLAY FAILURE fsck required! " "Block to replay is outside of " "filesystem"); @@ -2226,8 +2226,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, } /* make sure we don't try to replay onto log or reserved area */ if (is_block_in_log_or_reserved_area - (p_s_sb, real_blocks[i]->b_blocknr)) { - reiserfs_warning(p_s_sb, "journal-1204", + (sb, real_blocks[i]->b_blocknr)) { + reiserfs_warning(sb, "journal-1204", "REPLAY FAILURE fsck required! " "Trying to replay onto a log block"); abort_replay: @@ -2245,7 +2245,7 @@ static int journal_read_transaction(struct super_block *p_s_sb, for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); if (!buffer_uptodate(log_blocks[i])) { - reiserfs_warning(p_s_sb, "journal-1212", + reiserfs_warning(sb, "journal-1212", "REPLAY FAILURE fsck required! " "buffer write failed"); brelse_array(log_blocks + i, @@ -2270,7 +2270,7 @@ static int journal_read_transaction(struct super_block *p_s_sb, for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(real_blocks[i]); if (!buffer_uptodate(real_blocks[i])) { - reiserfs_warning(p_s_sb, "journal-1226", + reiserfs_warning(sb, "journal-1226", "REPLAY FAILURE, fsck required! " "buffer write failed"); brelse_array(real_blocks + i, @@ -2284,15 +2284,15 @@ static int journal_read_transaction(struct super_block *p_s_sb, brelse(real_blocks[i]); } cur_dblock = - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((trans_offset + get_desc_trans_len(desc) + - 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)); - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + 2) % SB_ONDISK_JOURNAL_SIZE(sb)); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1095: setting journal " "start to offset %ld", - cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)); + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ - journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); + journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb); journal->j_last_flush_trans_id = trans_id; journal->j_trans_id = trans_id + 1; /* check for trans_id overflow */ @@ -2357,9 +2357,9 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, ** ** On exit, it sets things up so the first transaction will work correctly. 
*/ -static int journal_read(struct super_block *p_s_sb) +static int journal_read(struct super_block *sb) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_desc *desc; unsigned int oldest_trans_id = 0; unsigned int oldest_invalid_trans_id = 0; @@ -2375,8 +2375,8 @@ static int journal_read(struct super_block *p_s_sb) int ret; char b[BDEVNAME_SIZE]; - cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); - reiserfs_info(p_s_sb, "checking transaction log (%s)\n", + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); + reiserfs_info(sb, "checking transaction log (%s)\n", bdevname(journal->j_dev_bd, b)); start = get_seconds(); @@ -2384,22 +2384,22 @@ static int journal_read(struct super_block *p_s_sb) ** is the first unflushed, and if that transaction is not valid, ** replay is done */ - journal->j_header_bh = journal_bread(p_s_sb, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) - + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + journal->j_header_bh = journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); if (!journal->j_header_bh) { return 1; } jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); if (le32_to_cpu(jh->j_first_unflushed_offset) < - SB_ONDISK_JOURNAL_SIZE(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(sb) && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { oldest_start = - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + le32_to_cpu(jh->j_first_unflushed_offset); oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; newest_mount_id = le32_to_cpu(jh->j_mount_id); - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1153: found in " "header: first_unflushed_offset %d, last_flushed_trans_id " "%lu", le32_to_cpu(jh->j_first_unflushed_offset), @@ -2411,10 +2411,10 @@ static int journal_read(struct super_block *p_s_sb) ** through the whole log. 
*/ d_bh = - journal_bread(p_s_sb, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + le32_to_cpu(jh->j_first_unflushed_offset)); - ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL); + ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL); if (!ret) { continue_replay = 0; } @@ -2422,8 +2422,8 @@ static int journal_read(struct super_block *p_s_sb) goto start_log_replay; } - if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) { - reiserfs_warning(p_s_sb, "clm-2076", + if (continue_replay && bdev_read_only(sb->s_bdev)) { + reiserfs_warning(sb, "clm-2076", "device is readonly, unable to replay log"); return -1; } @@ -2433,17 +2433,17 @@ static int journal_read(struct super_block *p_s_sb) */ while (continue_replay && cur_dblock < - (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + - SB_ONDISK_JOURNAL_SIZE(p_s_sb))) { + (SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb))) { /* Note that it is required for blocksize of primary fs device and journal device to be the same */ d_bh = reiserfs_breada(journal->j_dev_bd, cur_dblock, - p_s_sb->s_blocksize, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + - SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + sb->s_blocksize, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); ret = - journal_transaction_is_valid(p_s_sb, d_bh, + journal_transaction_is_valid(sb, d_bh, &oldest_invalid_trans_id, &newest_mount_id); if (ret == 1) { @@ -2452,26 +2452,26 @@ static int journal_read(struct super_block *p_s_sb) oldest_trans_id = get_desc_trans_id(desc); oldest_start = d_bh->b_blocknr; newest_mount_id = get_desc_mount_id(desc); - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1179: Setting " "oldest_start to offset %llu, trans_id %lu", oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK - (p_s_sb), oldest_trans_id); + (sb), oldest_trans_id); } else if (oldest_trans_id > get_desc_trans_id(desc)) { /* one we just read was older */ oldest_trans_id = get_desc_trans_id(desc); oldest_start = d_bh->b_blocknr; - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1180: Resetting " "oldest_start to offset %lu, trans_id %lu", oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK - (p_s_sb), oldest_trans_id); + (sb), oldest_trans_id); } if (newest_mount_id < get_desc_mount_id(desc)) { newest_mount_id = get_desc_mount_id(desc); - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " "newest_mount_id to %d", get_desc_mount_id(desc)); @@ -2486,17 +2486,17 @@ static int journal_read(struct super_block *p_s_sb) start_log_replay: cur_dblock = oldest_start; if (oldest_trans_id) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay " "from offset %llu, trans_id %lu", - cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb), oldest_trans_id); } replay_count = 0; while (continue_replay && oldest_trans_id > 0) { ret = - journal_read_transaction(p_s_sb, cur_dblock, oldest_start, + journal_read_transaction(sb, cur_dblock, oldest_start, oldest_trans_id, newest_mount_id); if (ret < 0) { return ret; @@ -2504,14 +2504,14 @@ static int journal_read(struct super_block *p_s_sb) break; } cur_dblock = - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start; + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start; replay_count++; if (cur_dblock == oldest_start) break; } if (oldest_trans_id == 0) { - 
reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1225: No valid " "transactions found"); } /* j_start does not get set correctly if we don't replay any transactions. @@ -2531,16 +2531,16 @@ static int journal_read(struct super_block *p_s_sb) } else { journal->j_mount_id = newest_mount_id + 1; } - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " "newest_mount_id to %lu", journal->j_mount_id); journal->j_first_unflushed_offset = journal->j_start; if (replay_count > 0) { - reiserfs_info(p_s_sb, + reiserfs_info(sb, "replayed %d transactions in %lu seconds\n", replay_count, get_seconds() - start); } - if (!bdev_read_only(p_s_sb->s_bdev) && - _update_journal_header_block(p_s_sb, journal->j_start, + if (!bdev_read_only(sb->s_bdev) && + _update_journal_header_block(sb, journal->j_start, journal->j_last_flush_trans_id)) { /* replay failed, caller must call free_journal_ram and abort ** the mount @@ -2565,9 +2565,9 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) return jl; } -static void journal_list_init(struct super_block *p_s_sb) +static void journal_list_init(struct super_block *sb) { - SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); + SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); } static int release_journal_dev(struct super_block *super, @@ -2666,28 +2666,28 @@ static int journal_init_dev(struct super_block *super, */ #define REISERFS_STANDARD_BLKSIZE (4096) -static int check_advise_trans_params(struct super_block *p_s_sb, +static int check_advise_trans_params(struct super_block *sb, struct reiserfs_journal *journal) { if (journal->j_trans_max) { /* Non-default journal params. Do sanity check for them. */ int ratio = 1; - if (p_s_sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) - ratio = REISERFS_STANDARD_BLKSIZE / p_s_sb->s_blocksize; + if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) + ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize; if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio || journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio || - SB_ONDISK_JOURNAL_SIZE(p_s_sb) / journal->j_trans_max < + SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max < JOURNAL_MIN_RATIO) { - reiserfs_warning(p_s_sb, "sh-462", + reiserfs_warning(sb, "sh-462", "bad transaction max size (%u). " "FSCK?", journal->j_trans_max); return 1; } if (journal->j_max_batch != (journal->j_trans_max) * JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) { - reiserfs_warning(p_s_sb, "sh-463", + reiserfs_warning(sb, "sh-463", "bad transaction max batch (%u). " "FSCK?", journal->j_max_batch); return 1; @@ -2697,9 +2697,9 @@ static int check_advise_trans_params(struct super_block *p_s_sb, The file system was created by old version of mkreiserfs, so some fields contain zeros, and we need to advise proper values for them */ - if (p_s_sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { - reiserfs_warning(p_s_sb, "sh-464", "bad blocksize (%u)", - p_s_sb->s_blocksize); + if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { + reiserfs_warning(sb, "sh-464", "bad blocksize (%u)", + sb->s_blocksize); return 1; } journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; @@ -2712,10 +2712,10 @@ static int check_advise_trans_params(struct super_block *p_s_sb, /* ** must be called once on fs mount. 
calls journal_read for you */ -int journal_init(struct super_block *p_s_sb, const char *j_dev_name, +int journal_init(struct super_block *sb, const char *j_dev_name, int old_format, unsigned int commit_max_age) { - int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2; + int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2; struct buffer_head *bhjh; struct reiserfs_super_block *rs; struct reiserfs_journal_header *jh; @@ -2723,9 +2723,9 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, struct reiserfs_journal_list *jl; char b[BDEVNAME_SIZE]; - journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof(struct reiserfs_journal)); + journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); if (!journal) { - reiserfs_warning(p_s_sb, "journal-1256", + reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); return 1; } @@ -2735,50 +2735,50 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; - if (reiserfs_allocate_list_bitmaps(p_s_sb, + if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, - reiserfs_bmap_count(p_s_sb))) + reiserfs_bmap_count(sb))) goto free_and_return; - allocate_bitmap_nodes(p_s_sb); + allocate_bitmap_nodes(sb); /* reserved for journal area support */ - SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? + SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ? REISERFS_OLD_DISK_OFFSET_IN_BYTES - / p_s_sb->s_blocksize + - reiserfs_bmap_count(p_s_sb) + + / sb->s_blocksize + + reiserfs_bmap_count(sb) + 1 : REISERFS_DISK_OFFSET_IN_BYTES / - p_s_sb->s_blocksize + 2); + sb->s_blocksize + 2); /* Sanity check to see is the standard journal fitting withing first bitmap (actual for small blocksizes) */ - if (!SB_ONDISK_JOURNAL_DEVICE(p_s_sb) && - (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + - SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8)) { - reiserfs_warning(p_s_sb, "journal-1393", + if (!SB_ONDISK_JOURNAL_DEVICE(sb) && + (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) { + reiserfs_warning(sb, "journal-1393", "journal does not fit for area addressed " "by first of bitmap blocks. It starts at " "%u and its size is %u. 
Block size %ld", - SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb), - SB_ONDISK_JOURNAL_SIZE(p_s_sb), - p_s_sb->s_blocksize); + SB_JOURNAL_1st_RESERVED_BLOCK(sb), + SB_ONDISK_JOURNAL_SIZE(sb), + sb->s_blocksize); goto free_and_return; } - if (journal_init_dev(p_s_sb, journal, j_dev_name) != 0) { - reiserfs_warning(p_s_sb, "sh-462", + if (journal_init_dev(sb, journal, j_dev_name) != 0) { + reiserfs_warning(sb, "sh-462", "unable to initialize jornal device"); goto free_and_return; } - rs = SB_DISK_SUPER_BLOCK(p_s_sb); + rs = SB_DISK_SUPER_BLOCK(sb); /* read journal header */ - bhjh = journal_bread(p_s_sb, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + - SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + bhjh = journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); if (!bhjh) { - reiserfs_warning(p_s_sb, "sh-459", + reiserfs_warning(sb, "sh-459", "unable to read journal header"); goto free_and_return; } @@ -2788,7 +2788,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, if (is_reiserfs_jr(rs) && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != sb_jp_journal_magic(rs))) { - reiserfs_warning(p_s_sb, "sh-460", + reiserfs_warning(sb, "sh-460", "journal header magic %x (device %s) does " "not match to magic found in super block %x", jh->jh_journal.jp_journal_magic, @@ -2804,7 +2804,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; - if (check_advise_trans_params(p_s_sb, journal) != 0) + if (check_advise_trans_params(sb, journal) != 0) goto free_and_return; journal->j_default_max_commit_age = journal->j_max_commit_age; @@ -2813,12 +2813,12 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal->j_max_trans_age = commit_max_age; } - reiserfs_info(p_s_sb, "journal params: device %s, size %u, " + reiserfs_info(sb, "journal params: device %s, size %u, " "journal first block %u, max trans len %u, max batch %u, " "max commit age %u, max trans age %u\n", bdevname(journal->j_dev_bd, b), - SB_ONDISK_JOURNAL_SIZE(p_s_sb), - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + SB_ONDISK_JOURNAL_SIZE(sb), + SB_ONDISK_JOURNAL_1st_BLOCK(sb), journal->j_trans_max, journal->j_max_batch, journal->j_max_commit_age, journal->j_max_trans_age); @@ -2826,7 +2826,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, brelse(bhjh); journal->j_list_bitmap_index = 0; - journal_list_init(p_s_sb); + journal_list_init(sb); memset(journal->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); @@ -2858,7 +2858,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal->j_must_wait = 0; if (journal->j_cnode_free == 0) { - reiserfs_warning(p_s_sb, "journal-2004", "Journal cnode memory " + reiserfs_warning(sb, "journal-2004", "Journal cnode memory " "allocation failed (%ld bytes). Journal is " "too large for available memory. 
Usually " "this is due to a journal that is too large.", @@ -2866,16 +2866,16 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, goto free_and_return; } - init_journal_hash(p_s_sb); + init_journal_hash(sb); jl = journal->j_current_jl; - jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); + jl->j_list_bitmap = get_list_bitmap(sb, jl); if (!jl->j_list_bitmap) { - reiserfs_warning(p_s_sb, "journal-2005", + reiserfs_warning(sb, "journal-2005", "get_list_bitmap failed for journal list 0"); goto free_and_return; } - if (journal_read(p_s_sb) < 0) { - reiserfs_warning(p_s_sb, "reiserfs-2006", + if (journal_read(sb) < 0) { + reiserfs_warning(sb, "reiserfs-2006", "Replay Failure, unable to mount"); goto free_and_return; } @@ -2885,10 +2885,10 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, commit_wq = create_workqueue("reiserfs"); INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); - journal->j_work_sb = p_s_sb; + journal->j_work_sb = sb; return 0; free_and_return: - free_journal_ram(p_s_sb); + free_journal_ram(sb); return 1; } @@ -3004,37 +3004,37 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) ** expect to use in nblocks. */ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, unsigned long nblocks, + struct super_block *sb, unsigned long nblocks, int join) { time_t now = get_seconds(); unsigned int old_trans_id; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_transaction_handle myth; int sched_count = 0; int retval; - reiserfs_check_lock_depth(p_s_sb, "journal_begin"); + reiserfs_check_lock_depth(sb, "journal_begin"); BUG_ON(nblocks > journal->j_trans_max); - PROC_INFO_INC(p_s_sb, journal.journal_being); + PROC_INFO_INC(sb, journal.journal_being); /* set here for journal_join */ th->t_refcount = 1; - th->t_super = p_s_sb; + th->t_super = sb; relock: - lock_journal(p_s_sb); + lock_journal(sb); if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { - unlock_journal(p_s_sb); + unlock_journal(sb); retval = journal->j_errno; goto out_fail; } journal->j_bcount++; if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { - unlock_journal(p_s_sb); - reiserfs_wait_on_write_block(p_s_sb); - PROC_INFO_INC(p_s_sb, journal.journal_relock_writers); + unlock_journal(sb); + reiserfs_wait_on_write_block(sb); + PROC_INFO_INC(sb, journal.journal_relock_writers); goto relock; } now = get_seconds(); @@ -3055,7 +3055,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { old_trans_id = journal->j_trans_id; - unlock_journal(p_s_sb); /* allow others to finish this transaction */ + unlock_journal(sb); /* allow others to finish this transaction */ if (!join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch && @@ -3063,7 +3063,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, (journal->j_len_alloc * 75)) { if (atomic_read(&journal->j_wcount) > 10) { sched_count++; - queue_log_writer(p_s_sb); + queue_log_writer(sb); goto relock; } } @@ -3073,25 +3073,25 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, if (atomic_read(&journal->j_jlock)) { while (journal->j_trans_id == old_trans_id && atomic_read(&journal->j_jlock)) { - queue_log_writer(p_s_sb); + queue_log_writer(sb); } goto relock; } - retval = journal_join(&myth, p_s_sb, 1); + retval = journal_join(&myth, sb, 
1); if (retval) goto out_fail; /* someone might have ended the transaction while we joined */ if (old_trans_id != journal->j_trans_id) { - retval = do_journal_end(&myth, p_s_sb, 1, 0); + retval = do_journal_end(&myth, sb, 1, 0); } else { - retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW); + retval = do_journal_end(&myth, sb, 1, COMMIT_NOW); } if (retval) goto out_fail; - PROC_INFO_INC(p_s_sb, journal.journal_relock_wcount); + PROC_INFO_INC(sb, journal.journal_relock_wcount); goto relock; } /* we are the first writer, set trans_id */ @@ -3103,7 +3103,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, th->t_blocks_logged = 0; th->t_blocks_allocated = nblocks; th->t_trans_id = journal->j_trans_id; - unlock_journal(p_s_sb); + unlock_journal(sb); INIT_LIST_HEAD(&th->t_list); get_fs_excl(); return 0; @@ -3113,7 +3113,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, /* Re-set th->t_super, so we can properly keep track of how many * persistent transactions there are. We need to do this so if this * call is part of a failed restart_transaction, we can free it later */ - th->t_super = p_s_sb; + th->t_super = sb; return retval; } @@ -3164,7 +3164,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) } static int journal_join(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, unsigned long nblocks) + struct super_block *sb, unsigned long nblocks) { struct reiserfs_transaction_handle *cur_th = current->journal_info; @@ -3173,11 +3173,11 @@ static int journal_join(struct reiserfs_transaction_handle *th, */ th->t_handle_save = cur_th; BUG_ON(cur_th && cur_th->t_refcount > 1); - return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN); + return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN); } int journal_join_abort(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, unsigned long nblocks) + struct super_block *sb, unsigned long nblocks) { struct reiserfs_transaction_handle *cur_th = current->journal_info; @@ -3186,11 +3186,11 @@ int journal_join_abort(struct reiserfs_transaction_handle *th, */ th->t_handle_save = cur_th; BUG_ON(cur_th && cur_th->t_refcount > 1); - return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT); + return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT); } int journal_begin(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, unsigned long nblocks) + struct super_block *sb, unsigned long nblocks) { struct reiserfs_transaction_handle *cur_th = current->journal_info; int ret; @@ -3198,12 +3198,12 @@ int journal_begin(struct reiserfs_transaction_handle *th, th->t_handle_save = NULL; if (cur_th) { /* we are nesting into the current transaction */ - if (cur_th->t_super == p_s_sb) { + if (cur_th->t_super == sb) { BUG_ON(!cur_th->t_refcount); cur_th->t_refcount++; memcpy(th, cur_th, sizeof(*th)); if (th->t_refcount <= 1) - reiserfs_warning(p_s_sb, "reiserfs-2005", + reiserfs_warning(sb, "reiserfs-2005", "BAD: refcount <= 1, but " "journal_info != 0"); return 0; @@ -3212,7 +3212,7 @@ int journal_begin(struct reiserfs_transaction_handle *th, ** save it and restore on journal_end. This should never ** really happen... 
*/ - reiserfs_warning(p_s_sb, "clm-2100", + reiserfs_warning(sb, "clm-2100", "nesting info a different FS"); th->t_handle_save = current->journal_info; current->journal_info = th; @@ -3220,7 +3220,7 @@ int journal_begin(struct reiserfs_transaction_handle *th, } else { current->journal_info = th; } - ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG); + ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG); BUG_ON(current->journal_info != th); /* I guess this boils down to being the reciprocal of clm-2100 above. @@ -3244,28 +3244,28 @@ int journal_begin(struct reiserfs_transaction_handle *th, ** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. */ int journal_mark_dirty(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, struct buffer_head *bh) + struct super_block *sb, struct buffer_head *bh) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn = NULL; int count_already_incd = 0; int prepared = 0; BUG_ON(!th->t_trans_id); - PROC_INFO_INC(p_s_sb, journal.mark_dirty); + PROC_INFO_INC(sb, journal.mark_dirty); if (th->t_trans_id != journal->j_trans_id) { reiserfs_panic(th->t_super, "journal-1577", "handle trans id %ld != current trans id %ld", th->t_trans_id, journal->j_trans_id); } - p_s_sb->s_dirt = 1; + sb->s_dirt = 1; prepared = test_clear_buffer_journal_prepared(bh); clear_buffer_journal_restore_dirty(bh); /* already in this transaction, we are done */ if (buffer_journaled(bh)) { - PROC_INFO_INC(p_s_sb, journal.mark_dirty_already); + PROC_INFO_INC(sb, journal.mark_dirty_already); return 0; } @@ -3274,7 +3274,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, ** could get to disk too early. NOT GOOD. 
*/ if (!prepared || buffer_dirty(bh)) { - reiserfs_warning(p_s_sb, "journal-1777", + reiserfs_warning(sb, "journal-1777", "buffer %llu bad state " "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", (unsigned long long)bh->b_blocknr, @@ -3285,7 +3285,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, } if (atomic_read(&(journal->j_wcount)) <= 0) { - reiserfs_warning(p_s_sb, "journal-1409", + reiserfs_warning(sb, "journal-1409", "returning because j_wcount was %d", atomic_read(&(journal->j_wcount))); return 1; @@ -3301,7 +3301,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, if (buffer_journal_dirty(bh)) { count_already_incd = 1; - PROC_INFO_INC(p_s_sb, journal.mark_dirty_notjournal); + PROC_INFO_INC(sb, journal.mark_dirty_notjournal); clear_buffer_journal_dirty(bh); } @@ -3313,10 +3313,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, /* now put this guy on the end */ if (!cn) { - cn = get_cnode(p_s_sb); + cn = get_cnode(sb); if (!cn) { - reiserfs_panic(p_s_sb, "journal-4", - "get_cnode failed!"); + reiserfs_panic(sb, "journal-4", "get_cnode failed!"); } if (th->t_blocks_logged == th->t_blocks_allocated) { @@ -3328,7 +3327,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, cn->bh = bh; cn->blocknr = bh->b_blocknr; - cn->sb = p_s_sb; + cn->sb = sb; cn->jlist = NULL; insert_journal_hash(journal->j_hash_table, cn); if (!count_already_incd) { @@ -3349,10 +3348,10 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, } int journal_end(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, unsigned long nblocks) + struct super_block *sb, unsigned long nblocks) { if (!current->journal_info && th->t_refcount > 1) - reiserfs_warning(p_s_sb, "REISER-NESTING", + reiserfs_warning(sb, "REISER-NESTING", "th NULL, refcount %d", th->t_refcount); if (!th->t_trans_id) { @@ -3376,7 +3375,7 @@ int journal_end(struct reiserfs_transaction_handle *th, } return 0; } else { - return do_journal_end(th, p_s_sb, nblocks, 0); + return do_journal_end(th, sb, nblocks, 0); } } @@ -3387,15 +3386,15 @@ int journal_end(struct reiserfs_transaction_handle *th, ** ** returns 1 if it cleaned and relsed the buffer. 
0 otherwise */ -static int remove_from_transaction(struct super_block *p_s_sb, +static int remove_from_transaction(struct super_block *sb, b_blocknr_t blocknr, int already_cleaned) { struct buffer_head *bh; struct reiserfs_journal_cnode *cn; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); int ret = 0; - cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); if (!cn || !cn->bh) { return ret; } @@ -3413,7 +3412,7 @@ static int remove_from_transaction(struct super_block *p_s_sb, journal->j_last = cn->prev; } if (bh) - remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, + remove_journal_hash(sb, journal->j_hash_table, NULL, bh->b_blocknr, 0); clear_buffer_journaled(bh); /* don't log this one */ @@ -3423,14 +3422,14 @@ static int remove_from_transaction(struct super_block *p_s_sb, clear_buffer_journal_test(bh); put_bh(bh); if (atomic_read(&(bh->b_count)) < 0) { - reiserfs_warning(p_s_sb, "journal-1752", + reiserfs_warning(sb, "journal-1752", "b_count < 0"); } ret = 1; } journal->j_len--; journal->j_len_alloc--; - free_cnode(p_s_sb, cn); + free_cnode(sb, cn); return ret; } @@ -3481,19 +3480,19 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) ** will wait until the current transaction is done/committed before returning */ int journal_end_sync(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, unsigned long nblocks) + struct super_block *sb, unsigned long nblocks) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); BUG_ON(!th->t_trans_id); /* you can sync while nested, very, very bad */ BUG_ON(th->t_refcount > 1); if (journal->j_len == 0) { - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); + journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); } - return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT); + return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT); } /* @@ -3503,7 +3502,7 @@ static void flush_async_commits(struct work_struct *work) { struct reiserfs_journal *journal = container_of(work, struct reiserfs_journal, j_work.work); - struct super_block *p_s_sb = journal->j_work_sb; + struct super_block *sb = journal->j_work_sb; struct reiserfs_journal_list *jl; struct list_head *entry; @@ -3512,7 +3511,7 @@ static void flush_async_commits(struct work_struct *work) /* last entry is the youngest, commit it and you get everything */ entry = journal->j_journal_list.prev; jl = JOURNAL_LIST_ENTRY(entry); - flush_commit_list(p_s_sb, jl, 1); + flush_commit_list(sb, jl, 1); } unlock_kernel(); } @@ -3521,11 +3520,11 @@ static void flush_async_commits(struct work_struct *work) ** flushes any old transactions to disk ** ends the current transaction if it is too old */ -int reiserfs_flush_old_commits(struct super_block *p_s_sb) +int reiserfs_flush_old_commits(struct super_block *sb) { time_t now; struct reiserfs_transaction_handle th; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); now = get_seconds(); /* safety check so we don't flush while we are replaying the log during @@ -3542,20 +3541,20 @@ int reiserfs_flush_old_commits(struct super_block *p_s_sb) journal->j_trans_start_time > 0 && journal->j_len > 0 && (now - journal->j_trans_start_time) > journal->j_max_trans_age) 
{ - if (!journal_join(&th, p_s_sb, 1)) { - reiserfs_prepare_for_journal(p_s_sb, - SB_BUFFER_WITH_SB(p_s_sb), + if (!journal_join(&th, sb, 1)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&th, p_s_sb, - SB_BUFFER_WITH_SB(p_s_sb)); + journal_mark_dirty(&th, sb, + SB_BUFFER_WITH_SB(sb)); /* we're only being called from kreiserfsd, it makes no sense to do ** an async commit so that kreiserfsd can do it later */ - do_journal_end(&th, p_s_sb, 1, COMMIT_NOW | WAIT); + do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT); } } - return p_s_sb->s_dirt; + return sb->s_dirt; } /* @@ -3570,7 +3569,7 @@ int reiserfs_flush_old_commits(struct super_block *p_s_sb) ** Note, we can't allow the journal_end to proceed while there are still writers in the log. */ static int check_journal_end(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, unsigned long nblocks, + struct super_block *sb, unsigned long nblocks, int flags) { @@ -3579,7 +3578,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, int commit_now = flags & COMMIT_NOW; int wait_on_commit = flags & WAIT; struct reiserfs_journal_list *jl; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); BUG_ON(!th->t_trans_id); @@ -3618,31 +3617,31 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, if (flush) { journal->j_next_full_flush = 1; } - unlock_journal(p_s_sb); + unlock_journal(sb); /* sleep while the current transaction is still j_jlocked */ while (journal->j_trans_id == trans_id) { if (atomic_read(&journal->j_jlock)) { - queue_log_writer(p_s_sb); + queue_log_writer(sb); } else { - lock_journal(p_s_sb); + lock_journal(sb); if (journal->j_trans_id == trans_id) { atomic_set(&(journal->j_jlock), 1); } - unlock_journal(p_s_sb); + unlock_journal(sb); } } BUG_ON(journal->j_trans_id == trans_id); if (commit_now - && journal_list_still_alive(p_s_sb, trans_id) + && journal_list_still_alive(sb, trans_id) && wait_on_commit) { - flush_commit_list(p_s_sb, jl, 1); + flush_commit_list(sb, jl, 1); } return 0; } - unlock_journal(p_s_sb); + unlock_journal(sb); return 0; } @@ -3659,12 +3658,12 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, && journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) { journal->j_bcount++; - unlock_journal(p_s_sb); + unlock_journal(sb); return 0; } - if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { - reiserfs_panic(p_s_sb, "journal-003", + if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) { + reiserfs_panic(sb, "journal-003", "j_start (%ld) is too high", journal->j_start); } @@ -3686,16 +3685,16 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, ** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. 
*/ int journal_mark_freed(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, b_blocknr_t blocknr) + struct super_block *sb, b_blocknr_t blocknr) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn = NULL; struct buffer_head *bh = NULL; struct reiserfs_list_bitmap *jb = NULL; int cleaned = 0; BUG_ON(!th->t_trans_id); - cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); if (cn && cn->bh) { bh = cn->bh; get_bh(bh); @@ -3705,15 +3704,15 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, clear_buffer_journal_new(bh); clear_prepared_bits(bh); reiserfs_clean_and_file_buffer(bh); - cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned); + cleaned = remove_from_transaction(sb, blocknr, cleaned); } else { /* set the bit for this block in the journal bitmap for this transaction */ jb = journal->j_current_jl->j_list_bitmap; if (!jb) { - reiserfs_panic(p_s_sb, "journal-1702", + reiserfs_panic(sb, "journal-1702", "journal_list_bitmap is NULL"); } - set_bit_in_list_bitmap(p_s_sb, blocknr, jb); + set_bit_in_list_bitmap(sb, blocknr, jb); /* Note, the entire while loop is not allowed to schedule. */ @@ -3721,13 +3720,13 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, clear_prepared_bits(bh); reiserfs_clean_and_file_buffer(bh); } - cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned); + cleaned = remove_from_transaction(sb, blocknr, cleaned); /* find all older transactions with this block, make sure they don't try to write it out */ - cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, + cn = get_journal_hash_dev(sb, journal->j_list_hash_table, blocknr); while (cn) { - if (p_s_sb == cn->sb && blocknr == cn->blocknr) { + if (sb == cn->sb && blocknr == cn->blocknr) { set_bit(BLOCK_FREED, &cn->state); if (cn->bh) { if (!cleaned) { @@ -3743,7 +3742,7 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, put_bh(cn->bh); if (atomic_read (&(cn->bh->b_count)) < 0) { - reiserfs_warning(p_s_sb, + reiserfs_warning(sb, "journal-2138", "cn->bh->b_count < 0"); } @@ -3850,18 +3849,18 @@ int reiserfs_commit_for_inode(struct inode *inode) return __commit_trans_jl(inode, id, jl); } -void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, +void reiserfs_restore_prepared_buffer(struct super_block *sb, struct buffer_head *bh) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - PROC_INFO_INC(p_s_sb, journal.restore_prepared); + struct reiserfs_journal *journal = SB_JOURNAL(sb); + PROC_INFO_INC(sb, journal.restore_prepared); if (!bh) { return; } if (test_clear_buffer_journal_restore_dirty(bh) && buffer_journal_dirty(bh)) { struct reiserfs_journal_cnode *cn; - cn = get_journal_hash_dev(p_s_sb, + cn = get_journal_hash_dev(sb, journal->j_list_hash_table, bh->b_blocknr); if (cn && can_dirty(cn)) { @@ -3880,10 +3879,10 @@ extern struct tree_balance *cur_tb; ** wait on it. ** */ -int reiserfs_prepare_for_journal(struct super_block *p_s_sb, +int reiserfs_prepare_for_journal(struct super_block *sb, struct buffer_head *bh, int wait) { - PROC_INFO_INC(p_s_sb, journal.prepare); + PROC_INFO_INC(sb, journal.prepare); if (!trylock_buffer(bh)) { if (!wait) @@ -3931,10 +3930,10 @@ static void flush_old_journal_lists(struct super_block *s) ** journal lists, etc just won't happen. 
*/ static int do_journal_end(struct reiserfs_transaction_handle *th, - struct super_block *p_s_sb, unsigned long nblocks, + struct super_block *sb, unsigned long nblocks, int flags) { - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn, *next, *jl_cn; struct reiserfs_journal_cnode *last_cn = NULL; struct reiserfs_journal_desc *desc; @@ -3964,14 +3963,14 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, put_fs_excl(); current->journal_info = th->t_handle_save; - reiserfs_check_lock_depth(p_s_sb, "journal end"); + reiserfs_check_lock_depth(sb, "journal end"); if (journal->j_len == 0) { - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); + journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); } - lock_journal(p_s_sb); + lock_journal(sb); if (journal->j_next_full_flush) { flags |= FLUSH_ALL; flush = 1; @@ -3984,10 +3983,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, /* check_journal_end locks the journal, and unlocks if it does not return 1 ** it tells us if we should continue with the journal_end, or just return */ - if (!check_journal_end(th, p_s_sb, nblocks, flags)) { - p_s_sb->s_dirt = 1; - wake_queued_writers(p_s_sb); - reiserfs_async_progress_wait(p_s_sb); + if (!check_journal_end(th, sb, nblocks, flags)) { + sb->s_dirt = 1; + wake_queued_writers(sb); + reiserfs_async_progress_wait(sb); goto out; } @@ -4016,8 +4015,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, /* setup description block */ d_bh = - journal_getblk(p_s_sb, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start); set_buffer_uptodate(d_bh); desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; @@ -4026,9 +4025,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, set_desc_trans_id(desc, journal->j_trans_id); /* setup commit block. 
Don't write (keep it clean too) this one until after everyone else is written */ - c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((journal->j_start + journal->j_len + - 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); commit = (struct reiserfs_journal_commit *)c_bh->b_data; memset(c_bh->b_data, 0, c_bh->b_size); set_commit_trans_id(commit, journal->j_trans_id); @@ -4061,12 +4060,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, ** for each real block, add it to the journal list hash, ** copy into real block index array in the commit or desc block */ - trans_half = journal_trans_half(p_s_sb->s_blocksize); + trans_half = journal_trans_half(sb->s_blocksize); for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { if (buffer_journaled(cn->bh)) { - jl_cn = get_cnode(p_s_sb); + jl_cn = get_cnode(sb); if (!jl_cn) { - reiserfs_panic(p_s_sb, "journal-1676", + reiserfs_panic(sb, "journal-1676", "get_cnode returned NULL"); } if (i == 0) { @@ -4082,15 +4081,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, of journal or reserved area */ if (is_block_in_log_or_reserved_area - (p_s_sb, cn->bh->b_blocknr)) { - reiserfs_panic(p_s_sb, "journal-2332", + (sb, cn->bh->b_blocknr)) { + reiserfs_panic(sb, "journal-2332", "Trying to log block %lu, " "which is a log block", cn->bh->b_blocknr); } jl_cn->blocknr = cn->bh->b_blocknr; jl_cn->state = 0; - jl_cn->sb = p_s_sb; + jl_cn->sb = sb; jl_cn->bh = cn->bh; jl_cn->jlist = jl; insert_journal_hash(journal->j_list_hash_table, jl_cn); @@ -4131,11 +4130,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, char *addr; struct page *page; tmp_bh = - journal_getblk(p_s_sb, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((cur_write_start + jindex) % - SB_ONDISK_JOURNAL_SIZE(p_s_sb))); + SB_ONDISK_JOURNAL_SIZE(sb))); set_buffer_uptodate(tmp_bh); page = cn->bh->b_page; addr = kmap(page); @@ -4149,13 +4148,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, clear_buffer_journaled(cn->bh); } else { /* JDirty cleared sometime during transaction. don't log this one */ - reiserfs_warning(p_s_sb, "journal-2048", + reiserfs_warning(sb, "journal-2048", "BAD, buffer in journal hash, " "but not JDirty!"); brelse(cn->bh); } next = cn->next; - free_cnode(p_s_sb, cn); + free_cnode(sb, cn); cn = next; cond_resched(); } @@ -4165,7 +4164,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
*/ - journal->j_current_jl = alloc_journal_list(p_s_sb); + journal->j_current_jl = alloc_journal_list(sb); /* now it is safe to insert this transaction on the main list */ list_add_tail(&jl->j_list, &journal->j_journal_list); @@ -4176,7 +4175,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, old_start = journal->j_start; journal->j_start = (journal->j_start + journal->j_len + - 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb); + 2) % SB_ONDISK_JOURNAL_SIZE(sb); atomic_set(&(journal->j_wcount), 0); journal->j_bcount = 0; journal->j_last = NULL; @@ -4191,7 +4190,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, journal->j_len_alloc = 0; journal->j_next_full_flush = 0; journal->j_next_async_flush = 0; - init_journal_hash(p_s_sb); + init_journal_hash(sb); // make sure reiserfs_add_jh sees the new current_jl before we // write out the tails @@ -4220,8 +4219,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, ** queue don't wait for this proc to flush journal lists and such. */ if (flush) { - flush_commit_list(p_s_sb, jl, 1); - flush_journal_list(p_s_sb, jl, 1); + flush_commit_list(sb, jl, 1); + flush_journal_list(sb, jl, 1); } else if (!(jl->j_state & LIST_COMMIT_PENDING)) queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); @@ -4235,11 +4234,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, if (journal->j_start <= temp_jl->j_start) { if ((journal->j_start + journal->j_trans_max + 1) >= temp_jl->j_start) { - flush_used_journal_lists(p_s_sb, temp_jl); + flush_used_journal_lists(sb, temp_jl); goto first_jl; } else if ((journal->j_start + journal->j_trans_max + 1) < - SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { + SB_ONDISK_JOURNAL_SIZE(sb)) { /* if we don't cross into the next transaction and we don't * wrap, there is no way we can overlap any later transactions * break now @@ -4248,11 +4247,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, } } else if ((journal->j_start + journal->j_trans_max + 1) > - SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { + SB_ONDISK_JOURNAL_SIZE(sb)) { if (((journal->j_start + journal->j_trans_max + 1) % - SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= + SB_ONDISK_JOURNAL_SIZE(sb)) >= temp_jl->j_start) { - flush_used_journal_lists(p_s_sb, temp_jl); + flush_used_journal_lists(sb, temp_jl); goto first_jl; } else { /* we don't overlap anything from out start to the end of the @@ -4263,34 +4262,34 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, } } } - flush_old_journal_lists(p_s_sb); + flush_old_journal_lists(sb); journal->j_current_jl->j_list_bitmap = - get_list_bitmap(p_s_sb, journal->j_current_jl); + get_list_bitmap(sb, journal->j_current_jl); if (!(journal->j_current_jl->j_list_bitmap)) { - reiserfs_panic(p_s_sb, "journal-1996", + reiserfs_panic(sb, "journal-1996", "could not get a list bitmap"); } atomic_set(&(journal->j_jlock), 0); - unlock_journal(p_s_sb); + unlock_journal(sb); /* wake up any body waiting to join. */ clear_bit(J_WRITERS_QUEUED, &journal->j_state); wake_up(&(journal->j_join_wait)); if (!flush && wait_on_commit && - journal_list_still_alive(p_s_sb, commit_trans_id)) { - flush_commit_list(p_s_sb, jl, 1); + journal_list_still_alive(sb, commit_trans_id)) { + flush_commit_list(sb, jl, 1); } out: - reiserfs_check_lock_depth(p_s_sb, "journal end2"); + reiserfs_check_lock_depth(sb, "journal end2"); memset(th, 0, sizeof(*th)); /* Re-set th->t_super, so we can properly keep track of how many * persistent transactions there are. 
We need to do this so if this * call is part of a failed restart_transaction, we can free it later */ - th->t_super = p_s_sb; + th->t_super = sb; return journal->j_errno; } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index a65bfee28bb8..00fd879c4a2a 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -245,7 +245,7 @@ static const struct reiserfs_key MAX_KEY = { static inline const struct reiserfs_key *get_lkey(const struct treepath *p_s_chk_path, const struct super_block - *p_s_sb) + *sb) { int n_position, n_path_offset = p_s_chk_path->path_length; struct buffer_head *p_s_parent; @@ -282,14 +282,14 @@ static inline const struct reiserfs_key *get_lkey(const struct treepath } /* Return MIN_KEY if we are in the root of the buffer tree. */ if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)-> - b_blocknr == SB_ROOT_BLOCK(p_s_sb)) + b_blocknr == SB_ROOT_BLOCK(sb)) return &MIN_KEY; return &MAX_KEY; } /* Get delimiting key of the buffer at the path and its right neighbor. */ inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, - const struct super_block *p_s_sb) + const struct super_block *sb) { int n_position, n_path_offset = p_s_chk_path->path_length; struct buffer_head *p_s_parent; @@ -325,7 +325,7 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, } /* Return MAX_KEY if we are in the root of the buffer tree. */ if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)-> - b_blocknr == SB_ROOT_BLOCK(p_s_sb)) + b_blocknr == SB_ROOT_BLOCK(sb)) return &MAX_KEY; return &MIN_KEY; } @@ -337,7 +337,7 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ static inline int key_in_buffer(struct treepath *p_s_chk_path, /* Path which should be checked. */ const struct cpu_key *p_s_key, /* Key which should be checked. */ - struct super_block *p_s_sb /* Super block pointer. */ + struct super_block *sb /* Super block pointer. */ ) { @@ -348,11 +348,11 @@ static inline int key_in_buffer(struct treepath *p_s_chk_path, /* Path which sho RFALSE(!PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev, "PAP-5060: device must not be NODEV"); - if (comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1) + if (comp_keys(get_lkey(p_s_chk_path, sb), p_s_key) == 1) /* left delimiting key is bigger, that the key we look for */ return 0; - // if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 ) - if (comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1) + // if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, sb)) != -1 ) + if (comp_keys(get_rkey(p_s_chk_path, sb), p_s_key) != 1) /* p_s_key must be less than right delimitiing key */ return 0; return 1; @@ -546,7 +546,7 @@ static void search_by_key_reada(struct super_block *s, /************************************************************************** * Algorithm SearchByKey * * look for item in the Disk S+Tree by its key * - * Input: p_s_sb - super block * + * Input: sb - super block * * p_s_key - pointer to the key to search * * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * * p_s_search_path - path from the root to the needed leaf * @@ -566,7 +566,7 @@ static void search_by_key_reada(struct super_block *s, correctness of the top of the path but need not be checked for the correctness of the bottom of the path */ /* The function is NOT SCHEDULE-SAFE! */ -int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* Key to search. 
*/ +int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key to search. */ struct treepath *p_s_search_path,/* This structure was allocated and initialized by the calling @@ -592,7 +592,7 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* int n_repeat_counter = 0; #endif - PROC_INFO_INC(p_s_sb, search_by_key); + PROC_INFO_INC(sb, search_by_key); /* As we add each node to a path we increase its count. This means that we must be careful to release all nodes in a path before we either @@ -605,13 +605,13 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* /* With each iteration of this loop we search through the items in the current node, and calculate the next current node(next path element) for the next iteration of this loop.. */ - n_block_number = SB_ROOT_BLOCK(p_s_sb); + n_block_number = SB_ROOT_BLOCK(sb); expected_level = -1; while (1) { #ifdef CONFIG_REISERFS_CHECK if (!(++n_repeat_counter % 50000)) - reiserfs_warning(p_s_sb, "PAP-5100", + reiserfs_warning(sb, "PAP-5100", "%s: there were %d iterations of " "while loop looking for key %K", current->comm, n_repeat_counter, @@ -622,14 +622,14 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* p_s_last_element = PATH_OFFSET_PELEMENT(p_s_search_path, ++p_s_search_path->path_length); - fs_gen = get_generation(p_s_sb); + fs_gen = get_generation(sb); /* Read the next tree node, and set the last element in the path to have a pointer to it. */ if ((p_s_bh = p_s_last_element->pe_buffer = - sb_getblk(p_s_sb, n_block_number))) { + sb_getblk(sb, n_block_number))) { if (!buffer_uptodate(p_s_bh) && reada_count > 1) { - search_by_key_reada(p_s_sb, reada_bh, + search_by_key_reada(sb, reada_bh, reada_blocks, reada_count); } ll_rw_block(READ, 1, &p_s_bh); @@ -644,25 +644,25 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* } reada_count = 0; if (expected_level == -1) - expected_level = SB_TREE_HEIGHT(p_s_sb); + expected_level = SB_TREE_HEIGHT(sb); expected_level--; /* It is possible that schedule occurred. We must check whether the key to search is still in the tree rooted from the current buffer. If not then repeat search from the root. */ - if (fs_changed(fs_gen, p_s_sb) && + if (fs_changed(fs_gen, sb) && (!B_IS_IN_TREE(p_s_bh) || B_LEVEL(p_s_bh) != expected_level || - !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) { - PROC_INFO_INC(p_s_sb, search_by_key_fs_changed); - PROC_INFO_INC(p_s_sb, search_by_key_restarted); - PROC_INFO_INC(p_s_sb, + !key_in_buffer(p_s_search_path, p_s_key, sb))) { + PROC_INFO_INC(sb, search_by_key_fs_changed); + PROC_INFO_INC(sb, search_by_key_restarted); + PROC_INFO_INC(sb, sbk_restarted[expected_level - 1]); pathrelse(p_s_search_path); /* Get the root block number so that we can repeat the search starting from the root. */ - n_block_number = SB_ROOT_BLOCK(p_s_sb); + n_block_number = SB_ROOT_BLOCK(sb); expected_level = -1; right_neighbor_of_leaf_node = 0; @@ -674,12 +674,12 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* equal to the MAX_KEY. Latter case is only possible in "finish_unfinished()" processing during mount. 
*/ RFALSE(comp_keys(&MAX_KEY, p_s_key) && - !key_in_buffer(p_s_search_path, p_s_key, p_s_sb), + !key_in_buffer(p_s_search_path, p_s_key, sb), "PAP-5130: key is not in the buffer"); #ifdef CONFIG_REISERFS_CHECK if (cur_tb) { print_cur_tb("5140"); - reiserfs_panic(p_s_sb, "PAP-5140", + reiserfs_panic(sb, "PAP-5140", "schedule occurred in do_balance!"); } #endif @@ -687,7 +687,7 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* // make sure, that the node contents look like a node of // certain level if (!is_tree_node(p_s_bh, expected_level)) { - reiserfs_error(p_s_sb, "vs-5150", + reiserfs_error(sb, "vs-5150", "invalid format found in block %ld. " "Fsck?", p_s_bh->b_blocknr); pathrelse(p_s_search_path); @@ -697,7 +697,7 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* /* ok, we have acquired next formatted node in the tree */ n_node_level = B_LEVEL(p_s_bh); - PROC_INFO_BH_STAT(p_s_sb, p_s_bh, n_node_level - 1); + PROC_INFO_BH_STAT(sb, p_s_bh, n_node_level - 1); RFALSE(n_node_level < n_stop_level, "vs-5152: tree level (%d) is less than stop level (%d)", @@ -776,7 +776,7 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* units of directory entries. */ /* The function is NOT SCHEDULE-SAFE! */ -int search_for_position_by_key(struct super_block *p_s_sb, /* Pointer to the super block. */ +int search_for_position_by_key(struct super_block *sb, /* Pointer to the super block. */ const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */ struct treepath *p_s_search_path /* Filled up by this function. */ ) @@ -789,13 +789,13 @@ int search_for_position_by_key(struct super_block *p_s_sb, /* Pointer to the sup /* If searching for directory entry. */ if (is_direntry_cpu_key(p_cpu_key)) - return search_by_entry_key(p_s_sb, p_cpu_key, p_s_search_path, + return search_by_entry_key(sb, p_cpu_key, p_s_search_path, &de); /* If not searching for directory entry. */ /* If item is found. */ - retval = search_item(p_s_sb, p_cpu_key, p_s_search_path); + retval = search_item(sb, p_cpu_key, p_s_search_path); if (retval == IO_ERROR) return retval; if (retval == ITEM_FOUND) { @@ -817,7 +817,7 @@ int search_for_position_by_key(struct super_block *p_s_sb, /* Pointer to the sup p_le_ih = B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), --PATH_LAST_POSITION(p_s_search_path)); - n_blk_size = p_s_sb->s_blocksize; + n_blk_size = sb->s_blocksize; if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) { return FILE_NOT_FOUND; @@ -957,7 +957,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st int *p_n_cut_size, unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */ ) { - struct super_block *p_s_sb = inode->i_sb; + struct super_block *sb = inode->i_sb; struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_path); struct buffer_head *p_s_bh = PATH_PLAST_BUFFER(p_s_path); @@ -986,7 +986,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st /* Case of an indirect item. 
*/ { - int blk_size = p_s_sb->s_blocksize; + int blk_size = sb->s_blocksize; struct item_head s_ih; int need_re_search; int delete = 0; @@ -1023,9 +1023,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st block = get_block_num(unfm, 0); if (block != 0) { - reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1); + reiserfs_prepare_for_journal(sb, p_s_bh, 1); put_block_num(unfm, 0, 0); - journal_mark_dirty (th, p_s_sb, p_s_bh); + journal_mark_dirty (th, sb, p_s_bh); reiserfs_free_block(th, inode, block, 1); } @@ -1049,9 +1049,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st /* a trick. If the buffer has been logged, this will do nothing. If ** we've broken the loop without logging it, it will restore the ** buffer */ - reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh); + reiserfs_restore_prepared_buffer(sb, p_s_bh); } while (need_re_search && - search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND); + search_for_position_by_key(sb, p_s_item_key, p_s_path) == POSITION_FOUND); pos_in_item(p_s_path) = pos * UNFM_P_SIZE; if (*p_n_cut_size == 0) { @@ -1090,7 +1090,7 @@ static int calc_deleted_bytes_number(struct tree_balance *p_s_tb, char c_mode) static void init_tb_struct(struct reiserfs_transaction_handle *th, struct tree_balance *p_s_tb, - struct super_block *p_s_sb, + struct super_block *sb, struct treepath *p_s_path, int n_size) { @@ -1098,7 +1098,7 @@ static void init_tb_struct(struct reiserfs_transaction_handle *th, memset(p_s_tb, '\0', sizeof(struct tree_balance)); p_s_tb->transaction_handle = th; - p_s_tb->tb_sb = p_s_sb; + p_s_tb->tb_sb = sb; p_s_tb->tb_path = p_s_path; PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; @@ -1147,7 +1147,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath struct inode *p_s_inode, /* inode is here just to update i_blocks and quotas */ struct buffer_head *p_s_un_bh) { /* NULL or unformatted node pointer. 
*/ - struct super_block *p_s_sb = p_s_inode->i_sb; + struct super_block *sb = p_s_inode->i_sb; struct tree_balance s_del_balance; struct item_head s_ih; struct item_head *q_ih; @@ -1161,7 +1161,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath BUG_ON(!th->t_trans_id); - init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, + init_tb_struct(th, &s_del_balance, sb, p_s_path, 0 /*size is unknown */ ); while (1) { @@ -1185,15 +1185,15 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath if (n_ret_value != REPEAT_SEARCH) break; - PROC_INFO_INC(p_s_sb, delete_item_restarted); + PROC_INFO_INC(sb, delete_item_restarted); // file system changed, repeat search n_ret_value = - search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); + search_for_position_by_key(sb, p_s_item_key, p_s_path); if (n_ret_value == IO_ERROR) break; if (n_ret_value == FILE_NOT_FOUND) { - reiserfs_warning(p_s_sb, "vs-5340", + reiserfs_warning(sb, "vs-5340", "no items of the file %K found", p_s_item_key); break; @@ -1216,8 +1216,8 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath ** the unfm node once */ if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(q_ih)) { - if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) { - quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; + if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) { + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; } else { quota_cut_bytes = 0; } @@ -1258,7 +1258,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath do_balance(&s_del_balance, NULL, NULL, M_DELETE); #ifdef REISERQUOTA_DEBUG - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih)); #endif @@ -1430,8 +1430,8 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, const struct cpu_key *p_s_item_key, loff_t n_new_file_size, char *p_c_mode) { - struct super_block *p_s_sb = p_s_inode->i_sb; - int n_block_size = p_s_sb->s_blocksize; + struct super_block *sb = p_s_inode->i_sb; + int n_block_size = sb->s_blocksize; int cut_bytes; BUG_ON(!th->t_trans_id); BUG_ON(n_new_file_size != p_s_inode->i_size); @@ -1509,7 +1509,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, struct page *page, loff_t n_new_file_size) { - struct super_block *p_s_sb = p_s_inode->i_sb; + struct super_block *sb = p_s_inode->i_sb; /* Every function which is going to call do_balance must first create a tree_balance structure. Then it must fill up this structure by using the init_tb_struct and fix_nodes functions. @@ -1560,7 +1560,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, /* removing of last unformatted node will change value we have to return to truncate. Save it */ retval2 = n_ret_value; - /*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1)); */ + /*retval2 = sb->s_blocksize - (n_new_file_size & (sb->s_blocksize - 1)); */ /* So, we have performed the first part of the conversion: inserting the new direct item. 
Now we are removing the @@ -1569,16 +1569,16 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, set_cpu_key_k_type(p_s_item_key, TYPE_INDIRECT); p_s_item_key->key_length = 4; n_new_file_size -= - (n_new_file_size & (p_s_sb->s_blocksize - 1)); + (n_new_file_size & (sb->s_blocksize - 1)); tail_pos = n_new_file_size; set_cpu_key_k_offset(p_s_item_key, n_new_file_size + 1); if (search_for_position_by_key - (p_s_sb, p_s_item_key, + (sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND) { print_block(PATH_PLAST_BUFFER(p_s_path), 3, PATH_LAST_POSITION(p_s_path) - 1, PATH_LAST_POSITION(p_s_path) + 1); - reiserfs_panic(p_s_sb, "PAP-5580", "item to " + reiserfs_panic(sb, "PAP-5580", "item to " "convert does not exist (%K)", p_s_item_key); } @@ -1595,14 +1595,14 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, if (n_ret_value != REPEAT_SEARCH) break; - PROC_INFO_INC(p_s_sb, cut_from_item_restarted); + PROC_INFO_INC(sb, cut_from_item_restarted); n_ret_value = - search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); + search_for_position_by_key(sb, p_s_item_key, p_s_path); if (n_ret_value == POSITION_FOUND) continue; - reiserfs_warning(p_s_sb, "PAP-5610", "item %K not found", + reiserfs_warning(sb, "PAP-5610", "item %K not found", p_s_item_key); unfix_nodes(&s_cut_balance); return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT; @@ -1616,7 +1616,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, indirect_to_direct_roll_back(th, p_s_inode, p_s_path); } if (n_ret_value == NO_DISK_SPACE) - reiserfs_warning(p_s_sb, "reiserfs-5092", + reiserfs_warning(sb, "reiserfs-5092", "NO_DISK_SPACE"); unfix_nodes(&s_cut_balance); return -EIO; @@ -1642,11 +1642,11 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path); if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) { if (c_mode == M_DELETE && - (le_ih_k_offset(p_le_ih) & (p_s_sb->s_blocksize - 1)) == + (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) == 1) { // FIXME: this is to keep 3.5 happy REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX; - quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; } else { quota_cut_bytes = 0; } @@ -1659,18 +1659,18 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, sure, that we exactly remove last unformatted node pointer of the item */ if (!is_indirect_le_ih(le_ih)) - reiserfs_panic(p_s_sb, "vs-5652", + reiserfs_panic(sb, "vs-5652", "item must be indirect %h", le_ih); if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) - reiserfs_panic(p_s_sb, "vs-5653", "completing " + reiserfs_panic(sb, "vs-5653", "completing " "indirect2direct conversion indirect " "item %h being deleted must be of " "4 byte long", le_ih); if (c_mode == M_CUT && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { - reiserfs_panic(p_s_sb, "vs-5654", "can not complete " + reiserfs_panic(sb, "vs-5654", "can not complete " "indirect2direct conversion of %h " "(CUT, insert_size==%d)", le_ih, s_cut_balance.insert_size[0]); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index 0635cfe0f0b7..27311a5f0469 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -175,9 +175,9 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in loff_t n_new_file_size, /* New file size. 
*/ char *p_c_mode) { - struct super_block *p_s_sb = p_s_inode->i_sb; + struct super_block *sb = p_s_inode->i_sb; struct item_head s_ih; - unsigned long n_block_size = p_s_sb->s_blocksize; + unsigned long n_block_size = sb->s_blocksize; char *tail; int tail_len, round_tail_len; loff_t pos, pos1; /* position of first byte of the tail */ @@ -185,7 +185,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in BUG_ON(!th->t_trans_id); - REISERFS_SB(p_s_sb)->s_indirect2direct++; + REISERFS_SB(sb)->s_indirect2direct++; *p_c_mode = M_SKIP_BALANCING; @@ -200,7 +200,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in pos = le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - - 1) * p_s_sb->s_blocksize; + 1) * sb->s_blocksize; pos1 = pos; // we are protected by i_mutex. The tail can not disapper, not @@ -211,18 +211,18 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in if (path_changed(&s_ih, p_s_path)) { /* re-search indirect item */ - if (search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) + if (search_for_position_by_key(sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND) - reiserfs_panic(p_s_sb, "PAP-5520", + reiserfs_panic(sb, "PAP-5520", "item to be converted %K does not exist", p_s_item_key); copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); #ifdef CONFIG_REISERFS_CHECK pos = le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - - 1) * p_s_sb->s_blocksize; + 1) * sb->s_blocksize; if (pos != pos1) - reiserfs_panic(p_s_sb, "vs-5530", "tail position " + reiserfs_panic(sb, "vs-5530", "tail position " "changed while we were reading it"); #endif } diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index eb4e912e6bd3..9bd7800d989c 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1769,12 +1769,12 @@ int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr); int journal_transaction_should_end(struct reiserfs_transaction_handle *, int); -int reiserfs_in_journal(struct super_block *p_s_sb, unsigned int bmap_nr, - int bit_nr, int searchall, b_blocknr_t *next); +int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, + int bit_nr, int searchall, b_blocknr_t *next); int journal_begin(struct reiserfs_transaction_handle *, - struct super_block *p_s_sb, unsigned long); + struct super_block *sb, unsigned long); int journal_join_abort(struct reiserfs_transaction_handle *, - struct super_block *p_s_sb, unsigned long); + struct super_block *sb, unsigned long); void reiserfs_abort_journal(struct super_block *sb, int errno); void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); int reiserfs_allocate_list_bitmaps(struct super_block *s, @@ -1830,11 +1830,11 @@ static inline void copy_key(struct reiserfs_key *to, int comp_items(const struct item_head *stored_ih, const struct treepath *p_s_path); const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, - const struct super_block *p_s_sb); + const struct super_block *sb); int search_by_key(struct super_block *, const struct cpu_key *, struct treepath *, int); #define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL) -int search_for_position_by_key(struct super_block *p_s_sb, +int search_for_position_by_key(struct super_block *sb, const struct cpu_key *p_s_cpu_key, struct treepath *p_s_search_path); 
extern void decrement_bcount(struct buffer_head *p_s_bh); @@ -1978,7 +1978,7 @@ int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset, #define PROC_INFO_MAX( sb, field, value ) VOID_V #define PROC_INFO_INC( sb, field ) VOID_V #define PROC_INFO_ADD( sb, field, val ) VOID_V -#define PROC_INFO_BH_STAT( p_s_sb, p_s_bh, n_node_level ) VOID_V +#define PROC_INFO_BH_STAT(sb, p_s_bh, n_node_level) VOID_V #endif /* dir.c */ -- cgit v1.2.3-71-gd317 From ad31a4fc0386e8590c51ca4b8f1ae1d8b8b2ac5e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:46 -0400 Subject: reiserfs: rename p_s_bh to bh This patch is a simple s/p_s_bh/bh/g to the reiserfs code. This is the second in a series of patches to rip out some of the awful variable naming in reiserfs. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/fix_node.c | 94 +++++++++++++++++++++------------------------ fs/reiserfs/stree.c | 63 +++++++++++++++--------------- include/linux/reiserfs_fs.h | 35 +++++++++-------- 3 files changed, 93 insertions(+), 99 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 799c0ce24291..ad42c45af44f 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -1887,7 +1887,7 @@ static int check_balance(int mode, /* Check whether parent at the path is the really parent of the current node.*/ static int get_direct_parent(struct tree_balance *p_s_tb, int n_h) { - struct buffer_head *p_s_bh; + struct buffer_head *bh; struct treepath *p_s_path = p_s_tb->tb_path; int n_position, n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); @@ -1909,21 +1909,21 @@ static int get_direct_parent(struct tree_balance *p_s_tb, int n_h) } if (!B_IS_IN_TREE - (p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))) + (bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))) return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ if ((n_position = PATH_OFFSET_POSITION(p_s_path, - n_path_offset - 1)) > B_NR_ITEMS(p_s_bh)) + n_path_offset - 1)) > B_NR_ITEMS(bh)) return REPEAT_SEARCH; - if (B_N_CHILD_NUM(p_s_bh, n_position) != + if (B_N_CHILD_NUM(bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr) /* Parent in the path is not parent of the current node in the tree. */ return REPEAT_SEARCH; - if (buffer_locked(p_s_bh)) { - __wait_on_buffer(p_s_bh); + if (buffer_locked(bh)) { + __wait_on_buffer(bh); if (FILESYSTEM_CHANGED_TB(p_s_tb)) return REPEAT_SEARCH; } @@ -1943,29 +1943,29 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1); unsigned long n_son_number; struct super_block *sb = p_s_tb->tb_sb; - struct buffer_head *p_s_bh; + struct buffer_head *bh; PROC_INFO_INC(sb, get_neighbors[n_h]); if (p_s_tb->lnum[n_h]) { /* We need left neighbor to balance S[n_h]. */ PROC_INFO_INC(sb, need_l_neighbor[n_h]); - p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); - RFALSE(p_s_bh == p_s_tb->FL[n_h] && + RFALSE(bh == p_s_tb->FL[n_h] && !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset), "PAP-8270: invalid position in the parent"); n_child_position = - (p_s_bh == + (bh == p_s_tb->FL[n_h]) ? 
p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb-> FL[n_h]); n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position); - p_s_bh = sb_bread(sb, n_son_number); - if (!p_s_bh) + bh = sb_bread(sb, n_son_number); + if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(p_s_tb)) { - brelse(p_s_bh); + brelse(bh); PROC_INFO_INC(sb, get_neighbors_restart[n_h]); return REPEAT_SEARCH; } @@ -1973,48 +1973,48 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) || n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) || B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) != - p_s_bh->b_blocknr, "PAP-8275: invalid parent"); - RFALSE(!B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child"); + bh->b_blocknr, "PAP-8275: invalid parent"); + RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child"); RFALSE(!n_h && - B_FREE_SPACE(p_s_bh) != - MAX_CHILD_SIZE(p_s_bh) - + B_FREE_SPACE(bh) != + MAX_CHILD_SIZE(bh) - dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)), "PAP-8290: invalid child size of left neighbor"); brelse(p_s_tb->L[n_h]); - p_s_tb->L[n_h] = p_s_bh; + p_s_tb->L[n_h] = bh; } if (p_s_tb->rnum[n_h]) { /* We need right neighbor to balance S[n_path_offset]. */ PROC_INFO_INC(sb, need_r_neighbor[n_h]); - p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); - RFALSE(p_s_bh == p_s_tb->FR[n_h] && + RFALSE(bh == p_s_tb->FR[n_h] && PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= - B_NR_ITEMS(p_s_bh), + B_NR_ITEMS(bh), "PAP-8295: invalid position in the parent"); n_child_position = - (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0; + (bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0; n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position); - p_s_bh = sb_bread(sb, n_son_number); - if (!p_s_bh) + bh = sb_bread(sb, n_son_number); + if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(p_s_tb)) { - brelse(p_s_bh); + brelse(bh); PROC_INFO_INC(sb, get_neighbors_restart[n_h]); return REPEAT_SEARCH; } brelse(p_s_tb->R[n_h]); - p_s_tb->R[n_h] = p_s_bh; + p_s_tb->R[n_h] = bh; RFALSE(!n_h - && B_FREE_SPACE(p_s_bh) != - MAX_CHILD_SIZE(p_s_bh) - + && B_FREE_SPACE(bh) != + MAX_CHILD_SIZE(bh) - dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)), "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", - B_FREE_SPACE(p_s_bh), MAX_CHILD_SIZE(p_s_bh), + B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh), dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position))); } @@ -2090,51 +2090,45 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) #ifdef CONFIG_REISERFS_CHECK static void tb_buffer_sanity_check(struct super_block *sb, - struct buffer_head *p_s_bh, + struct buffer_head *bh, const char *descr, int level) { - if (p_s_bh) { - if (atomic_read(&(p_s_bh->b_count)) <= 0) { + if (bh) { + if (atomic_read(&(bh->b_count)) <= 0) reiserfs_panic(sb, "jmacd-1", "negative or zero " "reference counter for buffer %s[%d] " - "(%b)", descr, level, p_s_bh); - } + "(%b)", descr, level, bh); - if (!buffer_uptodate(p_s_bh)) { + if (!buffer_uptodate(bh)) reiserfs_panic(sb, "jmacd-2", "buffer is not up " "to date %s[%d] (%b)", - descr, level, p_s_bh); - } + descr, level, bh); - if (!B_IS_IN_TREE(p_s_bh)) { + if (!B_IS_IN_TREE(bh)) reiserfs_panic(sb, "jmacd-3", "buffer is not " "in tree %s[%d] (%b)", - descr, level, p_s_bh); - } + descr, level, bh); - if (p_s_bh->b_bdev != sb->s_bdev) { + if (bh->b_bdev != sb->s_bdev) reiserfs_panic(sb, "jmacd-4", "buffer has wrong " "device %s[%d] (%b)", - descr, level, p_s_bh); - } + descr, level, 
bh); - if (p_s_bh->b_size != sb->s_blocksize) { + if (bh->b_size != sb->s_blocksize) reiserfs_panic(sb, "jmacd-5", "buffer has wrong " "blocksize %s[%d] (%b)", - descr, level, p_s_bh); - } + descr, level, bh); - if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(sb)) { + if (bh->b_blocknr > SB_BLOCK_COUNT(sb)) reiserfs_panic(sb, "jmacd-6", "buffer block " "number too high %s[%d] (%b)", - descr, level, p_s_bh); - } + descr, level, bh); } } #else static void tb_buffer_sanity_check(struct super_block *sb, - struct buffer_head *p_s_bh, + struct buffer_head *bh, const char *descr, int level) {; } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 00fd879c4a2a..eb6856f6d323 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -56,13 +56,13 @@ #include /* Does the buffer contain a disk block which is in the tree. */ -inline int B_IS_IN_TREE(const struct buffer_head *p_s_bh) +inline int B_IS_IN_TREE(const struct buffer_head *bh) { - RFALSE(B_LEVEL(p_s_bh) > MAX_HEIGHT, - "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh); + RFALSE(B_LEVEL(bh) > MAX_HEIGHT, + "PAP-1010: block (%b) has too big level (%z)", bh, bh); - return (B_LEVEL(p_s_bh) != FREE_LEVEL); + return (B_LEVEL(bh) != FREE_LEVEL); } // @@ -579,7 +579,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key { b_blocknr_t n_block_number; int expected_level; - struct buffer_head *p_s_bh; + struct buffer_head *bh; struct path_element *p_s_last_element; int n_node_level, n_retval; int right_neighbor_of_leaf_node; @@ -626,15 +626,14 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key /* Read the next tree node, and set the last element in the path to have a pointer to it. */ - if ((p_s_bh = p_s_last_element->pe_buffer = + if ((bh = p_s_last_element->pe_buffer = sb_getblk(sb, n_block_number))) { - if (!buffer_uptodate(p_s_bh) && reada_count > 1) { + if (!buffer_uptodate(bh) && reada_count > 1) search_by_key_reada(sb, reada_bh, reada_blocks, reada_count); - } - ll_rw_block(READ, 1, &p_s_bh); - wait_on_buffer(p_s_bh); - if (!buffer_uptodate(p_s_bh)) + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) goto io_error; } else { io_error: @@ -651,8 +650,8 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key to search is still in the tree rooted from the current buffer. If not then repeat search from the root. */ if (fs_changed(fs_gen, sb) && - (!B_IS_IN_TREE(p_s_bh) || - B_LEVEL(p_s_bh) != expected_level || + (!B_IS_IN_TREE(bh) || + B_LEVEL(bh) != expected_level || !key_in_buffer(p_s_search_path, p_s_key, sb))) { PROC_INFO_INC(sb, search_by_key_fs_changed); PROC_INFO_INC(sb, search_by_key_restarted); @@ -686,25 +685,25 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key // make sure, that the node contents look like a node of // certain level - if (!is_tree_node(p_s_bh, expected_level)) { + if (!is_tree_node(bh, expected_level)) { reiserfs_error(sb, "vs-5150", "invalid format found in block %ld. 
" - "Fsck?", p_s_bh->b_blocknr); + "Fsck?", bh->b_blocknr); pathrelse(p_s_search_path); return IO_ERROR; } /* ok, we have acquired next formatted node in the tree */ - n_node_level = B_LEVEL(p_s_bh); + n_node_level = B_LEVEL(bh); - PROC_INFO_BH_STAT(sb, p_s_bh, n_node_level - 1); + PROC_INFO_BH_STAT(sb, bh, n_node_level - 1); RFALSE(n_node_level < n_stop_level, "vs-5152: tree level (%d) is less than stop level (%d)", n_node_level, n_stop_level); - n_retval = bin_search(p_s_key, B_N_PITEM_HEAD(p_s_bh, 0), - B_NR_ITEMS(p_s_bh), + n_retval = bin_search(p_s_key, B_N_PITEM_HEAD(bh, 0), + B_NR_ITEMS(bh), (n_node_level == DISK_LEAF_NODE_LEVEL) ? IH_SIZE : KEY_SIZE, @@ -726,13 +725,13 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key an internal node. Now we calculate child block number by position in the node. */ n_block_number = - B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position); + B_N_CHILD_NUM(bh, p_s_last_element->pe_position); /* if we are going to read leaf nodes, try for read ahead as well */ if ((p_s_search_path->reada & PATH_READA) && n_node_level == DISK_LEAF_NODE_LEVEL + 1) { int pos = p_s_last_element->pe_position; - int limit = B_NR_ITEMS(p_s_bh); + int limit = B_NR_ITEMS(bh); struct reiserfs_key *le_key; if (p_s_search_path->reada & PATH_READA_BACK) @@ -741,7 +740,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key if (pos == limit) break; reada_blocks[reada_count++] = - B_N_CHILD_NUM(p_s_bh, pos); + B_N_CHILD_NUM(bh, pos); if (p_s_search_path->reada & PATH_READA_BACK) pos--; else @@ -750,7 +749,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key /* * check to make sure we're in the same object */ - le_key = B_N_PDELIM_KEY(p_s_bh, pos); + le_key = B_N_PDELIM_KEY(bh, pos); if (le32_to_cpu(le_key->k_objectid) != p_s_key->on_disk_key.k_objectid) { break; @@ -851,15 +850,15 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b /* Compare given item and item pointed to by the path. */ int comp_items(const struct item_head *stored_ih, const struct treepath *p_s_path) { - struct buffer_head *p_s_bh; + struct buffer_head *bh = PATH_PLAST_BUFFER(p_s_path); struct item_head *ih; /* Last buffer at the path is not in the tree. */ - if (!B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path))) + if (!B_IS_IN_TREE(bh)) return 1; /* Last path position is invalid. 
*/ - if (PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh)) + if (PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(bh)) return 1; /* we need only to know, whether it is the same item */ @@ -959,7 +958,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st { struct super_block *sb = inode->i_sb; struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_path); - struct buffer_head *p_s_bh = PATH_PLAST_BUFFER(p_s_path); + struct buffer_head *bh = PATH_PLAST_BUFFER(p_s_path); BUG_ON(!th->t_trans_id); @@ -1003,7 +1002,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st do { need_re_search = 0; *p_n_cut_size = 0; - p_s_bh = PATH_PLAST_BUFFER(p_s_path); + bh = PATH_PLAST_BUFFER(p_s_path); copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); pos = I_UNFM_NUM(&s_ih); @@ -1019,13 +1018,13 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st break; } - unfm = (__le32 *)B_I_PITEM(p_s_bh, &s_ih) + pos - 1; + unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1; block = get_block_num(unfm, 0); if (block != 0) { - reiserfs_prepare_for_journal(sb, p_s_bh, 1); + reiserfs_prepare_for_journal(sb, bh, 1); put_block_num(unfm, 0, 0); - journal_mark_dirty (th, sb, p_s_bh); + journal_mark_dirty(th, sb, bh); reiserfs_free_block(th, inode, block, 1); } @@ -1049,7 +1048,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st /* a trick. If the buffer has been logged, this will do nothing. If ** we've broken the loop without logging it, it will restore the ** buffer */ - reiserfs_restore_prepared_buffer(sb, p_s_bh); + reiserfs_restore_prepared_buffer(sb, bh); } while (need_re_search && search_for_position_by_key(sb, p_s_item_key, p_s_path) == POSITION_FOUND); pos_in_item(p_s_path) = pos * UNFM_P_SIZE; diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 9bd7800d989c..9cfa518c90b6 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -751,25 +751,25 @@ struct block_head { #define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */ /* Given the buffer head of a formatted node, resolve to the block head of that node. */ -#define B_BLK_HEAD(p_s_bh) ((struct block_head *)((p_s_bh)->b_data)) +#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data)) /* Number of items that are in buffer. */ -#define B_NR_ITEMS(p_s_bh) (blkh_nr_item(B_BLK_HEAD(p_s_bh))) -#define B_LEVEL(p_s_bh) (blkh_level(B_BLK_HEAD(p_s_bh))) -#define B_FREE_SPACE(p_s_bh) (blkh_free_space(B_BLK_HEAD(p_s_bh))) +#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh))) +#define B_LEVEL(bh) (blkh_level(B_BLK_HEAD(bh))) +#define B_FREE_SPACE(bh) (blkh_free_space(B_BLK_HEAD(bh))) -#define PUT_B_NR_ITEMS(p_s_bh,val) do { set_blkh_nr_item(B_BLK_HEAD(p_s_bh),val); } while (0) -#define PUT_B_LEVEL(p_s_bh,val) do { set_blkh_level(B_BLK_HEAD(p_s_bh),val); } while (0) -#define PUT_B_FREE_SPACE(p_s_bh,val) do { set_blkh_free_space(B_BLK_HEAD(p_s_bh),val); } while (0) +#define PUT_B_NR_ITEMS(bh, val) do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0) +#define PUT_B_LEVEL(bh, val) do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0) +#define PUT_B_FREE_SPACE(bh, val) do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0) /* Get right delimiting key. -- little endian */ -#define B_PRIGHT_DELIM_KEY(p_s_bh) (&(blk_right_delim_key(B_BLK_HEAD(p_s_bh)))) +#define B_PRIGHT_DELIM_KEY(bh) (&(blk_right_delim_key(B_BLK_HEAD(bh)))) /* Does the buffer contain a disk leaf. 
*/ -#define B_IS_ITEMS_LEVEL(p_s_bh) (B_LEVEL(p_s_bh) == DISK_LEAF_NODE_LEVEL) +#define B_IS_ITEMS_LEVEL(bh) (B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL) /* Does the buffer contain a disk internal node */ -#define B_IS_KEYS_LEVEL(p_s_bh) (B_LEVEL(p_s_bh) > DISK_LEAF_NODE_LEVEL \ - && B_LEVEL(p_s_bh) <= MAX_HEIGHT) +#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \ + && B_LEVEL(bh) <= MAX_HEIGHT) /***************************************************************************/ /* STAT DATA */ @@ -1119,12 +1119,13 @@ struct disk_child { #define put_dc_size(dc_p, val) do { (dc_p)->dc_size = cpu_to_le16(val); } while(0) /* Get disk child by buffer header and position in the tree node. */ -#define B_N_CHILD(p_s_bh,n_pos) ((struct disk_child *)\ -((p_s_bh)->b_data+BLKH_SIZE+B_NR_ITEMS(p_s_bh)*KEY_SIZE+DC_SIZE*(n_pos))) +#define B_N_CHILD(bh, n_pos) ((struct disk_child *)\ +((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos))) /* Get disk child number by buffer header and position in the tree node. */ -#define B_N_CHILD_NUM(p_s_bh,n_pos) (dc_block_number(B_N_CHILD(p_s_bh,n_pos))) -#define PUT_B_N_CHILD_NUM(p_s_bh,n_pos, val) (put_dc_block_number(B_N_CHILD(p_s_bh,n_pos), val )) +#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos))) +#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \ + (put_dc_block_number(B_N_CHILD(bh, n_pos), val)) /* maximal value of field child_size in structure disk_child */ /* child size is the combined size of all items and their headers */ @@ -1837,7 +1838,7 @@ int search_by_key(struct super_block *, const struct cpu_key *, int search_for_position_by_key(struct super_block *sb, const struct cpu_key *p_s_cpu_key, struct treepath *p_s_search_path); -extern void decrement_bcount(struct buffer_head *p_s_bh); +extern void decrement_bcount(struct buffer_head *bh); void decrement_counters_in_path(struct treepath *p_s_search_path); void pathrelse(struct treepath *p_s_search_path); int reiserfs_check_path(struct treepath *p); @@ -1978,7 +1979,7 @@ int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset, #define PROC_INFO_MAX( sb, field, value ) VOID_V #define PROC_INFO_INC( sb, field ) VOID_V #define PROC_INFO_ADD( sb, field, val ) VOID_V -#define PROC_INFO_BH_STAT(sb, p_s_bh, n_node_level) VOID_V +#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V #endif /* dir.c */ -- cgit v1.2.3-71-gd317 From 995c762ea486b48c9777522071fbf132dea96807 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:47 -0400 Subject: reiserfs: rename p_s_inode to inode This patch is a simple s/p_s_inode/inode/g to the reiserfs code. This is the third in a series of patches to rip out some of the awful variable naming in reiserfs. 
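[Editor's note, for illustration only and not part of the patch: the rename is purely mechanical, dropping the Hungarian-style "pointer to struct" prefix in favour of conventional kernel naming. A minimal sketch using a hypothetical helper, not taken from the diff below:]

	/* before (hypothetical helper): p_s_ prefix encodes "pointer to struct" */
	static unsigned long old_style(struct inode *p_s_inode)
	{
		return p_s_inode->i_sb->s_blocksize;
	}

	/* after: plain name, identical behaviour */
	static unsigned long new_style(struct inode *inode)
	{
		return inode->i_sb->s_blocksize;
	}
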
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 16 +++---- fs/reiserfs/inode.c | 43 +++++++++--------- fs/reiserfs/stree.c | 103 ++++++++++++++++++++++-------------------- fs/reiserfs/tail_conversion.c | 18 ++++---- include/linux/reiserfs_fs.h | 4 +- 5 files changed, 95 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index f0160ee03e17..a73579f66214 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -137,17 +137,17 @@ static void reiserfs_vfs_truncate_file(struct inode *inode) static int reiserfs_sync_file(struct file *p_s_filp, struct dentry *p_s_dentry, int datasync) { - struct inode *p_s_inode = p_s_dentry->d_inode; + struct inode *inode = p_s_dentry->d_inode; int n_err; int barrier_done; - BUG_ON(!S_ISREG(p_s_inode->i_mode)); - n_err = sync_mapping_buffers(p_s_inode->i_mapping); - reiserfs_write_lock(p_s_inode->i_sb); - barrier_done = reiserfs_commit_for_inode(p_s_inode); - reiserfs_write_unlock(p_s_inode->i_sb); - if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb)) - blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); + BUG_ON(!S_ISREG(inode->i_mode)); + n_err = sync_mapping_buffers(inode->i_mapping); + reiserfs_write_lock(inode->i_sb); + barrier_done = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); if (barrier_done < 0) return barrier_done; return (n_err < 0) ? -EIO : 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d106edaef64f..b090d2dd2a8e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1987,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, ** ** on failure, nonzero is returned, page_result and bh_result are untouched. 
*/ -static int grab_tail_page(struct inode *p_s_inode, +static int grab_tail_page(struct inode *inode, struct page **page_result, struct buffer_head **bh_result) { @@ -1995,11 +1995,11 @@ static int grab_tail_page(struct inode *p_s_inode, /* we want the page with the last byte in the file, ** not the page that will hold the next byte for appending */ - unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; unsigned long pos = 0; unsigned long start = 0; - unsigned long blocksize = p_s_inode->i_sb->s_blocksize; - unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1); + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1); struct buffer_head *bh; struct buffer_head *head; struct page *page; @@ -2013,7 +2013,7 @@ static int grab_tail_page(struct inode *p_s_inode, if ((offset & (blocksize - 1)) == 0) { return -ENOENT; } - page = grab_cache_page(p_s_inode->i_mapping, index); + page = grab_cache_page(inode->i_mapping, index); error = -ENOMEM; if (!page) { goto out; @@ -2042,7 +2042,7 @@ static int grab_tail_page(struct inode *p_s_inode, ** I've screwed up the code to find the buffer, or the code to ** call prepare_write */ - reiserfs_error(p_s_inode->i_sb, "clm-6000", + reiserfs_error(inode->i_sb, "clm-6000", "error reading block %lu", bh->b_blocknr); error = -EIO; goto unlock; @@ -2065,27 +2065,28 @@ static int grab_tail_page(struct inode *p_s_inode, ** ** some code taken from block_truncate_page */ -int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) +int reiserfs_truncate_file(struct inode *inode, int update_timestamps) { struct reiserfs_transaction_handle th; /* we want the offset for the first byte after the end of the file */ - unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1); - unsigned blocksize = p_s_inode->i_sb->s_blocksize; + unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + unsigned blocksize = inode->i_sb->s_blocksize; unsigned length; struct page *page = NULL; int error; struct buffer_head *bh = NULL; int err2; - reiserfs_write_lock(p_s_inode->i_sb); + reiserfs_write_lock(inode->i_sb); - if (p_s_inode->i_size > 0) { - if ((error = grab_tail_page(p_s_inode, &page, &bh))) { + if (inode->i_size > 0) { + error = grab_tail_page(inode, &page, &bh); + if (error) { // -ENOENT means we truncated past the end of the file, // and get_block_create_0 could not find a block to read in, // which is ok. if (error != -ENOENT) - reiserfs_error(p_s_inode->i_sb, "clm-6001", + reiserfs_error(inode->i_sb, "clm-6001", "grab_tail_page failed %d", error); page = NULL; @@ -2103,19 +2104,19 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) /* it is enough to reserve space in transaction for 2 balancings: one for "save" link adding and another for the first cut_from_item. 
1 is for update_sd */ - error = journal_begin(&th, p_s_inode->i_sb, + error = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); if (error) goto out; - reiserfs_update_inode_transaction(p_s_inode); + reiserfs_update_inode_transaction(inode); if (update_timestamps) /* we are doing real truncate: if the system crashes before the last transaction of truncating gets committed - on reboot the file either appears truncated properly or not truncated at all */ - add_save_link(&th, p_s_inode, 1); - err2 = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps); + add_save_link(&th, inode, 1); + err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps); error = - journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); + journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); if (error) goto out; @@ -2126,7 +2127,7 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) } if (update_timestamps) { - error = remove_save_link(p_s_inode, 1 /* truncate */ ); + error = remove_save_link(inode, 1 /* truncate */); if (error) goto out; } @@ -2145,14 +2146,14 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) page_cache_release(page); } - reiserfs_write_unlock(p_s_inode->i_sb); + reiserfs_write_unlock(inode->i_sb); return 0; out: if (page) { unlock_page(page); page_cache_release(page); } - reiserfs_write_unlock(p_s_inode->i_sb); + reiserfs_write_unlock(inode->i_sb); return error; } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index eb6856f6d323..8f220fb777d7 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1143,10 +1143,11 @@ char head2type(struct item_head *ih) /* Delete object item. */ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path, /* Path to the deleted item. */ const struct cpu_key *p_s_item_key, /* Key to search for the deleted item. */ - struct inode *p_s_inode, /* inode is here just to update i_blocks and quotas */ + struct inode *inode, /* inode is here just to update + * i_blocks and quotas */ struct buffer_head *p_s_un_bh) { /* NULL or unformatted node pointer. 
*/ - struct super_block *sb = p_s_inode->i_sb; + struct super_block *sb = inode->i_sb; struct tree_balance s_del_balance; struct item_head s_ih; struct item_head *q_ih; @@ -1170,10 +1171,10 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath n_iter++; c_mode = #endif - prepare_for_delete_or_cut(th, p_s_inode, p_s_path, + prepare_for_delete_or_cut(th, inode, p_s_path, p_s_item_key, &n_removed, &n_del_size, - max_reiserfs_offset(p_s_inode)); + max_reiserfs_offset(inode)); RFALSE(c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); @@ -1214,7 +1215,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath ** split into multiple items, and we only want to decrement for ** the unfm node once */ - if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(q_ih)) { + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) { if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) { quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; } else { @@ -1259,9 +1260,9 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath #ifdef REISERQUOTA_DEBUG reiserfs_debug(sb, REISERFS_DEBUG_CODE, "reiserquota delete_item(): freeing %u, id=%u type=%c", - quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih)); + quota_cut_bytes, inode->i_uid, head2type(&s_ih)); #endif - DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); + DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes); /* Return deleted body length */ return n_ret_value; @@ -1423,25 +1424,25 @@ static void unmap_buffers(struct page *page, loff_t pos) } static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, - struct inode *p_s_inode, + struct inode *inode, struct page *page, struct treepath *p_s_path, const struct cpu_key *p_s_item_key, loff_t n_new_file_size, char *p_c_mode) { - struct super_block *sb = p_s_inode->i_sb; + struct super_block *sb = inode->i_sb; int n_block_size = sb->s_blocksize; int cut_bytes; BUG_ON(!th->t_trans_id); - BUG_ON(n_new_file_size != p_s_inode->i_size); + BUG_ON(n_new_file_size != inode->i_size); /* the page being sent in could be NULL if there was an i/o error ** reading in the last block. The user will hit problems trying to ** read the file, but for now we just skip the indirect2direct */ - if (atomic_read(&p_s_inode->i_count) > 1 || - !tail_has_to_be_packed(p_s_inode) || - !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) { + if (atomic_read(&inode->i_count) > 1 || + !tail_has_to_be_packed(inode) || + !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) { /* leave tail in an unformatted node */ *p_c_mode = M_SKIP_BALANCING; cut_bytes = @@ -1450,8 +1451,9 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, return cut_bytes; } /* Permorm the conversion to a direct_item. 
*/ - /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); */ - return indirect2direct(th, p_s_inode, page, p_s_path, p_s_item_key, + /* return indirect_to_direct(inode, p_s_path, p_s_item_key, + n_new_file_size, p_c_mode); */ + return indirect2direct(th, inode, page, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); } @@ -1505,10 +1507,10 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path, struct cpu_key *p_s_item_key, - struct inode *p_s_inode, + struct inode *inode, struct page *page, loff_t n_new_file_size) { - struct super_block *sb = p_s_inode->i_sb; + struct super_block *sb = inode->i_sb; /* Every function which is going to call do_balance must first create a tree_balance structure. Then it must fill up this structure by using the init_tb_struct and fix_nodes functions. @@ -1525,7 +1527,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, + init_tb_struct(th, &s_cut_balance, inode->i_sb, p_s_path, n_cut_size); /* Repeat this loop until we either cut the item without needing @@ -1537,7 +1539,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, pointers. */ c_mode = - prepare_for_delete_or_cut(th, p_s_inode, p_s_path, + prepare_for_delete_or_cut(th, inode, p_s_path, p_s_item_key, &n_removed, &n_cut_size, n_new_file_size); if (c_mode == M_CONVERT) { @@ -1547,7 +1549,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "PAP-5570: can not convert twice"); n_ret_value = - maybe_indirect_to_direct(th, p_s_inode, page, + maybe_indirect_to_direct(th, inode, page, p_s_path, p_s_item_key, n_new_file_size, &c_mode); if (c_mode == M_SKIP_BALANCING) @@ -1612,7 +1614,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, if (n_is_inode_locked) { // FIXME: this seems to be not needed: we are always able // to cut item - indirect_to_direct_roll_back(th, p_s_inode, p_s_path); + indirect_to_direct_roll_back(th, inode, p_s_path); } if (n_ret_value == NO_DISK_SPACE) reiserfs_warning(sb, "reiserfs-5092", @@ -1639,12 +1641,12 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, ** item. 
*/ p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path); - if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) { + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) { if (c_mode == M_DELETE && (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) == 1) { // FIXME: this is to keep 3.5 happy - REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX; + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; } else { quota_cut_bytes = 0; @@ -1687,14 +1689,14 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, ** unmap and invalidate it */ unmap_buffers(page, tail_pos); - REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask; + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; } #ifdef REISERQUOTA_DEBUG - reiserfs_debug(p_s_inode->i_sb, REISERFS_DEBUG_CODE, + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota cut_from_item(): freeing %u id=%u type=%c", - quota_cut_bytes, p_s_inode->i_uid, '?'); + quota_cut_bytes, inode->i_uid, '?'); #endif - DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); + DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes); return n_ret_value; } @@ -1715,8 +1717,8 @@ static void truncate_directory(struct reiserfs_transaction_handle *th, /* Truncate file to the new size. Note, this must be called with a transaction already started */ -int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, /* ->i_size contains new - size */ +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, + struct inode *inode, /* ->i_size contains new size */ struct page *page, /* up to date for last block */ int update_timestamps /* when it is called by file_release to convert @@ -1735,35 +1737,35 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p BUG_ON(!th->t_trans_id); if (! - (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) - || S_ISLNK(p_s_inode->i_mode))) + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) return 0; - if (S_ISDIR(p_s_inode->i_mode)) { + if (S_ISDIR(inode->i_mode)) { // deletion of directory - no need to update timestamps - truncate_directory(th, p_s_inode); + truncate_directory(th, inode); return 0; } /* Get new file size. 
*/ - n_new_file_size = p_s_inode->i_size; + n_new_file_size = inode->i_size; // FIXME: note, that key type is unimportant here - make_cpu_key(&s_item_key, p_s_inode, max_reiserfs_offset(p_s_inode), + make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, 3); retval = - search_for_position_by_key(p_s_inode->i_sb, &s_item_key, + search_for_position_by_key(inode->i_sb, &s_item_key, &s_search_path); if (retval == IO_ERROR) { - reiserfs_error(p_s_inode->i_sb, "vs-5657", + reiserfs_error(inode->i_sb, "vs-5657", "i/o failure occurred trying to truncate %K", &s_item_key); err = -EIO; goto out; } if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { - reiserfs_error(p_s_inode->i_sb, "PAP-5660", + reiserfs_error(inode->i_sb, "PAP-5660", "wrong result %d of search for %K", retval, &s_item_key); @@ -1780,7 +1782,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p else { loff_t offset = le_ih_k_offset(p_le_ih); int bytes = - op_bytes_number(p_le_ih, p_s_inode->i_sb->s_blocksize); + op_bytes_number(p_le_ih, inode->i_sb->s_blocksize); /* this may mismatch with real file size: if last direct item had no padding zeros and last unformatted node had no free @@ -1805,9 +1807,9 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p /* Cut or delete file item. */ n_deleted = reiserfs_cut_from_item(th, &s_search_path, &s_item_key, - p_s_inode, page, n_new_file_size); + inode, page, n_new_file_size); if (n_deleted < 0) { - reiserfs_warning(p_s_inode->i_sb, "vs-5665", + reiserfs_warning(inode->i_sb, "vs-5665", "reiserfs_cut_from_item failed"); reiserfs_check_path(&s_search_path); return 0; @@ -1837,22 +1839,22 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p pathrelse(&s_search_path); if (update_timestamps) { - p_s_inode->i_mtime = p_s_inode->i_ctime = - CURRENT_TIME_SEC; + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; } - reiserfs_update_sd(th, p_s_inode); + reiserfs_update_sd(th, inode); - err = journal_end(th, p_s_inode->i_sb, orig_len_alloc); + err = journal_end(th, inode->i_sb, orig_len_alloc); if (err) goto out; - err = journal_begin(th, p_s_inode->i_sb, + err = journal_begin(th, inode->i_sb, JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ; if (err) goto out; - reiserfs_update_inode_transaction(p_s_inode); + reiserfs_update_inode_transaction(inode); } } while (n_file_size > ROUND_UP(n_new_file_size) && - search_for_position_by_key(p_s_inode->i_sb, &s_item_key, + search_for_position_by_key(inode->i_sb, &s_item_key, &s_search_path) == POSITION_FOUND); RFALSE(n_file_size > ROUND_UP(n_new_file_size), @@ -1862,9 +1864,10 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p update_and_out: if (update_timestamps) { // this is truncate, not file closing - p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; } - reiserfs_update_sd(th, p_s_inode); + reiserfs_update_sd(th, inode); out: pathrelse(&s_search_path); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index 27311a5f0469..5c5ee0d0d6a8 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -170,12 +170,14 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) what we expect from it (number of cut bytes). 
But when tail remains in the unformatted node, we set mode to SKIP_BALANCING and unlock inode */ -int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, struct page *page, struct treepath *p_s_path, /* path to the indirect item. */ +int indirect2direct(struct reiserfs_transaction_handle *th, + struct inode *inode, struct page *page, + struct treepath *p_s_path, /* path to the indirect item. */ const struct cpu_key *p_s_item_key, /* Key to look for unformatted node pointer to be cut. */ loff_t n_new_file_size, /* New file size. */ char *p_c_mode) { - struct super_block *sb = p_s_inode->i_sb; + struct super_block *sb = inode->i_sb; struct item_head s_ih; unsigned long n_block_size = sb->s_blocksize; char *tail; @@ -193,7 +195,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); tail_len = (n_new_file_size & (n_block_size - 1)); - if (get_inode_sd_version(p_s_inode) == STAT_DATA_V2) + if (get_inode_sd_version(inode) == STAT_DATA_V2) round_tail_len = ROUND_UP(tail_len); else round_tail_len = tail_len; @@ -228,7 +230,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in } /* Set direct item header to insert. */ - make_le_item_head(&s_ih, NULL, get_inode_item_key_version(p_s_inode), + make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode), pos1 + 1, TYPE_DIRECT, round_tail_len, 0xffff /*ih_free_space */ ); @@ -244,7 +246,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in set_cpu_key_k_type(&key, TYPE_DIRECT); key.key_length = 4; /* Insert tail as new direct item in the tree */ - if (reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode, + if (reiserfs_insert_item(th, p_s_path, &key, &s_ih, inode, tail ? tail : NULL) < 0) { /* No disk memory. So we can not convert last unformatted node to the direct item. 
In this case we used to adjust @@ -258,7 +260,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in kunmap(page); /* make sure to get the i_blocks changes from reiserfs_insert_item */ - reiserfs_update_sd(th, p_s_inode); + reiserfs_update_sd(th, inode); // note: we have now the same as in above direct2indirect // conversion: there are two keys which have matching first three @@ -269,8 +271,8 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in *p_c_mode = M_CUT; /* we store position of first direct item in the in-core inode */ - //mark_file_with_tail (p_s_inode, pos1 + 1); - REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1; + /* mark_file_with_tail (inode, pos1 + 1); */ + REISERFS_I(inode)->i_first_direct_byte = pos1 + 1; return n_block_size - round_tail_len; } diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 9cfa518c90b6..3192dc793226 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1870,9 +1870,9 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, struct inode *inode, struct reiserfs_key *key); int reiserfs_delete_object(struct reiserfs_transaction_handle *th, - struct inode *p_s_inode); + struct inode *inode); int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, - struct inode *p_s_inode, struct page *, + struct inode *inode, struct page *, int update_timestamps); #define i_block_size(inode) ((inode)->i_sb->s_blocksize) -- cgit v1.2.3-71-gd317 From a063ae17925cafabe55ebe1957ca0e8c480bd132 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:48 -0400 Subject: reiserfs: rename p_s_tb to tb This patch is a simple s/p_s_tb/tb/g to the reiserfs code. This is the fourth in a series of patches to rip out some of the awful variable naming in reiserfs. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/fix_node.c | 482 ++++++++++++++++++++++---------------------- fs/reiserfs/stree.c | 21 +- include/linux/reiserfs_fs.h | 2 +- 3 files changed, 254 insertions(+), 251 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index ad42c45af44f..5236a8829e31 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -749,26 +749,26 @@ else \ -1, -1);\ } -static void free_buffers_in_tb(struct tree_balance *p_s_tb) +static void free_buffers_in_tb(struct tree_balance *tb) { int n_counter; - pathrelse(p_s_tb->tb_path); + pathrelse(tb->tb_path); for (n_counter = 0; n_counter < MAX_HEIGHT; n_counter++) { - brelse(p_s_tb->L[n_counter]); - brelse(p_s_tb->R[n_counter]); - brelse(p_s_tb->FL[n_counter]); - brelse(p_s_tb->FR[n_counter]); - brelse(p_s_tb->CFL[n_counter]); - brelse(p_s_tb->CFR[n_counter]); - - p_s_tb->L[n_counter] = NULL; - p_s_tb->R[n_counter] = NULL; - p_s_tb->FL[n_counter] = NULL; - p_s_tb->FR[n_counter] = NULL; - p_s_tb->CFL[n_counter] = NULL; - p_s_tb->CFR[n_counter] = NULL; + brelse(tb->L[n_counter]); + brelse(tb->R[n_counter]); + brelse(tb->FL[n_counter]); + brelse(tb->FR[n_counter]); + brelse(tb->CFL[n_counter]); + brelse(tb->CFR[n_counter]); + + tb->L[n_counter] = NULL; + tb->R[n_counter] = NULL; + tb->FL[n_counter] = NULL; + tb->FR[n_counter] = NULL; + tb->CFL[n_counter] = NULL; + tb->CFR[n_counter] = NULL; } } @@ -778,14 +778,14 @@ static void free_buffers_in_tb(struct tree_balance *p_s_tb) * NO_DISK_SPACE - no disk space. */ /* The function is NOT SCHEDULE-SAFE! 
*/ -static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h) +static int get_empty_nodes(struct tree_balance *tb, int n_h) { struct buffer_head *p_s_new_bh, - *p_s_Sh = PATH_H_PBUFFER(p_s_tb->tb_path, n_h); + *p_s_Sh = PATH_H_PBUFFER(tb->tb_path, n_h); b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; int n_counter, n_number_of_freeblk, n_amount_needed, /* number of needed empty blocks */ n_retval = CARRY_ON; - struct super_block *sb = p_s_tb->tb_sb; + struct super_block *sb = tb->tb_sb; /* number_of_freeblk is the number of empty blocks which have been acquired for use by the balancing algorithm minus the number of @@ -803,15 +803,15 @@ static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h) the analysis or 0 if not restarted, then subtract the amount needed by all of the levels of the tree below n_h. */ /* blknum includes S[n_h], so we subtract 1 in this calculation */ - for (n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; + for (n_counter = 0, n_number_of_freeblk = tb->cur_blknum; n_counter < n_h; n_counter++) n_number_of_freeblk -= - (p_s_tb->blknum[n_counter]) ? (p_s_tb->blknum[n_counter] - + (tb->blknum[n_counter]) ? (tb->blknum[n_counter] - 1) : 0; /* Allocate missing empty blocks. */ /* if p_s_Sh == 0 then we are getting a new root */ - n_amount_needed = (p_s_Sh) ? (p_s_tb->blknum[n_h] - 1) : 1; + n_amount_needed = (p_s_Sh) ? (tb->blknum[n_h] - 1) : 1; /* Amount_needed = the amount that we need more than the amount that we have. */ if (n_amount_needed > n_number_of_freeblk) n_amount_needed -= n_number_of_freeblk; @@ -819,7 +819,7 @@ static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h) return CARRY_ON; /* No need to check quota - is not allocated for blocks used for formatted nodes */ - if (reiserfs_new_form_blocknrs(p_s_tb, a_n_blocknrs, + if (reiserfs_new_form_blocknrs(tb, a_n_blocknrs, n_amount_needed) == NO_DISK_SPACE) return NO_DISK_SPACE; @@ -838,14 +838,14 @@ static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h) p_s_new_bh); /* Put empty buffers into the array. */ - RFALSE(p_s_tb->FEB[p_s_tb->cur_blknum], + RFALSE(tb->FEB[tb->cur_blknum], "PAP-8141: busy slot for new buffer"); set_buffer_journal_new(p_s_new_bh); - p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh; + tb->FEB[tb->cur_blknum++] = p_s_new_bh; } - if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(p_s_tb)) + if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb)) n_retval = REPEAT_SEARCH; return n_retval; @@ -896,33 +896,34 @@ static int get_rfree(struct tree_balance *tb, int h) } /* Check whether left neighbor is in memory. */ -static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h) +static int is_left_neighbor_in_cache(struct tree_balance *tb, int n_h) { struct buffer_head *p_s_father, *left; - struct super_block *sb = p_s_tb->tb_sb; + struct super_block *sb = tb->tb_sb; b_blocknr_t n_left_neighbor_blocknr; int n_left_neighbor_position; - if (!p_s_tb->FL[n_h]) /* Father of the left neighbor does not exist. */ + /* Father of the left neighbor does not exist. */ + if (!tb->FL[n_h]) return 0; /* Calculate father of the node to be balanced. 
*/ - p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1); + p_s_father = PATH_H_PBUFFER(tb->tb_path, n_h + 1); RFALSE(!p_s_father || !B_IS_IN_TREE(p_s_father) || - !B_IS_IN_TREE(p_s_tb->FL[n_h]) || + !B_IS_IN_TREE(tb->FL[n_h]) || !buffer_uptodate(p_s_father) || - !buffer_uptodate(p_s_tb->FL[n_h]), + !buffer_uptodate(tb->FL[n_h]), "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", - p_s_father, p_s_tb->FL[n_h]); + p_s_father, tb->FL[n_h]); /* Get position of the pointer to the left neighbor into the left father. */ - n_left_neighbor_position = (p_s_father == p_s_tb->FL[n_h]) ? - p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->FL[n_h]); + n_left_neighbor_position = (p_s_father == tb->FL[n_h]) ? + tb->lkey[n_h] : B_NR_ITEMS(tb->FL[n_h]); /* Get left neighbor block number. */ n_left_neighbor_blocknr = - B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position); + B_N_CHILD_NUM(tb->FL[n_h], n_left_neighbor_position); /* Look for the left neighbor in the cache. */ if ((left = sb_find_get_block(sb, n_left_neighbor_blocknr))) { @@ -953,14 +954,14 @@ static void decrement_key(struct cpu_key *p_s_key) SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; */ -static int get_far_parent(struct tree_balance *p_s_tb, +static int get_far_parent(struct tree_balance *tb, int n_h, struct buffer_head **pp_s_father, struct buffer_head **pp_s_com_father, char c_lr_par) { struct buffer_head *p_s_parent; INITIALIZE_PATH(s_path_to_neighbor_father); - struct treepath *p_s_path = p_s_tb->tb_path; + struct treepath *p_s_path = tb->tb_path; struct cpu_key s_lr_father_key; int n_counter, n_position = INT_MAX, @@ -1005,9 +1006,9 @@ static int get_far_parent(struct tree_balance *p_s_tb, if (n_counter == FIRST_PATH_ELEMENT_OFFSET) { /* Check whether first buffer in the path is the root of the tree. */ if (PATH_OFFSET_PBUFFER - (p_s_tb->tb_path, + (tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == - SB_ROOT_BLOCK(p_s_tb->tb_sb)) { + SB_ROOT_BLOCK(tb->tb_sb)) { *pp_s_father = *pp_s_com_father = NULL; return CARRY_ON; } @@ -1022,7 +1023,7 @@ static int get_far_parent(struct tree_balance *p_s_tb, if (buffer_locked(*pp_s_com_father)) { __wait_on_buffer(*pp_s_com_father); - if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + if (FILESYSTEM_CHANGED_TB(tb)) { brelse(*pp_s_com_father); return REPEAT_SEARCH; } @@ -1035,9 +1036,9 @@ static int get_far_parent(struct tree_balance *p_s_tb, le_key2cpu_key(&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, (c_lr_par == - LEFT_PARENTS) ? (p_s_tb->lkey[n_h - 1] = + LEFT_PARENTS) ? 
(tb->lkey[n_h - 1] = n_position - - 1) : (p_s_tb->rkey[n_h - + 1) : (tb->rkey[n_h - 1] = n_position))); @@ -1045,12 +1046,12 @@ static int get_far_parent(struct tree_balance *p_s_tb, decrement_key(&s_lr_father_key); if (search_by_key - (p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, + (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR) // path is released return IO_ERROR; - if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + if (FILESYSTEM_CHANGED_TB(tb)) { pathrelse(&s_path_to_neighbor_father); brelse(*pp_s_com_father); return REPEAT_SEARCH; @@ -1075,24 +1076,26 @@ static int get_far_parent(struct tree_balance *p_s_tb, * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; */ -static int get_parents(struct tree_balance *p_s_tb, int n_h) +static int get_parents(struct tree_balance *tb, int n_h) { - struct treepath *p_s_path = p_s_tb->tb_path; + struct treepath *p_s_path = tb->tb_path; int n_position, n_ret_value, - n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); + n_path_offset = PATH_H_PATH_OFFSET(tb->tb_path, n_h); struct buffer_head *p_s_curf, *p_s_curcf; /* Current node is the root of the tree or will be root of the tree */ if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) { /* The root can not have parents. Release nodes which previously were obtained as parents of the current node neighbors. */ - brelse(p_s_tb->FL[n_h]); - brelse(p_s_tb->CFL[n_h]); - brelse(p_s_tb->FR[n_h]); - brelse(p_s_tb->CFR[n_h]); - p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = - p_s_tb->CFR[n_h] = NULL; + brelse(tb->FL[n_h]); + brelse(tb->CFL[n_h]); + brelse(tb->FR[n_h]); + brelse(tb->CFR[n_h]); + tb->FL[n_h] = NULL; + tb->CFL[n_h] = NULL; + tb->FR[n_h] = NULL; + tb->CFR[n_h] = NULL; return CARRY_ON; } @@ -1104,22 +1107,22 @@ static int get_parents(struct tree_balance *p_s_tb, int n_h) PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); get_bh(p_s_curf); get_bh(p_s_curf); - p_s_tb->lkey[n_h] = n_position - 1; + tb->lkey[n_h] = n_position - 1; } else { /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node. Calculate current common parent of L[n_path_offset] and the current node. Note that CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset]. Calculate lkey[n_path_offset]. */ - if ((n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, + if ((n_ret_value = get_far_parent(tb, n_h + 1, &p_s_curf, &p_s_curcf, LEFT_PARENTS)) != CARRY_ON) return n_ret_value; } - brelse(p_s_tb->FL[n_h]); - p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */ - brelse(p_s_tb->CFL[n_h]); - p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */ + brelse(tb->FL[n_h]); + tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */ + brelse(tb->CFL[n_h]); + tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */ RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) || (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)), @@ -1133,7 +1136,7 @@ static int get_parents(struct tree_balance *p_s_tb, int n_h) Calculate current common parent of R[n_h] and current node. Note that CFR[n_h] not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. 
*/ if ((n_ret_value = - get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, + get_far_parent(tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON) return n_ret_value; } else { @@ -1143,14 +1146,16 @@ static int get_parents(struct tree_balance *p_s_tb, int n_h) PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); get_bh(p_s_curf); get_bh(p_s_curf); - p_s_tb->rkey[n_h] = n_position; + tb->rkey[n_h] = n_position; } - brelse(p_s_tb->FR[n_h]); - p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */ + brelse(tb->FR[n_h]); + /* New initialization of FR[n_path_offset]. */ + tb->FR[n_h] = p_s_curf; - brelse(p_s_tb->CFR[n_h]); - p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */ + brelse(tb->CFR[n_h]); + /* New initialization of CFR[n_path_offset]. */ + tb->CFR[n_h] = p_s_curcf; RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) || (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)), @@ -1885,12 +1890,12 @@ static int check_balance(int mode, } /* Check whether parent at the path is the really parent of the current node.*/ -static int get_direct_parent(struct tree_balance *p_s_tb, int n_h) +static int get_direct_parent(struct tree_balance *tb, int n_h) { struct buffer_head *bh; - struct treepath *p_s_path = p_s_tb->tb_path; + struct treepath *p_s_path = tb->tb_path; int n_position, - n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); + n_path_offset = PATH_H_PATH_OFFSET(tb->tb_path, n_h); /* We are in the root or in the new root. */ if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) { @@ -1899,7 +1904,7 @@ static int get_direct_parent(struct tree_balance *p_s_tb, int n_h) "PAP-8260: invalid offset in the path"); if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)-> - b_blocknr == SB_ROOT_BLOCK(p_s_tb->tb_sb)) { + b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) { /* Root is not changed. */ PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL; PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0; @@ -1924,7 +1929,7 @@ static int get_direct_parent(struct tree_balance *p_s_tb, int n_h) if (buffer_locked(bh)) { __wait_on_buffer(bh); - if (FILESYSTEM_CHANGED_TB(p_s_tb)) + if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } @@ -1937,85 +1942,86 @@ static int get_direct_parent(struct tree_balance *p_s_tb, int n_h) * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; */ -static int get_neighbors(struct tree_balance *p_s_tb, int n_h) +static int get_neighbors(struct tree_balance *tb, int n_h) { int n_child_position, - n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1); + n_path_offset = PATH_H_PATH_OFFSET(tb->tb_path, n_h + 1); unsigned long n_son_number; - struct super_block *sb = p_s_tb->tb_sb; + struct super_block *sb = tb->tb_sb; struct buffer_head *bh; PROC_INFO_INC(sb, get_neighbors[n_h]); - if (p_s_tb->lnum[n_h]) { + if (tb->lnum[n_h]) { /* We need left neighbor to balance S[n_h]. */ PROC_INFO_INC(sb, need_l_neighbor[n_h]); - bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + bh = PATH_OFFSET_PBUFFER(tb->tb_path, n_path_offset); - RFALSE(bh == p_s_tb->FL[n_h] && - !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset), + RFALSE(bh == tb->FL[n_h] && + !PATH_OFFSET_POSITION(tb->tb_path, n_path_offset), "PAP-8270: invalid position in the parent"); n_child_position = (bh == - p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb-> + tb->FL[n_h]) ? 
tb->lkey[n_h] : B_NR_ITEMS(tb-> FL[n_h]); - n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position); + n_son_number = B_N_CHILD_NUM(tb->FL[n_h], n_child_position); bh = sb_bread(sb, n_son_number); if (!bh) return IO_ERROR; - if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + if (FILESYSTEM_CHANGED_TB(tb)) { brelse(bh); PROC_INFO_INC(sb, get_neighbors_restart[n_h]); return REPEAT_SEARCH; } - RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) || - n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) || - B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) != + RFALSE(!B_IS_IN_TREE(tb->FL[n_h]) || + n_child_position > B_NR_ITEMS(tb->FL[n_h]) || + B_N_CHILD_NUM(tb->FL[n_h], n_child_position) != bh->b_blocknr, "PAP-8275: invalid parent"); RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child"); RFALSE(!n_h && B_FREE_SPACE(bh) != MAX_CHILD_SIZE(bh) - - dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)), + dc_size(B_N_CHILD(tb->FL[0], n_child_position)), "PAP-8290: invalid child size of left neighbor"); - brelse(p_s_tb->L[n_h]); - p_s_tb->L[n_h] = bh; + brelse(tb->L[n_h]); + tb->L[n_h] = bh; } - if (p_s_tb->rnum[n_h]) { /* We need right neighbor to balance S[n_path_offset]. */ + /* We need right neighbor to balance S[n_path_offset]. */ + if (tb->rnum[n_h]) { PROC_INFO_INC(sb, need_r_neighbor[n_h]); - bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + bh = PATH_OFFSET_PBUFFER(tb->tb_path, n_path_offset); - RFALSE(bh == p_s_tb->FR[n_h] && - PATH_OFFSET_POSITION(p_s_tb->tb_path, + RFALSE(bh == tb->FR[n_h] && + PATH_OFFSET_POSITION(tb->tb_path, n_path_offset) >= B_NR_ITEMS(bh), "PAP-8295: invalid position in the parent"); n_child_position = - (bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0; - n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position); + (bh == tb->FR[n_h]) ? 
tb->rkey[n_h] + 1 : 0; + n_son_number = B_N_CHILD_NUM(tb->FR[n_h], n_child_position); bh = sb_bread(sb, n_son_number); if (!bh) return IO_ERROR; - if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + if (FILESYSTEM_CHANGED_TB(tb)) { brelse(bh); PROC_INFO_INC(sb, get_neighbors_restart[n_h]); return REPEAT_SEARCH; } - brelse(p_s_tb->R[n_h]); - p_s_tb->R[n_h] = bh; + brelse(tb->R[n_h]); + tb->R[n_h] = bh; RFALSE(!n_h && B_FREE_SPACE(bh) != MAX_CHILD_SIZE(bh) - - dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)), + dc_size(B_N_CHILD(tb->FR[0], n_child_position)), "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh), - dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position))); + dc_size(B_N_CHILD(tb->FR[0], n_child_position))); } return CARRY_ON; @@ -2139,7 +2145,7 @@ static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) return reiserfs_prepare_for_journal(s, bh, 0); } -static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb) +static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) { struct buffer_head *locked; #ifdef CONFIG_REISERFS_CHECK @@ -2151,95 +2157,94 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb) locked = NULL; - for (i = p_s_tb->tb_path->path_length; + for (i = tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) { - if (PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) { + if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) { /* if I understand correctly, we can only be sure the last buffer ** in the path is in the tree --clm */ #ifdef CONFIG_REISERFS_CHECK - if (PATH_PLAST_BUFFER(p_s_tb->tb_path) == - PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) { - tb_buffer_sanity_check(p_s_tb->tb_sb, + if (PATH_PLAST_BUFFER(tb->tb_path) == + PATH_OFFSET_PBUFFER(tb->tb_path, i)) + tb_buffer_sanity_check(tb->tb_sb, PATH_OFFSET_PBUFFER - (p_s_tb->tb_path, + (tb->tb_path, i), "S", - p_s_tb->tb_path-> + tb->tb_path-> path_length - i); - } #endif - if (!clear_all_dirty_bits(p_s_tb->tb_sb, + if (!clear_all_dirty_bits(tb->tb_sb, PATH_OFFSET_PBUFFER - (p_s_tb->tb_path, + (tb->tb_path, i))) { locked = - PATH_OFFSET_PBUFFER(p_s_tb->tb_path, + PATH_OFFSET_PBUFFER(tb->tb_path, i); } } } - for (i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; + for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i]; i++) { - if (p_s_tb->lnum[i]) { + if (tb->lnum[i]) { - if (p_s_tb->L[i]) { - tb_buffer_sanity_check(p_s_tb->tb_sb, - p_s_tb->L[i], + if (tb->L[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->L[i], "L", i); if (!clear_all_dirty_bits - (p_s_tb->tb_sb, p_s_tb->L[i])) - locked = p_s_tb->L[i]; + (tb->tb_sb, tb->L[i])) + locked = tb->L[i]; } - if (!locked && p_s_tb->FL[i]) { - tb_buffer_sanity_check(p_s_tb->tb_sb, - p_s_tb->FL[i], + if (!locked && tb->FL[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->FL[i], "FL", i); if (!clear_all_dirty_bits - (p_s_tb->tb_sb, p_s_tb->FL[i])) - locked = p_s_tb->FL[i]; + (tb->tb_sb, tb->FL[i])) + locked = tb->FL[i]; } - if (!locked && p_s_tb->CFL[i]) { - tb_buffer_sanity_check(p_s_tb->tb_sb, - p_s_tb->CFL[i], + if (!locked && tb->CFL[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->CFL[i], "CFL", i); if (!clear_all_dirty_bits - (p_s_tb->tb_sb, p_s_tb->CFL[i])) - locked = p_s_tb->CFL[i]; + (tb->tb_sb, tb->CFL[i])) + locked = tb->CFL[i]; } } - if (!locked && (p_s_tb->rnum[i])) { + if (!locked && (tb->rnum[i])) { - if (p_s_tb->R[i]) { - tb_buffer_sanity_check(p_s_tb->tb_sb, - p_s_tb->R[i], + if (tb->R[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->R[i], "R", i); if 
(!clear_all_dirty_bits - (p_s_tb->tb_sb, p_s_tb->R[i])) - locked = p_s_tb->R[i]; + (tb->tb_sb, tb->R[i])) + locked = tb->R[i]; } - if (!locked && p_s_tb->FR[i]) { - tb_buffer_sanity_check(p_s_tb->tb_sb, - p_s_tb->FR[i], + if (!locked && tb->FR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->FR[i], "FR", i); if (!clear_all_dirty_bits - (p_s_tb->tb_sb, p_s_tb->FR[i])) - locked = p_s_tb->FR[i]; + (tb->tb_sb, tb->FR[i])) + locked = tb->FR[i]; } - if (!locked && p_s_tb->CFR[i]) { - tb_buffer_sanity_check(p_s_tb->tb_sb, - p_s_tb->CFR[i], + if (!locked && tb->CFR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->CFR[i], "CFR", i); if (!clear_all_dirty_bits - (p_s_tb->tb_sb, p_s_tb->CFR[i])) - locked = p_s_tb->CFR[i]; + (tb->tb_sb, tb->CFR[i])) + locked = tb->CFR[i]; } } } @@ -2252,10 +2257,10 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb) ** --clm */ for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { - if (p_s_tb->FEB[i]) { + if (tb->FEB[i]) { if (!clear_all_dirty_bits - (p_s_tb->tb_sb, p_s_tb->FEB[i])) - locked = p_s_tb->FEB[i]; + (tb->tb_sb, tb->FEB[i])) + locked = tb->FEB[i]; } } @@ -2263,21 +2268,20 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb) #ifdef CONFIG_REISERFS_CHECK repeat_counter++; if ((repeat_counter % 10000) == 0) { - reiserfs_warning(p_s_tb->tb_sb, "reiserfs-8200", + reiserfs_warning(tb->tb_sb, "reiserfs-8200", "too many iterations waiting " "for buffer to unlock " "(%b)", locked); /* Don't loop forever. Try to recover from possible error. */ - return (FILESYSTEM_CHANGED_TB(p_s_tb)) ? + return (FILESYSTEM_CHANGED_TB(tb)) ? REPEAT_SEARCH : CARRY_ON; } #endif __wait_on_buffer(locked); - if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; - } } } while (locked); @@ -2307,138 +2311,136 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb) * tb tree_balance structure; * inum item number in S[h]; * pos_in_item - comment this if you can - * ins_ih & ins_sd are used when inserting + * ins_ih item head of item being inserted + * data inserted item or data to be pasted * Returns: 1 - schedule occurred while the function worked; * 0 - schedule didn't occur while the function worked; * -1 - if no_disk_space */ -int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih, // item head of item being inserted - const void *data // inserted item or data to be pasted - ) +int fix_nodes(int n_op_mode, struct tree_balance *tb, + struct item_head *p_s_ins_ih, const void *data) { - int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path); + int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(tb->tb_path); int n_pos_in_item; /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared ** during wait_tb_buffers_run */ int wait_tb_buffers_run = 0; - struct buffer_head *p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path); + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - ++REISERFS_SB(p_s_tb->tb_sb)->s_fix_nodes; + ++REISERFS_SB(tb->tb_sb)->s_fix_nodes; - n_pos_in_item = p_s_tb->tb_path->pos_in_item; + n_pos_in_item = tb->tb_path->pos_in_item; - p_s_tb->fs_gen = get_generation(p_s_tb->tb_sb); + tb->fs_gen = get_generation(tb->tb_sb); /* we prepare and log the super here so it will already be in the ** transaction when do_balance needs to change it. 
** This way do_balance won't have to schedule when trying to prepare ** the super for logging */ - reiserfs_prepare_for_journal(p_s_tb->tb_sb, - SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1); - journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb, - SB_BUFFER_WITH_SB(p_s_tb->tb_sb)); - if (FILESYSTEM_CHANGED_TB(p_s_tb)) + reiserfs_prepare_for_journal(tb->tb_sb, + SB_BUFFER_WITH_SB(tb->tb_sb), 1); + journal_mark_dirty(tb->transaction_handle, tb->tb_sb, + SB_BUFFER_WITH_SB(tb->tb_sb)); + if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; /* if it possible in indirect_to_direct conversion */ - if (buffer_locked(p_s_tbS0)) { - __wait_on_buffer(p_s_tbS0); - if (FILESYSTEM_CHANGED_TB(p_s_tb)) + if (buffer_locked(tbS0)) { + __wait_on_buffer(tbS0); + if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } #ifdef CONFIG_REISERFS_CHECK if (cur_tb) { print_cur_tb("fix_nodes"); - reiserfs_panic(p_s_tb->tb_sb, "PAP-8305", + reiserfs_panic(tb->tb_sb, "PAP-8305", "there is pending do_balance"); } - if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) { - reiserfs_panic(p_s_tb->tb_sb, "PAP-8320", "S[0] (%b %z) is " + if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is " "not uptodate at the beginning of fix_nodes " "or not in tree (mode %c)", - p_s_tbS0, p_s_tbS0, n_op_mode); - } + tbS0, tbS0, n_op_mode); /* Check parameters. */ switch (n_op_mode) { case M_INSERT: - if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0)) - reiserfs_panic(p_s_tb->tb_sb, "PAP-8330", "Incorrect " + if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect " "item number %d (in S0 - %d) in case " "of insert", n_item_num, - B_NR_ITEMS(p_s_tbS0)); + B_NR_ITEMS(tbS0)); break; case M_PASTE: case M_DELETE: case M_CUT: - if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) { - print_block(p_s_tbS0, 0, -1, -1); - reiserfs_panic(p_s_tb->tb_sb, "PAP-8335", "Incorrect " + if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(tbS0)) { + print_block(tbS0, 0, -1, -1); + reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect " "item number(%d); mode = %c " "insert_size = %d", n_item_num, n_op_mode, - p_s_tb->insert_size[0]); + tb->insert_size[0]); } break; default: - reiserfs_panic(p_s_tb->tb_sb, "PAP-8340", "Incorrect mode " + reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode " "of operation"); } #endif - if (get_mem_for_virtual_node(p_s_tb) == REPEAT_SEARCH) + if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH) // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat return REPEAT_SEARCH; /* Starting from the leaf level; for all levels n_h of the tree. */ - for (n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++) { - if ((n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON) { + for (n_h = 0; n_h < MAX_HEIGHT && tb->insert_size[n_h]; n_h++) { + n_ret_value = get_direct_parent(tb, n_h); + if (n_ret_value != CARRY_ON) goto repeat; - } - if ((n_ret_value = - check_balance(n_op_mode, p_s_tb, n_h, n_item_num, - n_pos_in_item, p_s_ins_ih, - data)) != CARRY_ON) { + n_ret_value = check_balance(n_op_mode, tb, n_h, n_item_num, + n_pos_in_item, p_s_ins_ih, data); + if (n_ret_value != CARRY_ON) { if (n_ret_value == NO_BALANCING_NEEDED) { /* No balancing for higher levels needed. 
*/ - if ((n_ret_value = - get_neighbors(p_s_tb, n_h)) != CARRY_ON) { + n_ret_value = get_neighbors(tb, n_h); + if (n_ret_value != CARRY_ON) goto repeat; - } if (n_h != MAX_HEIGHT - 1) - p_s_tb->insert_size[n_h + 1] = 0; + tb->insert_size[n_h + 1] = 0; /* ok, analysis and resource gathering are complete */ break; } goto repeat; } - if ((n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON) { + n_ret_value = get_neighbors(tb, n_h); + if (n_ret_value != CARRY_ON) goto repeat; - } - if ((n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON) { - goto repeat; /* No disk space, or schedule occurred and - analysis may be invalid and needs to be redone. */ - } + /* No disk space, or schedule occurred and analysis may be + * invalid and needs to be redone. */ + n_ret_value = get_empty_nodes(tb, n_h); + if (n_ret_value != CARRY_ON) + goto repeat; - if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h)) { + if (!PATH_H_PBUFFER(tb->tb_path, n_h)) { /* We have a positive insert size but no nodes exist on this level, this means that we are creating a new root. */ - RFALSE(p_s_tb->blknum[n_h] != 1, + RFALSE(tb->blknum[n_h] != 1, "PAP-8350: creating new empty root"); if (n_h < MAX_HEIGHT - 1) - p_s_tb->insert_size[n_h + 1] = 0; - } else if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1)) { - if (p_s_tb->blknum[n_h] > 1) { + tb->insert_size[n_h + 1] = 0; + } else if (!PATH_H_PBUFFER(tb->tb_path, n_h + 1)) { + if (tb->blknum[n_h] > 1) { /* The tree needs to be grown, so this node S[n_h] which is the root node is split into two nodes, and a new node (S[n_h+1]) will be created to @@ -2447,19 +2449,20 @@ int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ RFALSE(n_h == MAX_HEIGHT - 1, "PAP-8355: attempt to create too high of a tree"); - p_s_tb->insert_size[n_h + 1] = + tb->insert_size[n_h + 1] = (DC_SIZE + - KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + + KEY_SIZE) * (tb->blknum[n_h] - 1) + DC_SIZE; } else if (n_h < MAX_HEIGHT - 1) - p_s_tb->insert_size[n_h + 1] = 0; + tb->insert_size[n_h + 1] = 0; } else - p_s_tb->insert_size[n_h + 1] = - (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1); + tb->insert_size[n_h + 1] = + (DC_SIZE + KEY_SIZE) * (tb->blknum[n_h] - 1); } - if ((n_ret_value = wait_tb_buffers_until_unlocked(p_s_tb)) == CARRY_ON) { - if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + n_ret_value = wait_tb_buffers_until_unlocked(tb); + if (n_ret_value == CARRY_ON) { + if (FILESYSTEM_CHANGED_TB(tb)) { wait_tb_buffers_run = 1; n_ret_value = REPEAT_SEARCH; goto repeat; @@ -2482,50 +2485,49 @@ int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ /* Release path buffers. 
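As a rough, self-contained outline of the resource-gathering loop above (using stand-in names such as gather_resources(), get_parent_lvl() and MAX_LEVELS rather than the kernel's own helpers), the sketch below shows how fix_nodes() walks the levels while insert_size[] stays non-zero and drops to its repeat path as soon as any step returns something other than CARRY_ON:

#include <stdio.h>

enum { CARRY_ON, REPEAT_SEARCH, NO_DISK_SPACE, NO_BALANCING_NEEDED };

#define MAX_LEVELS 6    /* stand-in for MAX_HEIGHT */

/* Trivial stubs standing in for get_direct_parent(), check_balance(),
 * get_neighbors() and get_empty_nodes(); here they always succeed. */
static int get_parent_lvl(int h)    { (void)h; return CARRY_ON; }
static int check_balance_lvl(int h) { (void)h; return CARRY_ON; }
static int get_neighbors_lvl(int h) { (void)h; return CARRY_ON; }
static int get_empty_lvl(int h)     { (void)h; return CARRY_ON; }

static int gather_resources(const int insert_size[MAX_LEVELS])
{
        int h, ret = CARRY_ON;

        for (h = 0; h < MAX_LEVELS && insert_size[h]; h++) {
                ret = get_parent_lvl(h);
                if (ret != CARRY_ON)
                        goto repeat;

                ret = check_balance_lvl(h);
                if (ret == NO_BALANCING_NEEDED) {
                        /* this level still needs its neighbors,
                         * but nothing propagates further up */
                        ret = get_neighbors_lvl(h);
                        if (ret != CARRY_ON)
                                goto repeat;
                        break;
                }
                if (ret != CARRY_ON)
                        goto repeat;

                ret = get_neighbors_lvl(h);
                if (ret != CARRY_ON)
                        goto repeat;

                ret = get_empty_lvl(h);
                if (ret != CARRY_ON)
                        goto repeat;    /* no space, or the tree changed */
        }
        return CARRY_ON;

repeat:
        /* the caller releases all collected buffers and restarts */
        return ret;
}

int main(void)
{
        int insert_size[MAX_LEVELS] = { 48, 16, 0 };

        printf("%d\n", gather_resources(insert_size));  /* prints 0 */
        return 0;
}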
*/ if (wait_tb_buffers_run) { - pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path); + pathrelse_and_restore(tb->tb_sb, tb->tb_path); } else { - pathrelse(p_s_tb->tb_path); + pathrelse(tb->tb_path); } /* brelse all resources collected for balancing */ for (i = 0; i < MAX_HEIGHT; i++) { if (wait_tb_buffers_run) { - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, - p_s_tb->L[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, - p_s_tb->R[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, - p_s_tb->FL[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, - p_s_tb->FR[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, - p_s_tb-> + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> CFL[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, - p_s_tb-> + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> CFR[i]); } - brelse(p_s_tb->L[i]); - brelse(p_s_tb->R[i]); - brelse(p_s_tb->FL[i]); - brelse(p_s_tb->FR[i]); - brelse(p_s_tb->CFL[i]); - brelse(p_s_tb->CFR[i]); - - p_s_tb->L[i] = NULL; - p_s_tb->R[i] = NULL; - p_s_tb->FL[i] = NULL; - p_s_tb->FR[i] = NULL; - p_s_tb->CFL[i] = NULL; - p_s_tb->CFR[i] = NULL; + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + + tb->L[i] = NULL; + tb->R[i] = NULL; + tb->FL[i] = NULL; + tb->FR[i] = NULL; + tb->CFL[i] = NULL; + tb->CFR[i] = NULL; } if (wait_tb_buffers_run) { for (i = 0; i < MAX_FEB_SIZE; i++) { - if (p_s_tb->FEB[i]) { + if (tb->FEB[i]) reiserfs_restore_prepared_buffer - (p_s_tb->tb_sb, p_s_tb->FEB[i]); - } + (tb->tb_sb, tb->FEB[i]); } } return n_ret_value; @@ -2533,7 +2535,7 @@ int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ } -/* Anatoly will probably forgive me renaming p_s_tb to tb. I just +/* Anatoly will probably forgive me renaming tb to tb. I just wanted to make lines shorter */ void unfix_nodes(struct tree_balance *tb) { diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 8f220fb777d7..5e867be559ea 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1063,17 +1063,17 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st } /* Calculate number of bytes which will be deleted or cut during balance */ -static int calc_deleted_bytes_number(struct tree_balance *p_s_tb, char c_mode) +static int calc_deleted_bytes_number(struct tree_balance *tb, char c_mode) { int n_del_size; - struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path); + struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path); if (is_statdata_le_ih(p_le_ih)) return 0; n_del_size = (c_mode == - M_DELETE) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; + M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0]; if (is_direntry_le_ih(p_le_ih)) { // return EMPTY_DIR_SIZE; /* We delete emty directoris only. 
*/ // we can't use EMPTY_DIR_SIZE, as old format dirs have a different @@ -1083,25 +1083,26 @@ static int calc_deleted_bytes_number(struct tree_balance *p_s_tb, char c_mode) } if (is_indirect_le_ih(p_le_ih)) - n_del_size = (n_del_size / UNFM_P_SIZE) * (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size); // - get_ih_free_space (p_le_ih); + n_del_size = (n_del_size / UNFM_P_SIZE) * + (PATH_PLAST_BUFFER(tb->tb_path)->b_size); return n_del_size; } static void init_tb_struct(struct reiserfs_transaction_handle *th, - struct tree_balance *p_s_tb, + struct tree_balance *tb, struct super_block *sb, struct treepath *p_s_path, int n_size) { BUG_ON(!th->t_trans_id); - memset(p_s_tb, '\0', sizeof(struct tree_balance)); - p_s_tb->transaction_handle = th; - p_s_tb->tb_sb = sb; - p_s_tb->tb_path = p_s_path; + memset(tb, '\0', sizeof(struct tree_balance)); + tb->transaction_handle = th; + tb->tb_sb = sb; + tb->tb_path = p_s_path; PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; - p_s_tb->insert_size[0] = n_size; + tb->insert_size[0] = n_size; } void padd_item(char *item, int total_length, int length) diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 3192dc793226..b72dc2095478 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -2004,7 +2004,7 @@ extern const struct address_space_operations reiserfs_address_space_operations; /* fix_nodes.c */ -int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, +int fix_nodes(int n_op_mode, struct tree_balance *tb, struct item_head *p_s_ins_ih, const void *); void unfix_nodes(struct tree_balance *); -- cgit v1.2.3-71-gd317 From d68caa9530a8ba54f97002e02bf6a0ad2462b8c0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 30 Mar 2009 14:02:49 -0400 Subject: reiserfs: rename p_._ variables This patch is a simple s/p_._//g to the reiserfs code. This is the fifth in a series of patches to rip out some of the awful variable naming in reiserfs. Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 6 +- fs/reiserfs/fix_node.c | 169 ++++++++------- fs/reiserfs/stree.c | 472 +++++++++++++++++++++--------------------- fs/reiserfs/tail_conversion.c | 28 +-- include/linux/reiserfs_fs.h | 46 ++-- 5 files changed, 365 insertions(+), 356 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index a73579f66214..cde16429ff00 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -134,10 +134,10 @@ static void reiserfs_vfs_truncate_file(struct inode *inode) * be removed... */ -static int reiserfs_sync_file(struct file *p_s_filp, - struct dentry *p_s_dentry, int datasync) +static int reiserfs_sync_file(struct file *filp, + struct dentry *dentry, int datasync) { - struct inode *inode = p_s_dentry->d_inode; + struct inode *inode = dentry->d_inode; int n_err; int barrier_done; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 5236a8829e31..d97a55574ba9 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -780,9 +780,9 @@ static void free_buffers_in_tb(struct tree_balance *tb) /* The function is NOT SCHEDULE-SAFE! 
*/ static int get_empty_nodes(struct tree_balance *tb, int n_h) { - struct buffer_head *p_s_new_bh, - *p_s_Sh = PATH_H_PBUFFER(tb->tb_path, n_h); - b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; + struct buffer_head *new_bh, + *Sh = PATH_H_PBUFFER(tb->tb_path, n_h); + b_blocknr_t *blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; int n_counter, n_number_of_freeblk, n_amount_needed, /* number of needed empty blocks */ n_retval = CARRY_ON; struct super_block *sb = tb->tb_sb; @@ -810,8 +810,8 @@ static int get_empty_nodes(struct tree_balance *tb, int n_h) 1) : 0; /* Allocate missing empty blocks. */ - /* if p_s_Sh == 0 then we are getting a new root */ - n_amount_needed = (p_s_Sh) ? (tb->blknum[n_h] - 1) : 1; + /* if Sh == 0 then we are getting a new root */ + n_amount_needed = (Sh) ? (tb->blknum[n_h] - 1) : 1; /* Amount_needed = the amount that we need more than the amount that we have. */ if (n_amount_needed > n_number_of_freeblk) n_amount_needed -= n_number_of_freeblk; @@ -824,25 +824,25 @@ static int get_empty_nodes(struct tree_balance *tb, int n_h) return NO_DISK_SPACE; /* for each blocknumber we just got, get a buffer and stick it on FEB */ - for (p_n_blocknr = a_n_blocknrs, n_counter = 0; - n_counter < n_amount_needed; p_n_blocknr++, n_counter++) { + for (blocknr = a_n_blocknrs, n_counter = 0; + n_counter < n_amount_needed; blocknr++, n_counter++) { - RFALSE(!*p_n_blocknr, + RFALSE(!*blocknr, "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); - p_s_new_bh = sb_getblk(sb, *p_n_blocknr); - RFALSE(buffer_dirty(p_s_new_bh) || - buffer_journaled(p_s_new_bh) || - buffer_journal_dirty(p_s_new_bh), + new_bh = sb_getblk(sb, *blocknr); + RFALSE(buffer_dirty(new_bh) || + buffer_journaled(new_bh) || + buffer_journal_dirty(new_bh), "PAP-8140: journlaled or dirty buffer %b for the new block", - p_s_new_bh); + new_bh); /* Put empty buffers into the array. */ RFALSE(tb->FEB[tb->cur_blknum], "PAP-8141: busy slot for new buffer"); - set_buffer_journal_new(p_s_new_bh); - tb->FEB[tb->cur_blknum++] = p_s_new_bh; + set_buffer_journal_new(new_bh); + tb->FEB[tb->cur_blknum++] = new_bh; } if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb)) @@ -898,7 +898,7 @@ static int get_rfree(struct tree_balance *tb, int h) /* Check whether left neighbor is in memory. */ static int is_left_neighbor_in_cache(struct tree_balance *tb, int n_h) { - struct buffer_head *p_s_father, *left; + struct buffer_head *father, *left; struct super_block *sb = tb->tb_sb; b_blocknr_t n_left_neighbor_blocknr; int n_left_neighbor_position; @@ -908,18 +908,18 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int n_h) return 0; /* Calculate father of the node to be balanced. */ - p_s_father = PATH_H_PBUFFER(tb->tb_path, n_h + 1); + father = PATH_H_PBUFFER(tb->tb_path, n_h + 1); - RFALSE(!p_s_father || - !B_IS_IN_TREE(p_s_father) || + RFALSE(!father || + !B_IS_IN_TREE(father) || !B_IS_IN_TREE(tb->FL[n_h]) || - !buffer_uptodate(p_s_father) || + !buffer_uptodate(father) || !buffer_uptodate(tb->FL[n_h]), "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", - p_s_father, tb->FL[n_h]); + father, tb->FL[n_h]); /* Get position of the pointer to the left neighbor into the left father. */ - n_left_neighbor_position = (p_s_father == tb->FL[n_h]) ? + n_left_neighbor_position = (father == tb->FL[n_h]) ? tb->lkey[n_h] : B_NR_ITEMS(tb->FL[n_h]); /* Get left neighbor block number. 
*/ n_left_neighbor_blocknr = @@ -940,10 +940,10 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int n_h) #define LEFT_PARENTS 'l' #define RIGHT_PARENTS 'r' -static void decrement_key(struct cpu_key *p_s_key) +static void decrement_key(struct cpu_key *key) { // call item specific function for this key - item_ops[cpu_key_k_type(p_s_key)]->decrement_key(p_s_key); + item_ops[cpu_key_k_type(key)]->decrement_key(key); } /* Calculate far left/right parent of the left/right neighbor of the current node, that @@ -956,17 +956,17 @@ static void decrement_key(struct cpu_key *p_s_key) */ static int get_far_parent(struct tree_balance *tb, int n_h, - struct buffer_head **pp_s_father, - struct buffer_head **pp_s_com_father, char c_lr_par) + struct buffer_head **pfather, + struct buffer_head **pcom_father, char c_lr_par) { - struct buffer_head *p_s_parent; + struct buffer_head *parent; INITIALIZE_PATH(s_path_to_neighbor_father); - struct treepath *p_s_path = tb->tb_path; + struct treepath *path = tb->tb_path; struct cpu_key s_lr_father_key; int n_counter, n_position = INT_MAX, n_first_last_position = 0, - n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h); + n_path_offset = PATH_H_PATH_OFFSET(path, n_h); /* Starting from F[n_h] go upwards in the tree, and look for the common ancestor of F[n_h], and its neighbor l/r, that should be obtained. */ @@ -979,25 +979,25 @@ static int get_far_parent(struct tree_balance *tb, for (; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--) { /* Check whether parent of the current buffer in the path is really parent in the tree. */ if (!B_IS_IN_TREE - (p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1))) + (parent = PATH_OFFSET_PBUFFER(path, n_counter - 1))) return REPEAT_SEARCH; /* Check whether position in the parent is correct. */ if ((n_position = - PATH_OFFSET_POSITION(p_s_path, + PATH_OFFSET_POSITION(path, n_counter - 1)) > - B_NR_ITEMS(p_s_parent)) + B_NR_ITEMS(parent)) return REPEAT_SEARCH; /* Check whether parent at the path really points to the child. */ - if (B_N_CHILD_NUM(p_s_parent, n_position) != - PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr) + if (B_N_CHILD_NUM(parent, n_position) != + PATH_OFFSET_PBUFFER(path, n_counter)->b_blocknr) return REPEAT_SEARCH; /* Return delimiting key if position in the parent is not equal to first/last one. */ if (c_lr_par == RIGHT_PARENTS) - n_first_last_position = B_NR_ITEMS(p_s_parent); + n_first_last_position = B_NR_ITEMS(parent); if (n_position != n_first_last_position) { - *pp_s_com_father = p_s_parent; - get_bh(*pp_s_com_father); - /*(*pp_s_com_father = p_s_parent)->b_count++; */ + *pcom_father = parent; + get_bh(*pcom_father); + /*(*pcom_father = parent)->b_count++; */ break; } } @@ -1009,22 +1009,22 @@ static int get_far_parent(struct tree_balance *tb, (tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) { - *pp_s_father = *pp_s_com_father = NULL; + *pfather = *pcom_father = NULL; return CARRY_ON; } return REPEAT_SEARCH; } - RFALSE(B_LEVEL(*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL, + RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL, "PAP-8185: (%b %z) level too small", - *pp_s_com_father, *pp_s_com_father); + *pcom_father, *pcom_father); /* Check whether the common parent is locked. 
*/ - if (buffer_locked(*pp_s_com_father)) { - __wait_on_buffer(*pp_s_com_father); + if (buffer_locked(*pcom_father)) { + __wait_on_buffer(*pcom_father); if (FILESYSTEM_CHANGED_TB(tb)) { - brelse(*pp_s_com_father); + brelse(*pcom_father); return REPEAT_SEARCH; } } @@ -1034,7 +1034,7 @@ static int get_far_parent(struct tree_balance *tb, /* Form key to get parent of the left/right neighbor. */ le_key2cpu_key(&s_lr_father_key, - B_N_PDELIM_KEY(*pp_s_com_father, + B_N_PDELIM_KEY(*pcom_father, (c_lr_par == LEFT_PARENTS) ? (tb->lkey[n_h - 1] = n_position - @@ -1053,14 +1053,14 @@ static int get_far_parent(struct tree_balance *tb, if (FILESYSTEM_CHANGED_TB(tb)) { pathrelse(&s_path_to_neighbor_father); - brelse(*pp_s_com_father); + brelse(*pcom_father); return REPEAT_SEARCH; } - *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); + *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); - RFALSE(B_LEVEL(*pp_s_father) != n_h + 1, - "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father); + RFALSE(B_LEVEL(*pfather) != n_h + 1, + "PAP-8190: (%b %z) level too small", *pfather, *pfather); RFALSE(s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small"); @@ -1078,11 +1078,11 @@ static int get_far_parent(struct tree_balance *tb, */ static int get_parents(struct tree_balance *tb, int n_h) { - struct treepath *p_s_path = tb->tb_path; + struct treepath *path = tb->tb_path; int n_position, n_ret_value, n_path_offset = PATH_H_PATH_OFFSET(tb->tb_path, n_h); - struct buffer_head *p_s_curf, *p_s_curcf; + struct buffer_head *curf, *curcf; /* Current node is the root of the tree or will be root of the tree */ if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) { @@ -1100,66 +1100,65 @@ static int get_parents(struct tree_balance *tb, int n_h) } /* Get parent FL[n_path_offset] of L[n_path_offset]. */ - if ((n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1))) { + n_position = PATH_OFFSET_POSITION(path, n_path_offset - 1); + if (n_position) { /* Current node is not the first child of its parent. */ - /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */ - p_s_curf = p_s_curcf = - PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); - get_bh(p_s_curf); - get_bh(p_s_curf); + curf = PATH_OFFSET_PBUFFER(path, n_path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, n_path_offset - 1); + get_bh(curf); + get_bh(curf); tb->lkey[n_h] = n_position - 1; } else { /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node. Calculate current common parent of L[n_path_offset] and the current node. Note that CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset]. Calculate lkey[n_path_offset]. */ - if ((n_ret_value = get_far_parent(tb, n_h + 1, &p_s_curf, - &p_s_curcf, + if ((n_ret_value = get_far_parent(tb, n_h + 1, &curf, + &curcf, LEFT_PARENTS)) != CARRY_ON) return n_ret_value; } brelse(tb->FL[n_h]); - tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */ + tb->FL[n_h] = curf; /* New initialization of FL[n_h]. */ brelse(tb->CFL[n_h]); - tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */ + tb->CFL[n_h] = curcf; /* New initialization of CFL[n_h]. 
*/ - RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) || - (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)), - "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf); + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf); /* Get parent FR[n_h] of R[n_h]. */ /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */ - if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(p_s_path, n_h + 1))) { + if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(path, n_h + 1))) { /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h]. Calculate current common parent of R[n_h] and current node. Note that CFR[n_h] not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */ if ((n_ret_value = - get_far_parent(tb, n_h + 1, &p_s_curf, &p_s_curcf, + get_far_parent(tb, n_h + 1, &curf, &curcf, RIGHT_PARENTS)) != CARRY_ON) return n_ret_value; } else { /* Current node is not the last child of its parent F[n_h]. */ - /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */ - p_s_curf = p_s_curcf = - PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); - get_bh(p_s_curf); - get_bh(p_s_curf); + curf = PATH_OFFSET_PBUFFER(path, n_path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, n_path_offset - 1); + get_bh(curf); + get_bh(curf); tb->rkey[n_h] = n_position; } brelse(tb->FR[n_h]); /* New initialization of FR[n_path_offset]. */ - tb->FR[n_h] = p_s_curf; + tb->FR[n_h] = curf; brelse(tb->CFR[n_h]); /* New initialization of CFR[n_path_offset]. */ - tb->CFR[n_h] = p_s_curcf; + tb->CFR[n_h] = curcf; - RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) || - (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)), - "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf); + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf); return CARRY_ON; } @@ -1893,7 +1892,7 @@ static int check_balance(int mode, static int get_direct_parent(struct tree_balance *tb, int n_h) { struct buffer_head *bh; - struct treepath *p_s_path = tb->tb_path; + struct treepath *path = tb->tb_path; int n_position, n_path_offset = PATH_H_PATH_OFFSET(tb->tb_path, n_h); @@ -1903,27 +1902,27 @@ static int get_direct_parent(struct tree_balance *tb, int n_h) RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, "PAP-8260: invalid offset in the path"); - if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)-> + if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)-> b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) { /* Root is not changed. */ - PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL; - PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0; + PATH_OFFSET_PBUFFER(path, n_path_offset - 1) = NULL; + PATH_OFFSET_POSITION(path, n_path_offset - 1) = 0; return CARRY_ON; } return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ } if (!B_IS_IN_TREE - (bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))) + (bh = PATH_OFFSET_PBUFFER(path, n_path_offset - 1))) return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ if ((n_position = - PATH_OFFSET_POSITION(p_s_path, + PATH_OFFSET_POSITION(path, n_path_offset - 1)) > B_NR_ITEMS(bh)) return REPEAT_SEARCH; if (B_N_CHILD_NUM(bh, n_position) != - PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr) + PATH_OFFSET_PBUFFER(path, n_path_offset)->b_blocknr) /* Parent in the path is not parent of the current node in the tree. 
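The same re-validation pattern appears in get_direct_parent() here and in get_far_parent(), get_lkey() and get_rkey(): a cached path element is trusted only if it is still in the tree, the remembered position still fits inside it, and its child pointer still names our block. A compilable sketch of that check, with toy types standing in for the real buffer heads and path elements, might look like this:

#include <stdio.h>

struct toy_node {
        unsigned long blocknr;
        int nr_items;
        int in_tree;                    /* still linked into the tree? */
        unsigned long child_blocknr[16];
};

enum { TOY_CARRY_ON, TOY_REPEAT_SEARCH };

/* Re-validate that 'parent' at slot 'pos' still points at 'child';
 * any mismatch means the tree changed under us and the search has
 * to be repeated from the root. */
static int toy_validate_parent(const struct toy_node *parent, int pos,
                               const struct toy_node *child)
{
        if (!parent->in_tree)
                return TOY_REPEAT_SEARCH;
        if (pos > parent->nr_items)
                return TOY_REPEAT_SEARCH;
        if (parent->child_blocknr[pos] != child->blocknr)
                return TOY_REPEAT_SEARCH;
        return TOY_CARRY_ON;
}

int main(void)
{
        struct toy_node child = { .blocknr = 42 };
        struct toy_node parent = { .nr_items = 3, .in_tree = 1,
                                   .child_blocknr = { [1] = 42 } };

        printf("%d\n", toy_validate_parent(&parent, 1, &child)); /* 0: ok */
        printf("%d\n", toy_validate_parent(&parent, 2, &child)); /* 1: repeat */
        return 0;
}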
*/ return REPEAT_SEARCH; @@ -2319,7 +2318,7 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) */ int fix_nodes(int n_op_mode, struct tree_balance *tb, - struct item_head *p_s_ins_ih, const void *data) + struct item_head *ins_ih, const void *data) { int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(tb->tb_path); int n_pos_in_item; @@ -2405,7 +2404,7 @@ int fix_nodes(int n_op_mode, struct tree_balance *tb, goto repeat; n_ret_value = check_balance(n_op_mode, tb, n_h, n_item_num, - n_pos_in_item, p_s_ins_ih, data); + n_pos_in_item, ins_ih, data); if (n_ret_value != CARRY_ON) { if (n_ret_value == NO_BALANCING_NEEDED) { /* No balancing for higher levels needed. */ diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 5e867be559ea..fd769c8dac32 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -68,10 +68,10 @@ inline int B_IS_IN_TREE(const struct buffer_head *bh) // // to gets item head in le form // -inline void copy_item_head(struct item_head *p_v_to, - const struct item_head *p_v_from) +inline void copy_item_head(struct item_head *to, + const struct item_head *from) { - memcpy(p_v_to, p_v_from, IH_SIZE); + memcpy(to, from, IH_SIZE); } /* k1 is pointer to on-disk structure which is stored in little-endian @@ -135,15 +135,15 @@ static inline int comp_keys(const struct reiserfs_key *le_key, inline int comp_short_le_keys(const struct reiserfs_key *key1, const struct reiserfs_key *key2) { - __u32 *p_s_1_u32, *p_s_2_u32; + __u32 *k1_u32, *k2_u32; int n_key_length = REISERFS_SHORT_KEY_LEN; - p_s_1_u32 = (__u32 *) key1; - p_s_2_u32 = (__u32 *) key2; - for (; n_key_length--; ++p_s_1_u32, ++p_s_2_u32) { - if (le32_to_cpu(*p_s_1_u32) < le32_to_cpu(*p_s_2_u32)) + k1_u32 = (__u32 *) key1; + k2_u32 = (__u32 *) key2; + for (; n_key_length--; ++k1_u32, ++k2_u32) { + if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32)) return -1; - if (le32_to_cpu(*p_s_1_u32) > le32_to_cpu(*p_s_2_u32)) + if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32)) return 1; } return 0; @@ -174,8 +174,8 @@ inline int comp_le_keys(const struct reiserfs_key *k1, * Binary search toolkit function * * Search for an item in the array by the item key * * Returns: 1 if found, 0 if not found; * - * *p_n_pos = number of the searched element if found, else the * - * number of the first element that is larger than p_v_key. * + * *pos = number of the searched element if found, else the * + * number of the first element that is larger than key. * **************************************************************************/ /* For those not familiar with binary search: n_lbound is the leftmost item that it could be, n_rbound the rightmost item that it could be. We examine the item @@ -184,28 +184,28 @@ inline int comp_le_keys(const struct reiserfs_key *k1, there are no possible items, and we have not found it. With each examination we cut the number of possible items it could be by one more than half rounded down, or we find it. */ -static inline int bin_search(const void *p_v_key, /* Key to search for. */ - const void *p_v_base, /* First item in the array. */ - int p_n_num, /* Number of items in the array. */ - int p_n_width, /* Item size in the array. - searched. Lest the reader be - confused, note that this is crafted - as a general function, and when it - is applied specifically to the array - of item headers in a node, p_n_width - is actually the item header size not - the item size. */ - int *p_n_pos /* Number of the searched for element. */ +static inline int bin_search(const void *key, /* Key to search for. 
*/ + const void *base, /* First item in the array. */ + int num, /* Number of items in the array. */ + int width, /* Item size in the array. + searched. Lest the reader be + confused, note that this is crafted + as a general function, and when it + is applied specifically to the array + of item headers in a node, width + is actually the item header size not + the item size. */ + int *pos /* Number of the searched for element. */ ) { int n_rbound, n_lbound, n_j; - for (n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0)) / 2; + for (n_j = ((n_rbound = num - 1) + (n_lbound = 0)) / 2; n_lbound <= n_rbound; n_j = (n_rbound + n_lbound) / 2) switch (comp_keys - ((struct reiserfs_key *)((char *)p_v_base + - n_j * p_n_width), - (struct cpu_key *)p_v_key)) { + ((struct reiserfs_key *)((char *)base + + n_j * width), + (struct cpu_key *)key)) { case -1: n_lbound = n_j + 1; continue; @@ -213,13 +213,13 @@ static inline int bin_search(const void *p_v_key, /* Key to search for. n_rbound = n_j - 1; continue; case 0: - *p_n_pos = n_j; + *pos = n_j; return ITEM_FOUND; /* Key found in the array. */ } /* bin_search did not find given key, it returns position of key, that is minimal and greater than the given one. */ - *p_n_pos = n_lbound; + *pos = n_lbound; return ITEM_NOT_FOUND; } @@ -243,12 +243,12 @@ static const struct reiserfs_key MAX_KEY = { the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this case we return a special key, either MIN_KEY or MAX_KEY. */ static inline const struct reiserfs_key *get_lkey(const struct treepath - *p_s_chk_path, + *chk_path, const struct super_block *sb) { - int n_position, n_path_offset = p_s_chk_path->path_length; - struct buffer_head *p_s_parent; + int n_position, n_path_offset = chk_path->path_length; + struct buffer_head *parent; RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET, "PAP-5010: invalid offset in the path"); @@ -257,42 +257,42 @@ static inline const struct reiserfs_key *get_lkey(const struct treepath while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { RFALSE(!buffer_uptodate - (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), + (PATH_OFFSET_PBUFFER(chk_path, n_path_offset)), "PAP-5020: parent is not uptodate"); /* Parent at the path is not in the tree now. */ if (!B_IS_IN_TREE - (p_s_parent = - PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset))) + (parent = + PATH_OFFSET_PBUFFER(chk_path, n_path_offset))) return &MAX_KEY; /* Check whether position in the parent is correct. */ if ((n_position = - PATH_OFFSET_POSITION(p_s_chk_path, + PATH_OFFSET_POSITION(chk_path, n_path_offset)) > - B_NR_ITEMS(p_s_parent)) + B_NR_ITEMS(parent)) return &MAX_KEY; /* Check whether parent at the path really points to the child. */ - if (B_N_CHILD_NUM(p_s_parent, n_position) != - PATH_OFFSET_PBUFFER(p_s_chk_path, + if (B_N_CHILD_NUM(parent, n_position) != + PATH_OFFSET_PBUFFER(chk_path, n_path_offset + 1)->b_blocknr) return &MAX_KEY; /* Return delimiting key if position in the parent is not equal to zero. */ if (n_position) - return B_N_PDELIM_KEY(p_s_parent, n_position - 1); + return B_N_PDELIM_KEY(parent, n_position - 1); } /* Return MIN_KEY if we are in the root of the buffer tree. */ - if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> b_blocknr == SB_ROOT_BLOCK(sb)) return &MIN_KEY; return &MAX_KEY; } /* Get delimiting key of the buffer at the path and its right neighbor. 
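To make the bisection described above concrete, here is a stand-alone version of the same contract over plain integers (illustrative names, not the reiserfs key comparison): it returns 1 together with the matching slot, or 0 with the index of the first larger element, which is exactly the insertion point a caller would use:

#include <stdio.h>

/* Simplified bin_search(): search 'num' elements of 'base' for 'key'.
 * On success *pos is the matching index; on failure *pos is the index
 * of the first element greater than 'key'. */
static int bin_search_int(int key, const int *base, int num, int *pos)
{
        int lbound = 0, rbound = num - 1, j;

        for (j = (lbound + rbound) / 2; lbound <= rbound;
             j = (lbound + rbound) / 2) {
                if (base[j] < key) {
                        lbound = j + 1;
                        continue;
                }
                if (base[j] > key) {
                        rbound = j - 1;
                        continue;
                }
                *pos = j;
                return 1;               /* found */
        }
        *pos = lbound;                  /* first element larger than key */
        return 0;                       /* not found */
}

int main(void)
{
        int items[] = { 10, 20, 30, 40 };
        int pos;

        printf("%d %d\n", bin_search_int(30, items, 4, &pos)); /* wrong arity? no: */
        return 0;
}

(Usage: bin_search_int(30, items, 4, &pos) yields 1 with pos == 2, while bin_search_int(25, items, 4, &pos) yields 0 with pos == 2, the slot where 25 would be inserted.)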
*/ -inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, +inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, const struct super_block *sb) { - int n_position, n_path_offset = p_s_chk_path->path_length; - struct buffer_head *p_s_parent; + int n_position, n_path_offset = chk_path->path_length; + struct buffer_head *parent; RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET, "PAP-5030: invalid offset in the path"); @@ -300,31 +300,31 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { RFALSE(!buffer_uptodate - (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), + (PATH_OFFSET_PBUFFER(chk_path, n_path_offset)), "PAP-5040: parent is not uptodate"); /* Parent at the path is not in the tree now. */ if (!B_IS_IN_TREE - (p_s_parent = - PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset))) + (parent = + PATH_OFFSET_PBUFFER(chk_path, n_path_offset))) return &MIN_KEY; /* Check whether position in the parent is correct. */ if ((n_position = - PATH_OFFSET_POSITION(p_s_chk_path, + PATH_OFFSET_POSITION(chk_path, n_path_offset)) > - B_NR_ITEMS(p_s_parent)) + B_NR_ITEMS(parent)) return &MIN_KEY; /* Check whether parent at the path really points to the child. */ - if (B_N_CHILD_NUM(p_s_parent, n_position) != - PATH_OFFSET_PBUFFER(p_s_chk_path, + if (B_N_CHILD_NUM(parent, n_position) != + PATH_OFFSET_PBUFFER(chk_path, n_path_offset + 1)->b_blocknr) return &MIN_KEY; /* Return delimiting key if position in the parent is not the last one. */ - if (n_position != B_NR_ITEMS(p_s_parent)) - return B_N_PDELIM_KEY(p_s_parent, n_position); + if (n_position != B_NR_ITEMS(parent)) + return B_N_PDELIM_KEY(parent, n_position); } /* Return MAX_KEY if we are in the root of the buffer tree. */ - if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> b_blocknr == SB_ROOT_BLOCK(sb)) return &MAX_KEY; return &MIN_KEY; @@ -335,25 +335,25 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, the path. These delimiting keys are stored at least one level above that buffer in the tree. If the buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ -static inline int key_in_buffer(struct treepath *p_s_chk_path, /* Path which should be checked. */ - const struct cpu_key *p_s_key, /* Key which should be checked. */ - struct super_block *sb /* Super block pointer. */ +static inline int key_in_buffer(struct treepath *chk_path, /* Path which should be checked. */ + const struct cpu_key *key, /* Key which should be checked. 
*/ + struct super_block *sb ) { - RFALSE(!p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET - || p_s_chk_path->path_length > MAX_HEIGHT, + RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET + || chk_path->path_length > MAX_HEIGHT, "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", - p_s_key, p_s_chk_path->path_length); - RFALSE(!PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev, + key, chk_path->path_length); + RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev, "PAP-5060: device must not be NODEV"); - if (comp_keys(get_lkey(p_s_chk_path, sb), p_s_key) == 1) + if (comp_keys(get_lkey(chk_path, sb), key) == 1) /* left delimiting key is bigger, that the key we look for */ return 0; - // if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, sb)) != -1 ) - if (comp_keys(get_rkey(p_s_chk_path, sb), p_s_key) != 1) - /* p_s_key must be less than right delimitiing key */ + /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */ + if (comp_keys(get_rkey(chk_path, sb), key) != 1) + /* key must be less than right delimitiing key */ return 0; return 1; } @@ -369,34 +369,34 @@ int reiserfs_check_path(struct treepath *p) * dirty bits clean when preparing the buffer for the log. * This version should only be called from fix_nodes() */ void pathrelse_and_restore(struct super_block *sb, - struct treepath *p_s_search_path) + struct treepath *search_path) { - int n_path_offset = p_s_search_path->path_length; + int n_path_offset = search_path->path_length; RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, "clm-4000: invalid path offset"); while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) { struct buffer_head *bh; - bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--); + bh = PATH_OFFSET_PBUFFER(search_path, n_path_offset--); reiserfs_restore_prepared_buffer(sb, bh); brelse(bh); } - p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; } /* Drop the reference to each buffer in a path */ -void pathrelse(struct treepath *p_s_search_path) +void pathrelse(struct treepath *search_path) { - int n_path_offset = p_s_search_path->path_length; + int n_path_offset = search_path->path_length; RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, "PAP-5090: invalid path offset"); while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) - brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); + brelse(PATH_OFFSET_PBUFFER(search_path, n_path_offset--)); - p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; } static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) @@ -547,9 +547,9 @@ static void search_by_key_reada(struct super_block *s, * Algorithm SearchByKey * * look for item in the Disk S+Tree by its key * * Input: sb - super block * - * p_s_key - pointer to the key to search * + * key - pointer to the key to search * * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * - * p_s_search_path - path from the root to the needed leaf * + * search_path - path from the root to the needed leaf * **************************************************************************/ /* This function fills up the path from the root to the leaf as it @@ -566,8 +566,8 @@ static void search_by_key_reada(struct super_block *s, correctness of the top of the path but need not be checked for the correctness of the bottom of the path */ /* The function is NOT SCHEDULE-SAFE! */ -int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key to search. 
*/ - struct treepath *p_s_search_path,/* This structure was +int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to search. */ + struct treepath *search_path,/* This structure was allocated and initialized by the calling function. It is filled up @@ -580,7 +580,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key b_blocknr_t n_block_number; int expected_level; struct buffer_head *bh; - struct path_element *p_s_last_element; + struct path_element *last_element; int n_node_level, n_retval; int right_neighbor_of_leaf_node; int fs_gen; @@ -598,7 +598,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key we must be careful to release all nodes in a path before we either discard the path struct or re-use the path struct, as we do here. */ - pathrelse(p_s_search_path); + pathrelse(search_path); right_neighbor_of_leaf_node = 0; @@ -615,18 +615,18 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key "%s: there were %d iterations of " "while loop looking for key %K", current->comm, n_repeat_counter, - p_s_key); + key); #endif /* prep path to have another element added to it. */ - p_s_last_element = - PATH_OFFSET_PELEMENT(p_s_search_path, - ++p_s_search_path->path_length); + last_element = + PATH_OFFSET_PELEMENT(search_path, + ++search_path->path_length); fs_gen = get_generation(sb); /* Read the next tree node, and set the last element in the path to have a pointer to it. */ - if ((bh = p_s_last_element->pe_buffer = + if ((bh = last_element->pe_buffer = sb_getblk(sb, n_block_number))) { if (!buffer_uptodate(bh) && reada_count > 1) search_by_key_reada(sb, reada_bh, @@ -637,8 +637,8 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key goto io_error; } else { io_error: - p_s_search_path->path_length--; - pathrelse(p_s_search_path); + search_path->path_length--; + pathrelse(search_path); return IO_ERROR; } reada_count = 0; @@ -652,12 +652,12 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key if (fs_changed(fs_gen, sb) && (!B_IS_IN_TREE(bh) || B_LEVEL(bh) != expected_level || - !key_in_buffer(p_s_search_path, p_s_key, sb))) { + !key_in_buffer(search_path, key, sb))) { PROC_INFO_INC(sb, search_by_key_fs_changed); PROC_INFO_INC(sb, search_by_key_restarted); PROC_INFO_INC(sb, sbk_restarted[expected_level - 1]); - pathrelse(p_s_search_path); + pathrelse(search_path); /* Get the root block number so that we can repeat the search starting from the root. */ @@ -669,11 +669,11 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key continue; } - /* only check that the key is in the buffer if p_s_key is not + /* only check that the key is in the buffer if key is not equal to the MAX_KEY. Latter case is only possible in "finish_unfinished()" processing during mount. */ - RFALSE(comp_keys(&MAX_KEY, p_s_key) && - !key_in_buffer(p_s_search_path, p_s_key, sb), + RFALSE(comp_keys(&MAX_KEY, key) && + !key_in_buffer(search_path, key, sb), "PAP-5130: key is not in the buffer"); #ifdef CONFIG_REISERFS_CHECK if (cur_tb) { @@ -689,7 +689,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key reiserfs_error(sb, "vs-5150", "invalid format found in block %ld. 
" "Fsck?", bh->b_blocknr); - pathrelse(p_s_search_path); + pathrelse(search_path); return IO_ERROR; } @@ -702,12 +702,12 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key "vs-5152: tree level (%d) is less than stop level (%d)", n_node_level, n_stop_level); - n_retval = bin_search(p_s_key, B_N_PITEM_HEAD(bh, 0), + n_retval = bin_search(key, B_N_PITEM_HEAD(bh, 0), B_NR_ITEMS(bh), (n_node_level == DISK_LEAF_NODE_LEVEL) ? IH_SIZE : KEY_SIZE, - &(p_s_last_element->pe_position)); + &(last_element->pe_position)); if (n_node_level == n_stop_level) { return n_retval; } @@ -715,7 +715,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key /* we are not in the stop level */ if (n_retval == ITEM_FOUND) /* item has been found, so we choose the pointer which is to the right of the found one */ - p_s_last_element->pe_position++; + last_element->pe_position++; /* if item was not found we choose the position which is to the left of the found item. This requires no code, @@ -725,23 +725,23 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key an internal node. Now we calculate child block number by position in the node. */ n_block_number = - B_N_CHILD_NUM(bh, p_s_last_element->pe_position); + B_N_CHILD_NUM(bh, last_element->pe_position); /* if we are going to read leaf nodes, try for read ahead as well */ - if ((p_s_search_path->reada & PATH_READA) && + if ((search_path->reada & PATH_READA) && n_node_level == DISK_LEAF_NODE_LEVEL + 1) { - int pos = p_s_last_element->pe_position; + int pos = last_element->pe_position; int limit = B_NR_ITEMS(bh); struct reiserfs_key *le_key; - if (p_s_search_path->reada & PATH_READA_BACK) + if (search_path->reada & PATH_READA_BACK) limit = 0; while (reada_count < SEARCH_BY_KEY_READA) { if (pos == limit) break; reada_blocks[reada_count++] = B_N_CHILD_NUM(bh, pos); - if (p_s_search_path->reada & PATH_READA_BACK) + if (search_path->reada & PATH_READA_BACK) pos--; else pos++; @@ -751,7 +751,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key */ le_key = B_N_PDELIM_KEY(bh, pos); if (le32_to_cpu(le_key->k_objectid) != - p_s_key->on_disk_key.k_objectid) { + key->on_disk_key.k_objectid) { break; } } @@ -760,11 +760,11 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key } /* Form the path to an item and position in this item which contains - file byte defined by p_s_key. If there is no such item + file byte defined by key. If there is no such item corresponding to the key, we point the path to the item with - maximal key less than p_s_key, and *p_n_pos_in_item is set to one + maximal key less than key, and *pos_in_item is set to one past the last entry/byte in the item. If searching for entry in a - directory item, and it is not found, *p_n_pos_in_item is set to one + directory item, and it is not found, *pos_in_item is set to one entry more than the entry with maximal key which is less than the sought key. @@ -777,7 +777,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *p_s_key, /* Key /* The function is NOT SCHEDULE-SAFE! */ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super block. */ const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */ - struct treepath *p_s_search_path /* Filled up by this function. */ + struct treepath *search_path /* Filled up by this function. 
*/ ) { struct item_head *p_le_ih; /* pointer to on-disk structure */ @@ -788,34 +788,34 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b /* If searching for directory entry. */ if (is_direntry_cpu_key(p_cpu_key)) - return search_by_entry_key(sb, p_cpu_key, p_s_search_path, + return search_by_entry_key(sb, p_cpu_key, search_path, &de); /* If not searching for directory entry. */ /* If item is found. */ - retval = search_item(sb, p_cpu_key, p_s_search_path); + retval = search_item(sb, p_cpu_key, search_path); if (retval == IO_ERROR) return retval; if (retval == ITEM_FOUND) { RFALSE(!ih_item_len (B_N_PITEM_HEAD - (PATH_PLAST_BUFFER(p_s_search_path), - PATH_LAST_POSITION(p_s_search_path))), + (PATH_PLAST_BUFFER(search_path), + PATH_LAST_POSITION(search_path))), "PAP-5165: item length equals zero"); - pos_in_item(p_s_search_path) = 0; + pos_in_item(search_path) = 0; return POSITION_FOUND; } - RFALSE(!PATH_LAST_POSITION(p_s_search_path), + RFALSE(!PATH_LAST_POSITION(search_path), "PAP-5170: position equals zero"); /* Item is not found. Set path to the previous item. */ p_le_ih = - B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), - --PATH_LAST_POSITION(p_s_search_path)); + B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path), + --PATH_LAST_POSITION(search_path)); n_blk_size = sb->s_blocksize; if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) { @@ -829,9 +829,9 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b /* Needed byte is contained in the item pointed to by the path. */ if (item_offset <= offset && item_offset + op_bytes_number(p_le_ih, n_blk_size) > offset) { - pos_in_item(p_s_search_path) = offset - item_offset; + pos_in_item(search_path) = offset - item_offset; if (is_indirect_le_ih(p_le_ih)) { - pos_in_item(p_s_search_path) /= n_blk_size; + pos_in_item(search_path) /= n_blk_size; } return POSITION_FOUND; } @@ -839,18 +839,18 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b /* Needed byte is not contained in the item pointed to by the path. Set pos_in_item out of the item. */ if (is_indirect_le_ih(p_le_ih)) - pos_in_item(p_s_search_path) = + pos_in_item(search_path) = ih_item_len(p_le_ih) / UNFM_P_SIZE; else - pos_in_item(p_s_search_path) = ih_item_len(p_le_ih); + pos_in_item(search_path) = ih_item_len(p_le_ih); return POSITION_NOT_FOUND; } /* Compare given item and item pointed to by the path. */ -int comp_items(const struct item_head *stored_ih, const struct treepath *p_s_path) +int comp_items(const struct item_head *stored_ih, const struct treepath *path) { - struct buffer_head *bh = PATH_PLAST_BUFFER(p_s_path); + struct buffer_head *bh = PATH_PLAST_BUFFER(path); struct item_head *ih; /* Last buffer at the path is not in the tree. */ @@ -858,11 +858,11 @@ int comp_items(const struct item_head *stored_ih, const struct treepath *p_s_pat return 1; /* Last path position is invalid. */ - if (PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(bh)) + if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh)) return 1; /* we need only to know, whether it is the same item */ - ih = get_ih(p_s_path); + ih = get_ih(path); return memcmp(stored_ih, ih, IH_SIZE); } @@ -951,14 +951,14 @@ static inline int prepare_for_direntry_item(struct treepath *path, In case of file truncate calculate whether this item must be deleted/truncated or last unformatted node of this item will be converted to a direct item. This function returns a determination of what balance mode the calling function should employ. 
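prepare_for_delete_or_cut(), which follows, boils down to picking a balance mode from the new file length; the toy decision below (made-up enum values and a simplified 512-byte tail threshold, not the kernel's exact policy) captures the delete/cut/convert split described above:

#include <stdio.h>

enum toy_mode { TOY_NOTHING, TOY_CUT, TOY_DELETE, TOY_CONVERT };

/* Toy version of the decision: given the first and last file offsets an
 * item covers and the new file length, decide whether the whole item goes
 * away, is shortened, or (for a short indirect tail) should be converted
 * to a direct item.  Directory items, quota and journal bookkeeping are
 * ignored here. */
static enum toy_mode toy_balance_mode(unsigned long long item_first,
                                      unsigned long long item_last,
                                      unsigned long long new_len,
                                      int indirect,
                                      unsigned long long tail_bytes)
{
        if (new_len < item_first)
                return TOY_DELETE;      /* item lies entirely past new EOF */

        if (new_len <= item_last) {
                /* item straddles the new EOF: usually cut it, but a small
                 * remaining tail of an indirect item may be packed into a
                 * direct item instead (the M_CONVERT case above) */
                if (indirect && tail_bytes && tail_bytes < 512)
                        return TOY_CONVERT;     /* 512 is illustrative */
                return TOY_CUT;
        }
        return TOY_NOTHING;             /* item untouched by the truncate */
}

int main(void)
{
        /* an item covering bytes 4097..8192, truncated to 4100 bytes:
         * 4 tail bytes remain, so the toy rule says convert (prints 3) */
        printf("%d\n", toy_balance_mode(4097, 8192, 4100, 1, 4));
        return 0;
}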
*/ -static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *p_s_path, const struct cpu_key *p_s_item_key, int *p_n_removed, /* Number of unformatted nodes which were removed +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed, /* Number of unformatted nodes which were removed from end of the file. */ - int *p_n_cut_size, unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */ + int *cut_size, unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */ ) { struct super_block *sb = inode->i_sb; - struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_path); - struct buffer_head *bh = PATH_PLAST_BUFFER(p_s_path); + struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + struct buffer_head *bh = PATH_PLAST_BUFFER(path); BUG_ON(!th->t_trans_id); @@ -968,20 +968,20 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st RFALSE(n_new_file_length != max_reiserfs_offset(inode), "PAP-5210: mode must be M_DELETE"); - *p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); + *cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); return M_DELETE; } /* Directory item. */ if (is_direntry_le_ih(p_le_ih)) - return prepare_for_direntry_item(p_s_path, p_le_ih, inode, + return prepare_for_direntry_item(path, p_le_ih, inode, n_new_file_length, - p_n_cut_size); + cut_size); /* Direct item. */ if (is_direct_le_ih(p_le_ih)) - return prepare_for_direct_item(p_s_path, p_le_ih, inode, - n_new_file_length, p_n_cut_size); + return prepare_for_direct_item(path, p_le_ih, inode, + n_new_file_length, cut_size); /* Case of an indirect item. */ { @@ -1001,9 +1001,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st do { need_re_search = 0; - *p_n_cut_size = 0; - bh = PATH_PLAST_BUFFER(p_s_path); - copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + *cut_size = 0; + bh = PATH_PLAST_BUFFER(path); + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); pos = I_UNFM_NUM(&s_ih); while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > n_new_file_length) { @@ -1013,10 +1013,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st /* Each unformatted block deletion may involve one additional * bitmap block into the transaction, thereby the initial * journal space reservation might not be enough. 
*/ - if (!delete && (*p_n_cut_size) != 0 && - reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { + if (!delete && (*cut_size) != 0 && + reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) break; - } unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1; block = get_block_num(unfm, 0); @@ -1030,17 +1029,17 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st cond_resched(); - if (item_moved (&s_ih, p_s_path)) { + if (item_moved (&s_ih, path)) { need_re_search = 1; break; } pos --; - (*p_n_removed) ++; - (*p_n_cut_size) -= UNFM_P_SIZE; + (*removed)++; + (*cut_size) -= UNFM_P_SIZE; if (pos == 0) { - (*p_n_cut_size) -= IH_SIZE; + (*cut_size) -= IH_SIZE; result = M_DELETE; break; } @@ -1050,10 +1049,10 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st ** buffer */ reiserfs_restore_prepared_buffer(sb, bh); } while (need_re_search && - search_for_position_by_key(sb, p_s_item_key, p_s_path) == POSITION_FOUND); - pos_in_item(p_s_path) = pos * UNFM_P_SIZE; + search_for_position_by_key(sb, item_key, path) == POSITION_FOUND); + pos_in_item(path) = pos * UNFM_P_SIZE; - if (*p_n_cut_size == 0) { + if (*cut_size == 0) { /* Nothing were cut. maybe convert last unformatted node to the * direct item? */ result = M_CONVERT; @@ -1091,7 +1090,7 @@ static int calc_deleted_bytes_number(struct tree_balance *tb, char c_mode) static void init_tb_struct(struct reiserfs_transaction_handle *th, struct tree_balance *tb, struct super_block *sb, - struct treepath *p_s_path, int n_size) + struct treepath *path, int n_size) { BUG_ON(!th->t_trans_id); @@ -1099,9 +1098,9 @@ static void init_tb_struct(struct reiserfs_transaction_handle *th, memset(tb, '\0', sizeof(struct tree_balance)); tb->transaction_handle = th; tb->tb_sb = sb; - tb->tb_path = p_s_path; - PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; - PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; + tb->tb_path = path; + PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; + PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; tb->insert_size[0] = n_size; } @@ -1141,13 +1140,17 @@ char head2type(struct item_head *ih) } #endif -/* Delete object item. */ -int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path, /* Path to the deleted item. */ - const struct cpu_key *p_s_item_key, /* Key to search for the deleted item. */ - struct inode *inode, /* inode is here just to update - * i_blocks and quotas */ - struct buffer_head *p_s_un_bh) -{ /* NULL or unformatted node pointer. */ +/* Delete object item. 
+ * th - active transaction handle + * path - path to the deleted item + * item_key - key to search for the deleted item + * indode - used for updating i_blocks and quotas + * un_bh - NULL or unformatted node pointer + */ +int reiserfs_delete_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *item_key, + struct inode *inode, struct buffer_head *un_bh) +{ struct super_block *sb = inode->i_sb; struct tree_balance s_del_balance; struct item_head s_ih; @@ -1162,7 +1165,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath BUG_ON(!th->t_trans_id); - init_tb_struct(th, &s_del_balance, sb, p_s_path, + init_tb_struct(th, &s_del_balance, sb, path, 0 /*size is unknown */ ); while (1) { @@ -1172,14 +1175,14 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath n_iter++; c_mode = #endif - prepare_for_delete_or_cut(th, inode, p_s_path, - p_s_item_key, &n_removed, + prepare_for_delete_or_cut(th, inode, path, + item_key, &n_removed, &n_del_size, max_reiserfs_offset(inode)); RFALSE(c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); - copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); s_del_balance.insert_size[0] = n_del_size; n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); @@ -1190,13 +1193,13 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath // file system changed, repeat search n_ret_value = - search_for_position_by_key(sb, p_s_item_key, p_s_path); + search_for_position_by_key(sb, item_key, path); if (n_ret_value == IO_ERROR) break; if (n_ret_value == FILE_NOT_FOUND) { reiserfs_warning(sb, "vs-5340", "no items of the file %K found", - p_s_item_key); + item_key); break; } } /* while (1) */ @@ -1207,7 +1210,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath } // reiserfs_delete_item returns item length when success n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); - q_ih = get_ih(p_s_path); + q_ih = get_ih(path); quota_cut_bytes = ih_item_len(q_ih); /* hack so the quota code doesn't have to guess if the file @@ -1224,7 +1227,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath } } - if (p_s_un_bh) { + if (un_bh) { int off; char *data; @@ -1242,16 +1245,16 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath ** The unformatted node must be dirtied later on. We can't be ** sure here if the entire tail has been deleted yet. ** - ** p_s_un_bh is from the page cache (all unformatted nodes are + ** un_bh is from the page cache (all unformatted nodes are ** from the page cache) and might be a highmem page. So, we - ** can't use p_s_un_bh->b_data. + ** can't use un_bh->b_data. 
** -clm */ - data = kmap_atomic(p_s_un_bh->b_page, KM_USER0); + data = kmap_atomic(un_bh->b_page, KM_USER0); off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); memcpy(data + off, - B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), + B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih), n_ret_value); kunmap_atomic(data, KM_USER0); } @@ -1427,9 +1430,9 @@ static void unmap_buffers(struct page *page, loff_t pos) static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, struct inode *inode, struct page *page, - struct treepath *p_s_path, - const struct cpu_key *p_s_item_key, - loff_t n_new_file_size, char *p_c_mode) + struct treepath *path, + const struct cpu_key *item_key, + loff_t n_new_file_size, char *mode) { struct super_block *sb = inode->i_sb; int n_block_size = sb->s_blocksize; @@ -1445,17 +1448,17 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, !tail_has_to_be_packed(inode) || !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) { /* leave tail in an unformatted node */ - *p_c_mode = M_SKIP_BALANCING; + *mode = M_SKIP_BALANCING; cut_bytes = n_block_size - (n_new_file_size & (n_block_size - 1)); - pathrelse(p_s_path); + pathrelse(path); return cut_bytes; } - /* Permorm the conversion to a direct_item. */ - /* return indirect_to_direct(inode, p_s_path, p_s_item_key, - n_new_file_size, p_c_mode); */ - return indirect2direct(th, inode, page, p_s_path, p_s_item_key, - n_new_file_size, p_c_mode); + /* Perform the conversion to a direct_item. */ + /* return indirect_to_direct(inode, path, item_key, + n_new_file_size, mode); */ + return indirect2direct(th, inode, page, path, item_key, + n_new_file_size, mode); } /* we did indirect_to_direct conversion. And we have inserted direct @@ -1506,8 +1509,8 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, - struct treepath *p_s_path, - struct cpu_key *p_s_item_key, + struct treepath *path, + struct cpu_key *item_key, struct inode *inode, struct page *page, loff_t n_new_file_size) { @@ -1528,7 +1531,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - init_tb_struct(th, &s_cut_balance, inode->i_sb, p_s_path, + init_tb_struct(th, &s_cut_balance, inode->i_sb, path, n_cut_size); /* Repeat this loop until we either cut the item without needing @@ -1540,8 +1543,8 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, pointers. */ c_mode = - prepare_for_delete_or_cut(th, inode, p_s_path, - p_s_item_key, &n_removed, + prepare_for_delete_or_cut(th, inode, path, + item_key, &n_removed, &n_cut_size, n_new_file_size); if (c_mode == M_CONVERT) { /* convert last unformatted node to direct item or leave @@ -1551,7 +1554,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, n_ret_value = maybe_indirect_to_direct(th, inode, page, - p_s_path, p_s_item_key, + path, item_key, n_new_file_size, &c_mode); if (c_mode == M_SKIP_BALANCING) /* tail has been left in the unformatted node */ @@ -1568,26 +1571,26 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, inserting the new direct item. Now we are removing the last unformatted node pointer. Set key to search for it. 
*/ - set_cpu_key_k_type(p_s_item_key, TYPE_INDIRECT); - p_s_item_key->key_length = 4; + set_cpu_key_k_type(item_key, TYPE_INDIRECT); + item_key->key_length = 4; n_new_file_size -= (n_new_file_size & (sb->s_blocksize - 1)); tail_pos = n_new_file_size; - set_cpu_key_k_offset(p_s_item_key, n_new_file_size + 1); + set_cpu_key_k_offset(item_key, n_new_file_size + 1); if (search_for_position_by_key - (sb, p_s_item_key, - p_s_path) == POSITION_NOT_FOUND) { - print_block(PATH_PLAST_BUFFER(p_s_path), 3, - PATH_LAST_POSITION(p_s_path) - 1, - PATH_LAST_POSITION(p_s_path) + 1); + (sb, item_key, + path) == POSITION_NOT_FOUND) { + print_block(PATH_PLAST_BUFFER(path), 3, + PATH_LAST_POSITION(path) - 1, + PATH_LAST_POSITION(path) + 1); reiserfs_panic(sb, "PAP-5580", "item to " "convert does not exist (%K)", - p_s_item_key); + item_key); } continue; } if (n_cut_size == 0) { - pathrelse(p_s_path); + pathrelse(path); return 0; } @@ -1600,12 +1603,12 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, PROC_INFO_INC(sb, cut_from_item_restarted); n_ret_value = - search_for_position_by_key(sb, p_s_item_key, p_s_path); + search_for_position_by_key(sb, item_key, path); if (n_ret_value == POSITION_FOUND) continue; reiserfs_warning(sb, "PAP-5610", "item %K not found", - p_s_item_key); + item_key); unfix_nodes(&s_cut_balance); return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT; } /* while */ @@ -1615,7 +1618,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, if (n_is_inode_locked) { // FIXME: this seems to be not needed: we are always able // to cut item - indirect_to_direct_roll_back(th, inode, p_s_path); + indirect_to_direct_roll_back(th, inode, path); } if (n_ret_value == NO_DISK_SPACE) reiserfs_warning(sb, "reiserfs-5092", @@ -1631,7 +1634,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, /* Calculate number of bytes that need to be cut from the item. */ quota_cut_bytes = (c_mode == - M_DELETE) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance. + M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance. insert_size[0]; if (retval2 == -1) n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode); @@ -1878,7 +1881,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, #ifdef CONFIG_REISERFS_CHECK // this makes sure, that we __append__, not overwrite or add holes static void check_research_for_paste(struct treepath *path, - const struct cpu_key *p_s_key) + const struct cpu_key *key) { struct item_head *found_ih = get_ih(path); @@ -1886,35 +1889,35 @@ static void check_research_for_paste(struct treepath *path, if (le_ih_k_offset(found_ih) + op_bytes_number(found_ih, get_last_bh(path)->b_size) != - cpu_key_k_offset(p_s_key) + cpu_key_k_offset(key) || op_bytes_number(found_ih, get_last_bh(path)->b_size) != pos_in_item(path)) reiserfs_panic(NULL, "PAP-5720", "found direct item " "%h or position (%d) does not match " "to key %K", found_ih, - pos_in_item(path), p_s_key); + pos_in_item(path), key); } if (is_indirect_le_ih(found_ih)) { if (le_ih_k_offset(found_ih) + op_bytes_number(found_ih, get_last_bh(path)->b_size) != - cpu_key_k_offset(p_s_key) + cpu_key_k_offset(key) || I_UNFM_NUM(found_ih) != pos_in_item(path) || get_ih_free_space(found_ih) != 0) reiserfs_panic(NULL, "PAP-5730", "found indirect " "item (%h) or position (%d) does not " "match to key (%K)", - found_ih, pos_in_item(path), p_s_key); + found_ih, pos_in_item(path), key); } } #endif /* config reiserfs check */ /* Paste bytes to the existing item. 
Returns bytes number pasted into the item. */ -int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_search_path, /* Path to the pasted item. */ - const struct cpu_key *p_s_key, /* Key to search for the needed item. */ +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path, /* Path to the pasted item. */ + const struct cpu_key *key, /* Key to search for the needed item. */ struct inode *inode, /* Inode item belongs to */ - const char *p_c_body, /* Pointer to the bytes to paste. */ + const char *body, /* Pointer to the bytes to paste. */ int n_pasted_size) { /* Size of pasted bytes. */ struct tree_balance s_paste_balance; @@ -1929,17 +1932,17 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): allocating %u id=%u type=%c", n_pasted_size, inode->i_uid, - key2type(&(p_s_key->on_disk_key))); + key2type(&(key->on_disk_key))); #endif if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) { - pathrelse(p_s_search_path); + pathrelse(search_path); return -EDQUOT; } - init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, + init_tb_struct(th, &s_paste_balance, th->t_super, search_path, n_pasted_size); #ifdef DISPLACE_NEW_PACKING_LOCALITIES - s_paste_balance.key = p_s_key->on_disk_key; + s_paste_balance.key = key->on_disk_key; #endif /* DQUOT_* can schedule, must check before the fix_nodes */ @@ -1949,13 +1952,13 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, - p_c_body)) == REPEAT_SEARCH) { + body)) == REPEAT_SEARCH) { search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC(th->t_super, paste_into_item_restarted); retval = - search_for_position_by_key(th->t_super, p_s_key, - p_s_search_path); + search_for_position_by_key(th->t_super, key, + search_path); if (retval == IO_ERROR) { retval = -EIO; goto error_out; @@ -1963,19 +1966,19 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree if (retval == POSITION_FOUND) { reiserfs_warning(inode->i_sb, "PAP-5710", "entry or pasted byte (%K) exists", - p_s_key); + key); retval = -EEXIST; goto error_out; } #ifdef CONFIG_REISERFS_CHECK - check_research_for_paste(p_s_search_path, p_s_key); + check_research_for_paste(search_path, key); #endif } /* Perform balancing after all resources are collected by fix_nodes, and accessing them will not risk triggering schedule. */ if (retval == CARRY_ON) { - do_balance(&s_paste_balance, NULL /*ih */ , p_c_body, M_PASTE); + do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE); return 0; } retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; @@ -1986,17 +1989,23 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): freeing %u id=%u type=%c", n_pasted_size, inode->i_uid, - key2type(&(p_s_key->on_disk_key))); + key2type(&(key->on_disk_key))); #endif DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size); return retval; } -/* Insert new item into the buffer at the path. */ -int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path, /* Path to the inserteded item. */ - const struct cpu_key *key, struct item_head *p_s_ih, /* Pointer to the item header to insert. 
*/ - struct inode *inode, const char *p_c_body) -{ /* Pointer to the bytes to insert. */ +/* Insert new item into the buffer at the path. + * th - active transaction handle + * path - path to the inserted item + * ih - pointer to the item header to insert + * body - pointer to the bytes to insert + */ +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *key, + struct item_head *ih, struct inode *inode, + const char *body) +{ struct tree_balance s_ins_balance; int retval; int fs_gen = 0; @@ -2006,28 +2015,27 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath if (inode) { /* Do we count quotas for item? */ fs_gen = get_generation(inode->i_sb); - quota_bytes = ih_item_len(p_s_ih); + quota_bytes = ih_item_len(ih); /* hack so the quota code doesn't have to guess if the file has ** a tail, links are always tails, so there's no guessing needed */ - if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_s_ih)) { + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih)) quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; - } #ifdef REISERQUOTA_DEBUG reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota insert_item(): allocating %u id=%u type=%c", - quota_bytes, inode->i_uid, head2type(p_s_ih)); + quota_bytes, inode->i_uid, head2type(ih)); #endif /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) { - pathrelse(p_s_path); + pathrelse(path); return -EDQUOT; } } - init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, - IH_SIZE + ih_item_len(p_s_ih)); + init_tb_struct(th, &s_ins_balance, th->t_super, path, + IH_SIZE + ih_item_len(ih)); #ifdef DISPLACE_NEW_PACKING_LOCALITIES s_ins_balance.key = key->on_disk_key; #endif @@ -2037,12 +2045,12 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath } while ((retval = - fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, - p_c_body)) == REPEAT_SEARCH) { + fix_nodes(M_INSERT, &s_ins_balance, ih, + body)) == REPEAT_SEARCH) { search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC(th->t_super, insert_item_restarted); - retval = search_item(th->t_super, key, p_s_path); + retval = search_item(th->t_super, key, path); if (retval == IO_ERROR) { retval = -EIO; goto error_out; @@ -2058,7 +2066,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath /* make balancing after all resources will be collected at a time */ if (retval == CARRY_ON) { - do_balance(&s_ins_balance, p_s_ih, p_c_body, M_INSERT); + do_balance(&s_ins_balance, ih, body, M_INSERT); return 0; } @@ -2069,7 +2077,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath #ifdef REISERQUOTA_DEBUG reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", - quota_bytes, inode->i_uid, head2type(p_s_ih)); + quota_bytes, inode->i_uid, head2type(ih)); #endif if (inode) DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index 5c5ee0d0d6a8..2b90c0e5697c 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -172,10 +172,12 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) inode */ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *inode, struct page *page, - struct treepath *p_s_path, /* path to the indirect item. 
*/ - const struct cpu_key *p_s_item_key, /* Key to look for unformatted node pointer to be cut. */ + struct treepath *path, /* path to the indirect item. */ + const struct cpu_key *item_key, /* Key to look for + * unformatted node + * pointer to be cut. */ loff_t n_new_file_size, /* New file size. */ - char *p_c_mode) + char *mode) { struct super_block *sb = inode->i_sb; struct item_head s_ih; @@ -189,10 +191,10 @@ int indirect2direct(struct reiserfs_transaction_handle *th, REISERFS_SB(sb)->s_indirect2direct++; - *p_c_mode = M_SKIP_BALANCING; + *mode = M_SKIP_BALANCING; /* store item head path points to. */ - copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); tail_len = (n_new_file_size & (n_block_size - 1)); if (get_inode_sd_version(inode) == STAT_DATA_V2) @@ -211,14 +213,14 @@ int indirect2direct(struct reiserfs_transaction_handle *th, tail = (char *)kmap(page); /* this can schedule */ - if (path_changed(&s_ih, p_s_path)) { + if (path_changed(&s_ih, path)) { /* re-search indirect item */ - if (search_for_position_by_key(sb, p_s_item_key, p_s_path) + if (search_for_position_by_key(sb, item_key, path) == POSITION_NOT_FOUND) reiserfs_panic(sb, "PAP-5520", "item to be converted %K does not exist", - p_s_item_key); - copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + item_key); + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); #ifdef CONFIG_REISERFS_CHECK pos = le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - @@ -240,13 +242,13 @@ int indirect2direct(struct reiserfs_transaction_handle *th, */ tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); - PATH_LAST_POSITION(p_s_path)++; + PATH_LAST_POSITION(path)++; - key = *p_s_item_key; + key = *item_key; set_cpu_key_k_type(&key, TYPE_DIRECT); key.key_length = 4; /* Insert tail as new direct item in the tree */ - if (reiserfs_insert_item(th, p_s_path, &key, &s_ih, inode, + if (reiserfs_insert_item(th, path, &key, &s_ih, inode, tail ? tail : NULL) < 0) { /* No disk memory. So we can not convert last unformatted node to the direct item. In this case we used to adjust @@ -268,7 +270,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, /* We have inserted new direct item and must remove last unformatted node. */ - *p_c_mode = M_CUT; + *mode = M_CUT; /* we store position of first direct item in the in-core inode */ /* mark_file_with_tail (inode, pos1 + 1); */ diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index b72dc2095478..e711c796e9d1 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -694,9 +694,9 @@ static inline void cpu_key_k_offset_dec(struct cpu_key *key) #define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key))) #define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key))) -#define I_K_KEY_IN_ITEM(p_s_ih, p_s_key, n_blocksize) \ - ( ! COMP_SHORT_KEYS(p_s_ih, p_s_key) && \ - I_OFF_BYTE_IN_ITEM(p_s_ih, k_offset (p_s_key), n_blocksize) ) +#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \ + (!COMP_SHORT_KEYS(ih, key) && \ + I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize)) /* maximal length of item */ #define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE) @@ -1196,33 +1196,33 @@ struct treepath { struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} /* Get path element by path and path position. 
*/ -#define PATH_OFFSET_PELEMENT(p_s_path,n_offset) ((p_s_path)->path_elements +(n_offset)) +#define PATH_OFFSET_PELEMENT(path, n_offset) ((path)->path_elements + (n_offset)) /* Get buffer header at the path by path and path position. */ -#define PATH_OFFSET_PBUFFER(p_s_path,n_offset) (PATH_OFFSET_PELEMENT(p_s_path,n_offset)->pe_buffer) +#define PATH_OFFSET_PBUFFER(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer) /* Get position in the element at the path by path and path position. */ -#define PATH_OFFSET_POSITION(p_s_path,n_offset) (PATH_OFFSET_PELEMENT(p_s_path,n_offset)->pe_position) +#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position) -#define PATH_PLAST_BUFFER(p_s_path) (PATH_OFFSET_PBUFFER((p_s_path), (p_s_path)->path_length)) +#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length)) /* you know, to the person who didn't write this the macro name does not at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or maybe we should just focus on dumping paths... -Hans */ -#define PATH_LAST_POSITION(p_s_path) (PATH_OFFSET_POSITION((p_s_path), (p_s_path)->path_length)) +#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length)) -#define PATH_PITEM_HEAD(p_s_path) B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_path),PATH_LAST_POSITION(p_s_path)) +#define PATH_PITEM_HEAD(path) B_N_PITEM_HEAD(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)) /* in do_balance leaf has h == 0 in contrast with path structure, where root has level == 0. That is why we need these defines */ -#define PATH_H_PBUFFER(p_s_path, h) PATH_OFFSET_PBUFFER (p_s_path, p_s_path->path_length - (h)) /* tb->S[h] */ +#define PATH_H_PBUFFER(path, h) PATH_OFFSET_PBUFFER (path, path->path_length - (h)) /* tb->S[h] */ #define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1) /* tb->F[h] or tb->S[0]->b_parent */ #define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h)) #define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) /* tb->S[h]->b_item_order */ -#define PATH_H_PATH_OFFSET(p_s_path, n_h) ((p_s_path)->path_length - (n_h)) +#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h)) #define get_last_bh(path) PATH_PLAST_BUFFER(path) #define get_ih(path) PATH_PITEM_HEAD(path) @@ -1512,7 +1512,7 @@ extern struct item_operations *item_ops[TYPE_ANY + 1]; #define COMP_SHORT_KEYS comp_short_keys /* number of blocks pointed to by the indirect item */ -#define I_UNFM_NUM(p_s_ih) ( ih_item_len(p_s_ih) / UNFM_P_SIZE ) +#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE) /* the used space within the unformatted node corresponding to pos within the item pointed to by ih */ #define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? 
(size) - ih_free_space(ih) : (size)) @@ -1793,8 +1793,8 @@ int reiserfs_convert_objectid_map_v1(struct super_block *); /* stree.c */ int B_IS_IN_TREE(const struct buffer_head *); -extern void copy_item_head(struct item_head *p_v_to, - const struct item_head *p_v_from); +extern void copy_item_head(struct item_head *to, + const struct item_head *from); // first key is in cpu form, second - le extern int comp_short_keys(const struct reiserfs_key *le_key, @@ -1829,20 +1829,20 @@ static inline void copy_key(struct reiserfs_key *to, memcpy(to, from, KEY_SIZE); } -int comp_items(const struct item_head *stored_ih, const struct treepath *p_s_path); -const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, +int comp_items(const struct item_head *stored_ih, const struct treepath *path); +const struct reiserfs_key *get_rkey(const struct treepath *chk_path, const struct super_block *sb); int search_by_key(struct super_block *, const struct cpu_key *, struct treepath *, int); #define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL) int search_for_position_by_key(struct super_block *sb, - const struct cpu_key *p_s_cpu_key, - struct treepath *p_s_search_path); + const struct cpu_key *cpu_key, + struct treepath *search_path); extern void decrement_bcount(struct buffer_head *bh); -void decrement_counters_in_path(struct treepath *p_s_search_path); -void pathrelse(struct treepath *p_s_search_path); +void decrement_counters_in_path(struct treepath *search_path); +void pathrelse(struct treepath *search_path); int reiserfs_check_path(struct treepath *p); -void pathrelse_and_restore(struct super_block *s, struct treepath *p_s_search_path); +void pathrelse_and_restore(struct super_block *s, struct treepath *search_path); int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath *path, @@ -1865,7 +1865,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath *path, const struct cpu_key *key, - struct inode *inode, struct buffer_head *p_s_un_bh); + struct inode *inode, struct buffer_head *un_bh); void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, struct inode *inode, struct reiserfs_key *key); @@ -2005,7 +2005,7 @@ extern const struct address_space_operations reiserfs_address_space_operations; /* fix_nodes.c */ int fix_nodes(int n_op_mode, struct tree_balance *tb, - struct item_head *p_s_ins_ih, const void *); + struct item_head *ins_ih, const void *); void unfix_nodes(struct tree_balance *); /* prints.c */ -- cgit v1.2.3-71-gd317 From e7a19c5624c66afa8118b10cd59f87ee407646bc Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 30 Mar 2009 21:46:44 +0200 Subject: dmi: Let dmi_walk() users pass private data At the moment, dmi_walk() lacks flexibility, users can't pass data to the callback function. Add a pointer for private data to make this function more flexible. 
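As a rough usage sketch (illustrative only, not part of this patch: the token structure and callback name below are invented, while dmi_walk(), struct dmi_header and the 0/-1 return convention come from the change itself), a caller can now thread its own state through the table walk instead of keeping it in file-scope variables:

#include <linux/dmi.h>
#include <linux/kernel.h>

struct dmi_type_count {
	u8 type;		/* DMI record type to look for */
	unsigned int count;	/* number of matching records seen */
};

/* Called once per DMI record; private_data is the pointer handed to dmi_walk(). */
static void count_dmi_type(const struct dmi_header *dm, void *private_data)
{
	struct dmi_type_count *tc = private_data;

	if (dm->type == tc->type)
		tc->count++;
}

static int example_count_memory_devices(void)
{
	struct dmi_type_count tc = { .type = 17, .count = 0 };	/* type 17: Memory Device */

	if (dmi_walk(count_dmi_type, &tc))
		return -ENODEV;		/* DMI table could not be reached */

	pr_info("found %u DMI type-%u records\n", tc.count, tc.type);
	return 0;
}

The in-tree users converted below (fschmd, dell-laptop, hpwdt) keep their state global for now and simply pass NULL as the new argument.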
Signed-off-by: Jean Delvare Cc: Hans de Goede Cc: Matthew Garrett Cc: Roland Dreier --- drivers/firmware/dmi_scan.c | 18 +++++++++++------- drivers/hwmon/fschmd.c | 4 ++-- drivers/platform/x86/dell-laptop.c | 4 ++-- drivers/watchdog/hpwdt.c | 4 ++-- include/linux/dmi.h | 7 ++++--- 5 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 8f0f7c449305..5f1b5400d96a 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -68,7 +68,8 @@ static char * __init dmi_string(const struct dmi_header *dm, u8 s) * pointing to completely the wrong place for example */ static void dmi_table(u8 *buf, int len, int num, - void (*decode)(const struct dmi_header *)) + void (*decode)(const struct dmi_header *, void *), + void *private_data) { u8 *data = buf; int i = 0; @@ -89,7 +90,7 @@ static void dmi_table(u8 *buf, int len, int num, while ((data - buf < len - 1) && (data[0] || data[1])) data++; if (data - buf < len - 1) - decode(dm); + decode(dm, private_data); data += 2; i++; } @@ -99,7 +100,8 @@ static u32 dmi_base; static u16 dmi_len; static u16 dmi_num; -static int __init dmi_walk_early(void (*decode)(const struct dmi_header *)) +static int __init dmi_walk_early(void (*decode)(const struct dmi_header *, + void *)) { u8 *buf; @@ -107,7 +109,7 @@ static int __init dmi_walk_early(void (*decode)(const struct dmi_header *)) if (buf == NULL) return -1; - dmi_table(buf, dmi_len, dmi_num, decode); + dmi_table(buf, dmi_len, dmi_num, decode, NULL); dmi_iounmap(buf, dmi_len); return 0; @@ -295,7 +297,7 @@ static void __init dmi_save_extended_devices(const struct dmi_header *dm) * and machine entries. For 2.5 we should pull the smbus controller info * out of here. */ -static void __init dmi_decode(const struct dmi_header *dm) +static void __init dmi_decode(const struct dmi_header *dm, void *dummy) { switch(dm->type) { case 0: /* BIOS Information */ @@ -598,10 +600,12 @@ int dmi_get_year(int field) /** * dmi_walk - Walk the DMI table and get called back for every record * @decode: Callback function + * @private_data: Private data to be passed to the callback function * * Returns -1 when the DMI table can't be reached, 0 on success. */ -int dmi_walk(void (*decode)(const struct dmi_header *)) +int dmi_walk(void (*decode)(const struct dmi_header *, void *), + void *private_data) { u8 *buf; @@ -612,7 +616,7 @@ int dmi_walk(void (*decode)(const struct dmi_header *)) if (buf == NULL) return -1; - dmi_table(buf, dmi_len, dmi_num, decode); + dmi_table(buf, dmi_len, dmi_num, decode, private_data); iounmap(buf); return 0; diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c index d07f4ef75092..b557f2ebd9ae 100644 --- a/drivers/hwmon/fschmd.c +++ b/drivers/hwmon/fschmd.c @@ -856,7 +856,7 @@ static struct file_operations watchdog_fops = { /* DMI decode routine to read voltage scaling factors from special DMI tables, which are available on FSC machines with an fscher or later chip. 
*/ -static void fschmd_dmi_decode(const struct dmi_header *header) +static void fschmd_dmi_decode(const struct dmi_header *header, void *dummy) { int i, mult[3] = { 0 }, offset[3] = { 0 }, vref = 0, found = 0; @@ -991,7 +991,7 @@ static int fschmd_probe(struct i2c_client *client, /* Read the special DMI table for fscher and newer chips */ if ((kind == fscher || kind >= fschrc) && dmi_vref == -1) { - dmi_walk(fschmd_dmi_decode); + dmi_walk(fschmd_dmi_decode, NULL); if (dmi_vref == -1) { dev_warn(&client->dev, "Couldn't get voltage scaling factors from " diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c index 16e11c2ee19a..af9f43021172 100644 --- a/drivers/platform/x86/dell-laptop.c +++ b/drivers/platform/x86/dell-laptop.c @@ -103,7 +103,7 @@ static void parse_da_table(const struct dmi_header *dm) da_num_tokens += tokens; } -static void find_tokens(const struct dmi_header *dm) +static void find_tokens(const struct dmi_header *dm, void *dummy) { switch (dm->type) { case 0xd4: /* Indexed IO */ @@ -356,7 +356,7 @@ static int __init dell_init(void) if (!dmi_check_system(dell_device_table)) return -ENODEV; - dmi_walk(find_tokens); + dmi_walk(find_tokens, NULL); if (!da_tokens) { printk(KERN_INFO "dell-laptop: Unable to find dmi tokens\n"); diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 6cf155d6b350..3137361ccbfe 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -380,7 +380,7 @@ asm(".text \n\t" * This function checks whether or not a SMBIOS/DMI record is * the 64bit CRU info or not */ -static void __devinit dmi_find_cru(const struct dmi_header *dm) +static void __devinit dmi_find_cru(const struct dmi_header *dm, void *dummy) { struct smbios_cru64_info *smbios_cru64_ptr; unsigned long cru_physical_address; @@ -403,7 +403,7 @@ static int __devinit detect_cru_service(void) { cru_rom_addr = NULL; - dmi_walk(dmi_find_cru); + dmi_walk(dmi_find_cru, NULL); /* if cru_rom_addr has been set then we found a CRU service */ return ((cru_rom_addr != NULL) ? 0 : -ENODEV); diff --git a/include/linux/dmi.h b/include/linux/dmi.h index d741b9ceb0e0..bb5489c82c99 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -47,7 +47,8 @@ extern int dmi_get_year(int field); extern int dmi_name_in_vendors(const char *str); extern int dmi_name_in_serial(const char *str); extern int dmi_available; -extern int dmi_walk(void (*decode)(const struct dmi_header *)); +extern int dmi_walk(void (*decode)(const struct dmi_header *, void *), + void *private_data); extern bool dmi_match(enum dmi_field f, const char *str); #else @@ -61,8 +62,8 @@ static inline int dmi_get_year(int year) { return 0; } static inline int dmi_name_in_vendors(const char *s) { return 0; } static inline int dmi_name_in_serial(const char *s) { return 0; } #define dmi_available 0 -static inline int dmi_walk(void (*decode)(const struct dmi_header *)) - { return -1; } +static inline int dmi_walk(void (*decode)(const struct dmi_header *, void *), + void *private_data) { return -1; } static inline bool dmi_match(enum dmi_field f, const char *str) { return false; } static inline const struct dmi_system_id * -- cgit v1.2.3-71-gd317 From 0a0c5168df270a50e3518e4f12bddb31f8f5f38f Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 16 Mar 2009 22:33:49 +0100 Subject: PM: Introduce functions for suspending and resuming device interrupts Introduce helper functions allowing us to prevent device drivers from getting any interrupts (without disabling interrupts on the CPU) during suspend (or hibernation) and to make them start to receive interrupts again during the subsequent resume. These functions make it possible to keep timer interrupts enabled while the "late" suspend and "early" resume callbacks provided by device drivers are being executed. In turn, this allows device drivers' "late" suspend and "early" resume callbacks to sleep, execute ACPI callbacks etc. The functions introduced here will be used to rework the handling of interrupts during suspend (hibernation) and resume. Namely, interrupts will only be disabled on the CPU right before suspending sysdevs, while device drivers will be prevented from receiving interrupts, with the help of the new helper function, before their "late" suspend callbacks run (and analogously during resume). Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- include/linux/interrupt.h | 9 ++++++ include/linux/irq.h | 1 + kernel/irq/Makefile | 1 + kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 31 ++++++++++++++----- kernel/irq/pm.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 116 insertions(+), 7 deletions(-) create mode 100644 kernel/irq/pm.c (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0c9cb63e6895..c68bffd182bb 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -117,6 +117,15 @@ extern void disable_irq_nosync(unsigned int irq); extern void disable_irq(unsigned int irq); extern void enable_irq(unsigned int irq); +/* The following three functions are for the core kernel use only. 
*/ +extern void suspend_device_irqs(void); +extern void resume_device_irqs(void); +#ifdef CONFIG_PM_SLEEP +extern int check_wakeup_irqs(void); +#else +static inline int check_wakeup_irqs(void) { return 0; } +#endif + #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) extern cpumask_var_t irq_default_affinity; diff --git a/include/linux/irq.h b/include/linux/irq.h index 9c62fbe2ef30..974890b3c52f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -67,6 +67,7 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_SPURIOUS_DISABLED 0x00800000 /* IRQ was disabled by the spurious trap */ #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ +#define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 4dd5b1edac98..3394f8f52964 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ee1aa9f8e8b9..01ce20eab38f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -12,6 +12,8 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); +extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); +extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6458e99984c0..1516ab77355c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -162,6 +162,20 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) } #endif +void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) +{ + if (suspend) { + if (!desc->action || (desc->action->flags & IRQF_TIMER)) + return; + desc->status |= IRQ_SUSPENDED; + } + + if (!desc->depth++) { + desc->status |= IRQ_DISABLED; + desc->chip->disable(irq); + } +} + /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable @@ -182,10 +196,7 @@ void disable_irq_nosync(unsigned int irq) return; spin_lock_irqsave(&desc->lock, flags); - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; - desc->chip->disable(irq); - } + __disable_irq(desc, irq, false); spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(disable_irq_nosync); @@ -215,15 +226,21 @@ void disable_irq(unsigned int irq) } EXPORT_SYMBOL(disable_irq); -static void __enable_irq(struct irq_desc *desc, unsigned int irq) +void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { + if (resume) + desc->status &= ~IRQ_SUSPENDED; + switch (desc->depth) { case 0: + err_out: WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { unsigned int status = desc->status & ~IRQ_DISABLED; + if (desc->status & IRQ_SUSPENDED) + goto err_out; /* Prevent probing on this irq: */ desc->status = status | IRQ_NOPROBE; check_irq_resend(desc, irq); @@ -253,7 +270,7 @@ void enable_irq(unsigned int irq) return; 
spin_lock_irqsave(&desc->lock, flags); - __enable_irq(desc, irq); + __enable_irq(desc, irq, false); spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(enable_irq); @@ -511,7 +528,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { desc->status &= ~IRQ_SPURIOUS_DISABLED; - __enable_irq(desc, irq); + __enable_irq(desc, irq, false); } spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c new file mode 100644 index 000000000000..638d8bedec14 --- /dev/null +++ b/kernel/irq/pm.c @@ -0,0 +1,79 @@ +/* + * linux/kernel/irq/pm.c + * + * Copyright (C) 2009 Rafael J. Wysocki , Novell Inc. + * + * This file contains power management functions related to interrupts. + */ + +#include +#include +#include + +#include "internals.h" + +/** + * suspend_device_irqs - disable all currently enabled interrupt lines + * + * During system-wide suspend or hibernation device interrupts need to be + * disabled at the chip level and this function is provided for this purpose. + * It disables all interrupt lines that are enabled at the moment and sets the + * IRQ_SUSPENDED flag for them. + */ +void suspend_device_irqs(void) +{ + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + __disable_irq(desc, irq, true); + spin_unlock_irqrestore(&desc->lock, flags); + } + + for_each_irq_desc(irq, desc) + if (desc->status & IRQ_SUSPENDED) + synchronize_irq(irq); +} +EXPORT_SYMBOL_GPL(suspend_device_irqs); + +/** + * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() + * + * Enable all interrupt lines previously disabled by suspend_device_irqs() that + * have the IRQ_SUSPENDED flag set. + */ +void resume_device_irqs(void) +{ + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) { + unsigned long flags; + + if (!(desc->status & IRQ_SUSPENDED)) + continue; + + spin_lock_irqsave(&desc->lock, flags); + __enable_irq(desc, irq, true); + spin_unlock_irqrestore(&desc->lock, flags); + } +} +EXPORT_SYMBOL_GPL(resume_device_irqs); + +/** + * check_wakeup_irqs - check if any wake-up interrupts are pending + */ +int check_wakeup_irqs(void) +{ + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) + if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) + return -EBUSY; + + return 0; +} -- cgit v1.2.3-71-gd317 From 0e5dd46b761195356065a30611f265adec286d0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 26 Mar 2009 22:51:40 +0100 Subject: PCI PM: Introduce __pci_[start|complete]_power_transition() (rev. 2) The radeonfb driver needs to program the device's PMCSR directly due to some quirky hardware it has to handle (see http://bugzilla.kernel.org/show_bug.cgi?id=12846 for details) and after doing that it needs to call the platform (usually ACPI) to finish the power transition of the device. Currently it uses pci_set_power_state() for this purpose, however making a specific assumption about the internal behavior of this function, which has changed recently so that this assumption is no longer satisfied. For this reason, introduce __pci_complete_power_transition() that may be called by the radeonfb driver to complete the power transition of the device. For symmetry, introduce __pci_start_power_transition(). Signed-off-by: Rafael J. 
Wysocki Acked-by: Jesse Barnes --- drivers/pci/pci.c | 69 +++++++++++++++++++++++++++++++++++++++-------------- include/linux/pci.h | 1 + 2 files changed, 52 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 979ceb1d37e8..de54fd643baf 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -539,6 +539,53 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state) } } +/** + * pci_platform_power_transition - Use platform to change device power state + * @dev: PCI device to handle. + * @state: State to put the device into. + */ +static int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state) +{ + int error; + + if (platform_pci_power_manageable(dev)) { + error = platform_pci_set_power_state(dev, state); + if (!error) + pci_update_current_state(dev, state); + } else { + error = -ENODEV; + /* Fall back to PCI_D0 if native PM is not supported */ + pci_update_current_state(dev, PCI_D0); + } + + return error; +} + +/** + * __pci_start_power_transition - Start power transition of a PCI device + * @dev: PCI device to handle. + * @state: State to put the device into. + */ +static void __pci_start_power_transition(struct pci_dev *dev, pci_power_t state) +{ + if (state == PCI_D0) + pci_platform_power_transition(dev, PCI_D0); +} + +/** + * __pci_complete_power_transition - Complete power transition of a PCI device + * @dev: PCI device to handle. + * @state: State to put the device into. + * + * This function should not be called directly by device drivers. + */ +int __pci_complete_power_transition(struct pci_dev *dev, pci_power_t state) +{ + return state > PCI_D0 ? + pci_platform_power_transition(dev, state) : -EINVAL; +} +EXPORT_SYMBOL_GPL(__pci_complete_power_transition); + /** * pci_set_power_state - Set the power state of a PCI device * @dev: PCI device to handle. @@ -575,16 +622,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state) if (dev->current_state == state) return 0; - if (state == PCI_D0) { - /* - * Allow the platform to change the state, for example via ACPI - * _PR0, _PS0 and some such, but do not trust it. - */ - int ret = platform_pci_power_manageable(dev) ? 
- platform_pci_set_power_state(dev, PCI_D0) : 0; - if (!ret) - pci_update_current_state(dev, PCI_D0); - } + __pci_start_power_transition(dev, state); + /* This device is quirked not to be put into D3, so don't put it in D3 */ if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3)) @@ -592,14 +631,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state) error = pci_raw_set_power_state(dev, state); - if (state > PCI_D0 && platform_pci_power_manageable(dev)) { - /* Allow the platform to finalize the transition */ - int ret = platform_pci_set_power_state(dev, state); - if (!ret) { - pci_update_current_state(dev, state); - error = 0; - } - } + if (!__pci_complete_power_transition(dev, state)) + error = 0; return error; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 7bd624bfdcfd..df3644132617 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -689,6 +689,7 @@ size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size); /* Power management related routines */ int pci_save_state(struct pci_dev *dev); int pci_restore_state(struct pci_dev *dev); +int __pci_complete_power_transition(struct pci_dev *dev, pci_power_t state); int pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); bool pci_pme_capable(struct pci_dev *dev, pci_power_t state); -- cgit v1.2.3-71-gd317 From 99b76233803beab302123d243eea9e41149804f3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 25 Mar 2009 22:48:06 +0300 Subject: proc 2/2: remove struct proc_dir_entry::owner Setting ->owner as done currently (pde->owner = THIS_MODULE) is racy as correctly noted at bug #12454. Someone can lookup entry with NULL ->owner, thus not pinning enything, and release it later resulting in module refcount underflow. We can keep ->owner and supply it at registration time like ->proc_fops and ->data. But this leaves ->owner as easy-manipulative field (just one C assignment) and somebody will forget to unpin previous/pin current module when switching ->owner. ->proc_fops is declared as "const" which should give some thoughts. ->read_proc/->write_proc were just fixed to not require ->owner for protection. rmmod'ed directories will be empty and return "." and ".." -- no harm. And directories with tricky enough readdir and lookup shouldn't be modular. We definitely don't want such modular code. Removing ->owner will also make PDE smaller. So, let's nuke it. Kudos to Jeff Layton for reminding about this, let's say, oversight. 
http://bugzilla.kernel.org/show_bug.cgi?id=12454 Signed-off-by: Alexey Dobriyan --- Documentation/DocBook/procfs_example.c | 9 --------- arch/alpha/kernel/srm_env.c | 5 ----- arch/blackfin/mm/sram-alloc.c | 1 - arch/ia64/kernel/palinfo.c | 2 -- arch/ia64/sn/kernel/sn2/prominfo_proc.c | 9 ++------- arch/powerpc/kernel/rtas_flash.c | 1 - arch/sparc/kernel/led.c | 1 - arch/x86/kernel/cpu/mtrr/if.c | 10 +--------- drivers/acpi/ac.c | 1 - drivers/acpi/battery.c | 1 - drivers/acpi/button.c | 3 --- drivers/acpi/fan.c | 2 -- drivers/acpi/processor_core.c | 2 -- drivers/acpi/sbs.c | 1 - drivers/acpi/thermal.c | 2 -- drivers/acpi/video.c | 5 ----- drivers/block/ps3vram.c | 2 -- drivers/char/ipmi/ipmi_msghandler.c | 12 ++++------- drivers/char/ipmi/ipmi_si_intf.c | 6 +++--- drivers/input/input.c | 2 -- drivers/isdn/hardware/eicon/divasi.c | 1 - drivers/media/video/cpia.c | 4 +--- drivers/message/i2o/i2o_proc.c | 2 -- drivers/net/bonding/bond_main.c | 35 ++------------------------------- drivers/net/irda/vlsi_ir.c | 7 ------- drivers/net/wireless/airo.c | 1 - drivers/platform/x86/asus_acpi.c | 3 --- drivers/platform/x86/thinkpad_acpi.c | 2 -- drivers/platform/x86/toshiba_acpi.c | 3 --- drivers/rtc/rtc-proc.c | 10 ++-------- drivers/s390/block/dasd_proc.c | 2 -- drivers/scsi/scsi_devinfo.c | 2 -- drivers/scsi/scsi_proc.c | 3 --- drivers/video/via/viafbdev.c | 5 ----- fs/afs/proc.c | 1 - fs/cifs/cifs_debug.c | 1 - fs/jfs/jfs_debug.c | 1 - fs/nfs/client.c | 2 -- fs/proc/inode.c | 19 +++--------------- fs/proc/proc_tty.c | 1 - fs/reiserfs/procfs.c | 5 +---- include/linux/ipmi_smi.h | 2 +- include/linux/proc_fs.h | 4 ---- net/appletalk/atalk_proc.c | 1 - net/atm/mpoa_proc.c | 1 - net/atm/proc.c | 1 - net/can/bcm.c | 4 ---- net/can/proc.c | 2 -- net/core/pktgen.c | 1 - net/irda/irproc.c | 1 - net/llc/llc_proc.c | 1 - net/sctp/protocol.c | 8 ++------ net/sunrpc/cache.c | 4 ---- net/sunrpc/stats.c | 10 ++-------- sound/core/info.c | 31 ++--------------------------- 55 files changed, 26 insertions(+), 232 deletions(-) (limited to 'include/linux') diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c index 8c6396e4bf31..a5b11793b1e0 100644 --- a/Documentation/DocBook/procfs_example.c +++ b/Documentation/DocBook/procfs_example.c @@ -117,9 +117,6 @@ static int __init init_procfs_example(void) rv = -ENOMEM; goto out; } - - example_dir->owner = THIS_MODULE; - /* create jiffies using convenience function */ jiffies_file = create_proc_read_entry("jiffies", 0444, example_dir, @@ -130,8 +127,6 @@ static int __init init_procfs_example(void) goto no_jiffies; } - jiffies_file->owner = THIS_MODULE; - /* create foo and bar files using same callback * functions */ @@ -146,7 +141,6 @@ static int __init init_procfs_example(void) foo_file->data = &foo_data; foo_file->read_proc = proc_read_foobar; foo_file->write_proc = proc_write_foobar; - foo_file->owner = THIS_MODULE; bar_file = create_proc_entry("bar", 0644, example_dir); if(bar_file == NULL) { @@ -159,7 +153,6 @@ static int __init init_procfs_example(void) bar_file->data = &bar_data; bar_file->read_proc = proc_read_foobar; bar_file->write_proc = proc_write_foobar; - bar_file->owner = THIS_MODULE; /* create symlink */ symlink = proc_symlink("jiffies_too", example_dir, @@ -169,8 +162,6 @@ static int __init init_procfs_example(void) goto no_symlink; } - symlink->owner = THIS_MODULE; - /* everything OK */ printk(KERN_INFO "%s %s initialised\n", MODULE_NAME, MODULE_VERS); diff --git a/arch/alpha/kernel/srm_env.c 
b/arch/alpha/kernel/srm_env.c index 78ad7cd1bbd6..d12af472e1c0 100644 --- a/arch/alpha/kernel/srm_env.c +++ b/arch/alpha/kernel/srm_env.c @@ -218,7 +218,6 @@ srm_env_init(void) BASE_DIR); goto cleanup; } - base_dir->owner = THIS_MODULE; /* * Create per-name subdirectory @@ -229,7 +228,6 @@ srm_env_init(void) BASE_DIR, NAMED_DIR); goto cleanup; } - named_dir->owner = THIS_MODULE; /* * Create per-number subdirectory @@ -241,7 +239,6 @@ srm_env_init(void) goto cleanup; } - numbered_dir->owner = THIS_MODULE; /* * Create all named nodes @@ -254,7 +251,6 @@ srm_env_init(void) goto cleanup; entry->proc_entry->data = (void *) entry; - entry->proc_entry->owner = THIS_MODULE; entry->proc_entry->read_proc = srm_env_read; entry->proc_entry->write_proc = srm_env_write; @@ -275,7 +271,6 @@ srm_env_init(void) entry->id = var_num; entry->proc_entry->data = (void *) entry; - entry->proc_entry->owner = THIS_MODULE; entry->proc_entry->read_proc = srm_env_read; entry->proc_entry->write_proc = srm_env_write; } diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c index 834cab7438a8..530d1393a232 100644 --- a/arch/blackfin/mm/sram-alloc.c +++ b/arch/blackfin/mm/sram-alloc.c @@ -854,7 +854,6 @@ static int __init sram_proc_init(void) printk(KERN_WARNING "unable to create /proc/sram\n"); return -1; } - ptr->owner = THIS_MODULE; ptr->read_proc = sram_proc_read; return 0; } diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index e5c57f413ca2..a4f19c70aadd 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -1002,8 +1002,6 @@ create_palinfo_proc_entries(unsigned int cpu) *pdir = create_proc_read_entry( palinfo_entries[j].name, 0, cpu_dir, palinfo_read_entry, (void *)f.value); - if (*pdir) - (*pdir)->owner = THIS_MODULE; pdir++; } } diff --git a/arch/ia64/sn/kernel/sn2/prominfo_proc.c b/arch/ia64/sn/kernel/sn2/prominfo_proc.c index 4dcce3d0e04c..e63328818643 100644 --- a/arch/ia64/sn/kernel/sn2/prominfo_proc.c +++ b/arch/ia64/sn/kernel/sn2/prominfo_proc.c @@ -225,7 +225,6 @@ static struct proc_dir_entry *sgi_prominfo_entry; int __init prominfo_init(void) { struct proc_dir_entry **entp; - struct proc_dir_entry *p; cnodeid_t cnodeid; unsigned long nasid; int size; @@ -246,14 +245,10 @@ int __init prominfo_init(void) sprintf(name, "node%d", cnodeid); *entp = proc_mkdir(name, sgi_prominfo_entry); nasid = cnodeid_to_nasid(cnodeid); - p = create_proc_read_entry("fit", 0, *entp, read_fit_entry, + create_proc_read_entry("fit", 0, *entp, read_fit_entry, (void *)nasid); - if (p) - p->owner = THIS_MODULE; - p = create_proc_read_entry("version", 0, *entp, + create_proc_read_entry("version", 0, *entp, read_version_entry, (void *)nasid); - if (p) - p->owner = THIS_MODULE; entp++; } diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index 149cb112cd1a..13011a96a977 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -669,7 +669,6 @@ static void remove_flash_pde(struct proc_dir_entry *dp) { if (dp) { kfree(dp->data); - dp->owner = NULL; remove_proc_entry(dp->name, dp->parent); } } diff --git a/arch/sparc/kernel/led.c b/arch/sparc/kernel/led.c index adaaed4ea2fb..00d034ea2164 100644 --- a/arch/sparc/kernel/led.c +++ b/arch/sparc/kernel/led.c @@ -126,7 +126,6 @@ static int __init led_init(void) led = proc_create("led", 0, NULL, &led_proc_fops); if (!led) return -ENOMEM; - led->owner = THIS_MODULE; printk(KERN_INFO "led: version %s, Lars Kotthoff \n", diff --git a/arch/x86/kernel/cpu/mtrr/if.c 
b/arch/x86/kernel/cpu/mtrr/if.c index 4c4214690dd1..fb73a52913a4 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -377,10 +377,6 @@ static const struct file_operations mtrr_fops = { .release = mtrr_close, }; - -static struct proc_dir_entry *proc_root_mtrr; - - static int mtrr_seq_show(struct seq_file *seq, void *offset) { char factor; @@ -423,11 +419,7 @@ static int __init mtrr_if_init(void) (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) return -ENODEV; - proc_root_mtrr = - proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); - - if (proc_root_mtrr) - proc_root_mtrr->owner = THIS_MODULE; + proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); return 0; } diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 9b917dac7732..88e42abf5d88 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -191,7 +191,6 @@ static int acpi_ac_add_fs(struct acpi_device *device) acpi_ac_dir); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; } /* 'state' [R] */ diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 69cbc57c2d1c..3bcb5bfc45d3 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -760,7 +760,6 @@ static int acpi_battery_add_fs(struct acpi_device *device) acpi_battery_dir); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; } for (i = 0; i < ACPI_BATTERY_NUMFILES; ++i) { diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 171fd914f435..c2f06069dcd4 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -200,12 +200,10 @@ static int acpi_button_add_fs(struct acpi_device *device) if (!entry) return -ENODEV; - entry->owner = THIS_MODULE; acpi_device_dir(device) = proc_mkdir(acpi_device_bid(device), entry); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; /* 'info' [R] */ entry = proc_create_data(ACPI_BUTTON_FILE_INFO, @@ -522,7 +520,6 @@ static int __init acpi_button_init(void) acpi_button_dir = proc_mkdir(ACPI_BUTTON_CLASS, acpi_root_dir); if (!acpi_button_dir) return -ENODEV; - acpi_button_dir->owner = THIS_MODULE; result = acpi_bus_register_driver(&acpi_button_driver); if (result < 0) { remove_proc_entry(ACPI_BUTTON_CLASS, acpi_root_dir); diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index eaaee1660bdf..8a02944bf92d 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -193,7 +193,6 @@ static int acpi_fan_add_fs(struct acpi_device *device) acpi_fan_dir); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; } /* 'status' [R/W] */ @@ -347,7 +346,6 @@ static int __init acpi_fan_init(void) acpi_fan_dir = proc_mkdir(ACPI_FAN_CLASS, acpi_root_dir); if (!acpi_fan_dir) return -ENODEV; - acpi_fan_dir->owner = THIS_MODULE; #endif result = acpi_bus_register_driver(&acpi_fan_driver); diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 0cc2fd31e376..fa2f7422d23d 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -359,7 +359,6 @@ static int acpi_processor_add_fs(struct acpi_device *device) if (!acpi_device_dir(device)) return -ENODEV; } - acpi_device_dir(device)->owner = THIS_MODULE; /* 'info' [R] */ entry = proc_create_data(ACPI_PROCESSOR_FILE_INFO, @@ -1137,7 +1136,6 @@ static int __init acpi_processor_init(void) acpi_processor_dir = proc_mkdir(ACPI_PROCESSOR_CLASS, acpi_root_dir); if (!acpi_processor_dir) return -ENOMEM; - acpi_processor_dir->owner = THIS_MODULE; /* * Check 
whether the system is DMI table. If yes, OSPM diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 6050ce481873..59afd52ccc12 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -488,7 +488,6 @@ acpi_sbs_add_fs(struct proc_dir_entry **dir, if (!*dir) { return -ENODEV; } - (*dir)->owner = THIS_MODULE; } /* 'info' [R] */ diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 99e6f1f8ea45..c11f9aeca706 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -1506,7 +1506,6 @@ static int acpi_thermal_add_fs(struct acpi_device *device) acpi_thermal_dir); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; } /* 'state' [R] */ @@ -1875,7 +1874,6 @@ static int __init acpi_thermal_init(void) acpi_thermal_dir = proc_mkdir(ACPI_THERMAL_CLASS, acpi_root_dir); if (!acpi_thermal_dir) return -ENODEV; - acpi_thermal_dir->owner = THIS_MODULE; result = acpi_bus_register_driver(&acpi_thermal_driver); if (result < 0) { diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index bb5ed059114a..67cc36dc9b82 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1125,8 +1125,6 @@ static int acpi_video_device_add_fs(struct acpi_device *device) if (!device_dir) return -ENOMEM; - device_dir->owner = THIS_MODULE; - /* 'info' [R] */ entry = proc_create_data("info", S_IRUGO, device_dir, &acpi_video_device_info_fops, acpi_driver_data(device)); @@ -1403,8 +1401,6 @@ static int acpi_video_bus_add_fs(struct acpi_device *device) if (!device_dir) return -ENOMEM; - device_dir->owner = THIS_MODULE; - /* 'info' [R] */ entry = proc_create_data("info", S_IRUGO, device_dir, &acpi_video_bus_info_fops, @@ -2131,7 +2127,6 @@ static int __init acpi_video_init(void) acpi_video_dir = proc_mkdir(ACPI_VIDEO_CLASS, acpi_root_dir); if (!acpi_video_dir) return -ENODEV; - acpi_video_dir->owner = THIS_MODULE; result = acpi_bus_register_driver(&acpi_video_bus); if (result < 0) { diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 393ed6760d78..8eddef373a91 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -551,8 +551,6 @@ static void __devinit ps3vram_proc_init(struct ps3_system_bus_device *dev) dev_warn(&dev->core, "failed to create /proc entry\n"); return; } - - pde->owner = THIS_MODULE; pde->data = priv; } diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 7a88dfd4427b..e93fc8d22fb2 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -1944,7 +1944,7 @@ static int stat_file_read_proc(char *page, char **start, off_t off, int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, read_proc_t *read_proc, - void *data, struct module *owner) + void *data) { int rv = 0; #ifdef CONFIG_PROC_FS @@ -1970,7 +1970,6 @@ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, } else { file->data = data; file->read_proc = read_proc; - file->owner = owner; mutex_lock(&smi->proc_entry_lock); /* Stick it on the list. 
*/ @@ -1993,23 +1992,21 @@ static int add_proc_entries(ipmi_smi_t smi, int num) smi->proc_dir = proc_mkdir(smi->proc_dir_name, proc_ipmi_root); if (!smi->proc_dir) rv = -ENOMEM; - else - smi->proc_dir->owner = THIS_MODULE; if (rv == 0) rv = ipmi_smi_add_proc_entry(smi, "stats", stat_file_read_proc, - smi, THIS_MODULE); + smi); if (rv == 0) rv = ipmi_smi_add_proc_entry(smi, "ipmb", ipmb_file_read_proc, - smi, THIS_MODULE); + smi); if (rv == 0) rv = ipmi_smi_add_proc_entry(smi, "version", version_file_read_proc, - smi, THIS_MODULE); + smi); #endif /* CONFIG_PROC_FS */ return rv; @@ -4265,7 +4262,6 @@ static int ipmi_init_msghandler(void) return -ENOMEM; } - proc_ipmi_root->owner = THIS_MODULE; #endif /* CONFIG_PROC_FS */ setup_timer(&ipmi_timer, ipmi_timeout, 0); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 3000135f2ead..e58ea4cd55ce 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -2899,7 +2899,7 @@ static int try_smi_init(struct smi_info *new_smi) rv = ipmi_smi_add_proc_entry(new_smi->intf, "type", type_file_read_proc, - new_smi, THIS_MODULE); + new_smi); if (rv) { printk(KERN_ERR "ipmi_si: Unable to create proc entry: %d\n", @@ -2909,7 +2909,7 @@ static int try_smi_init(struct smi_info *new_smi) rv = ipmi_smi_add_proc_entry(new_smi->intf, "si_stats", stat_file_read_proc, - new_smi, THIS_MODULE); + new_smi); if (rv) { printk(KERN_ERR "ipmi_si: Unable to create proc entry: %d\n", @@ -2919,7 +2919,7 @@ static int try_smi_init(struct smi_info *new_smi) rv = ipmi_smi_add_proc_entry(new_smi->intf, "params", param_read_proc, - new_smi, THIS_MODULE); + new_smi); if (rv) { printk(KERN_ERR "ipmi_si: Unable to create proc entry: %d\n", diff --git a/drivers/input/input.c b/drivers/input/input.c index 1730d7331a5d..ec3db3ade118 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -903,8 +903,6 @@ static int __init input_proc_init(void) if (!proc_bus_input_dir) return -ENOMEM; - proc_bus_input_dir->owner = THIS_MODULE; - entry = proc_create("devices", 0, proc_bus_input_dir, &input_devices_fileops); if (!entry) diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c index f4969fe0a055..69e71ebe7841 100644 --- a/drivers/isdn/hardware/eicon/divasi.c +++ b/drivers/isdn/hardware/eicon/divasi.c @@ -118,7 +118,6 @@ static int DIVA_INIT_FUNCTION create_um_idi_proc(void) return (0); um_idi_proc_entry->read_proc = um_idi_proc_read; - um_idi_proc_entry->owner = THIS_MODULE; return (1); } diff --git a/drivers/media/video/cpia.c b/drivers/media/video/cpia.c index c3b0c8c63c76..43ab0adf3b61 100644 --- a/drivers/media/video/cpia.c +++ b/drivers/media/video/cpia.c @@ -1381,9 +1381,7 @@ static void proc_cpia_create(void) { cpia_proc_root = proc_mkdir("cpia", NULL); - if (cpia_proc_root) - cpia_proc_root->owner = THIS_MODULE; - else + if (!cpia_proc_root) LOG("Unable to initialise /proc/cpia\n"); } diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c index 9a36b5a7de57..7045c45da9b1 100644 --- a/drivers/message/i2o/i2o_proc.c +++ b/drivers/message/i2o/i2o_proc.c @@ -2037,8 +2037,6 @@ static int __init i2o_proc_fs_create(void) if (!i2o_proc_dir_root) return -1; - i2o_proc_dir_root->owner = THIS_MODULE; - list_for_each_entry(c, &i2o_controllers, list) i2o_proc_iop_add(i2o_proc_dir_root, c); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9c326a50a3ee..99610f358c40 100644 --- a/drivers/net/bonding/bond_main.c +++ 
b/drivers/net/bonding/bond_main.c @@ -3444,25 +3444,12 @@ static void bond_remove_proc_entry(struct bonding *bond) */ static void bond_create_proc_dir(void) { - int len = strlen(DRV_NAME); - - for (bond_proc_dir = init_net.proc_net->subdir; bond_proc_dir; - bond_proc_dir = bond_proc_dir->next) { - if ((bond_proc_dir->namelen == len) && - !memcmp(bond_proc_dir->name, DRV_NAME, len)) { - break; - } - } - if (!bond_proc_dir) { bond_proc_dir = proc_mkdir(DRV_NAME, init_net.proc_net); - if (bond_proc_dir) { - bond_proc_dir->owner = THIS_MODULE; - } else { + if (!bond_proc_dir) printk(KERN_WARNING DRV_NAME ": Warning: cannot create /proc/net/%s\n", DRV_NAME); - } } } @@ -3471,25 +3458,7 @@ static void bond_create_proc_dir(void) */ static void bond_destroy_proc_dir(void) { - struct proc_dir_entry *de; - - if (!bond_proc_dir) { - return; - } - - /* verify that the /proc dir is empty */ - for (de = bond_proc_dir->subdir; de; de = de->next) { - /* ignore . and .. */ - if (*(de->name) != '.') { - break; - } - } - - if (de) { - if (bond_proc_dir->owner == THIS_MODULE) { - bond_proc_dir->owner = NULL; - } - } else { + if (bond_proc_dir) { remove_proc_entry(DRV_NAME, init_net.proc_net); bond_proc_dir = NULL; } diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c index 1243bc8e0035..ac0e4b6b6b66 100644 --- a/drivers/net/irda/vlsi_ir.c +++ b/drivers/net/irda/vlsi_ir.c @@ -1871,13 +1871,6 @@ static int __init vlsi_mod_init(void) * without procfs - it's not required for the driver to work. */ vlsi_proc_root = proc_mkdir(PROC_DIR, NULL); - if (vlsi_proc_root) { - /* protect registered procdir against module removal. - * Because we are in the module init path there's no race - * window after create_proc_entry (and no barrier needed). - */ - vlsi_proc_root->owner = THIS_MODULE; - } ret = pci_register_driver(&vlsi_irda_driver); diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index 7e80aba8a148..31b1cc2b778a 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -4494,7 +4494,6 @@ static int setup_proc_entry( struct net_device *dev, goto fail; apriv->proc_entry->uid = proc_uid; apriv->proc_entry->gid = proc_gid; - apriv->proc_entry->owner = THIS_MODULE; /* Setup the StatsDelta */ entry = proc_create_data("StatsDelta", diff --git a/drivers/platform/x86/asus_acpi.c b/drivers/platform/x86/asus_acpi.c index d63f26e666a4..ba1f7497e4b9 100644 --- a/drivers/platform/x86/asus_acpi.c +++ b/drivers/platform/x86/asus_acpi.c @@ -987,7 +987,6 @@ asus_proc_add(char *name, proc_writefunc *writefunc, proc->write_proc = writefunc; proc->read_proc = readfunc; proc->data = acpi_driver_data(device); - proc->owner = THIS_MODULE; proc->uid = asus_uid; proc->gid = asus_gid; return 0; @@ -1020,7 +1019,6 @@ static int asus_hotk_add_fs(struct acpi_device *device) if (proc) { proc->read_proc = proc_read_info; proc->data = acpi_driver_data(device); - proc->owner = THIS_MODULE; proc->uid = asus_uid; proc->gid = asus_gid; } else { @@ -1436,7 +1434,6 @@ static int __init asus_acpi_init(void) printk(KERN_ERR "Asus ACPI: Unable to create /proc entry\n"); return -ENODEV; } - asus_proc_dir->owner = THIS_MODULE; result = acpi_bus_register_driver(&asus_hotk_driver); if (result < 0) { diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index d2433204a40c..3dad27a385d3 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -6992,7 +6992,6 @@ static int __init ibm_init(struct ibm_init_struct *iibm) ret = -ENODEV; 
goto err_out; } - entry->owner = THIS_MODULE; entry->data = ibm; entry->read_proc = &dispatch_procfs_read; if (ibm->write) @@ -7405,7 +7404,6 @@ static int __init thinkpad_acpi_module_init(void) thinkpad_acpi_module_exit(); return -ENODEV; } - proc_dir->owner = THIS_MODULE; ret = platform_driver_register(&tpacpi_pdriver); if (ret) { diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 40e60fc2e596..9f187265db8e 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -679,8 +679,6 @@ static acpi_status __init add_device(void) toshiba_proc_dir, (read_proc_t *) dispatch_read, item); - if (proc) - proc->owner = THIS_MODULE; if (proc && item->write_func) proc->write_proc = (write_proc_t *) dispatch_write; } @@ -772,7 +770,6 @@ static int __init toshiba_acpi_init(void) toshiba_acpi_exit(); return -ENODEV; } else { - toshiba_proc_dir->owner = THIS_MODULE; status = add_device(); if (ACPI_FAILURE(status)) { toshiba_acpi_exit(); diff --git a/drivers/rtc/rtc-proc.c b/drivers/rtc/rtc-proc.c index 0c6257a034ff..c086fc30a84c 100644 --- a/drivers/rtc/rtc-proc.c +++ b/drivers/rtc/rtc-proc.c @@ -105,14 +105,8 @@ static const struct file_operations rtc_proc_fops = { void rtc_proc_add_device(struct rtc_device *rtc) { - if (rtc->id == 0) { - struct proc_dir_entry *ent; - - ent = proc_create_data("driver/rtc", 0, NULL, - &rtc_proc_fops, rtc); - if (ent) - ent->owner = rtc->owner; - } + if (rtc->id == 0) + proc_create_data("driver/rtc", 0, NULL, &rtc_proc_fops, rtc); } void rtc_proc_del_device(struct rtc_device *rtc) diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c index 2080ba6a69b0..654daa3cdfda 100644 --- a/drivers/s390/block/dasd_proc.c +++ b/drivers/s390/block/dasd_proc.c @@ -320,7 +320,6 @@ dasd_proc_init(void) dasd_proc_root_entry = proc_mkdir("dasd", NULL); if (!dasd_proc_root_entry) goto out_nodasd; - dasd_proc_root_entry->owner = THIS_MODULE; dasd_devices_entry = proc_create("devices", S_IFREG | S_IRUGO | S_IWUSR, dasd_proc_root_entry, @@ -334,7 +333,6 @@ dasd_proc_init(void) goto out_nostatistics; dasd_statistics_entry->read_proc = dasd_statistics_read; dasd_statistics_entry->write_proc = dasd_statistics_write; - dasd_statistics_entry->owner = THIS_MODULE; return 0; out_nostatistics: diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 099b5455bbce..b13481369642 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -596,8 +596,6 @@ int __init scsi_init_devinfo(void) error = -ENOMEM; goto out; } - - p->owner = THIS_MODULE; #endif /* CONFIG_SCSI_PROC_FS */ out: diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c index 82f7b2dd08a2..77fbddb507fd 100644 --- a/drivers/scsi/scsi_proc.c +++ b/drivers/scsi/scsi_proc.c @@ -115,8 +115,6 @@ void scsi_proc_hostdir_add(struct scsi_host_template *sht) if (!sht->proc_dir) printk(KERN_ERR "%s: proc_mkdir failed for %s\n", __func__, sht->proc_name); - else - sht->proc_dir->owner = sht->module; } mutex_unlock(&global_host_template_mutex); } @@ -163,7 +161,6 @@ void scsi_proc_host_add(struct Scsi_Host *shost) } p->write_proc = proc_scsi_write_proc; - p->owner = sht->module; } /** diff --git a/drivers/video/via/viafbdev.c b/drivers/video/via/viafbdev.c index 37b433a08ce8..e327b84820d2 100644 --- a/drivers/video/via/viafbdev.c +++ b/drivers/video/via/viafbdev.c @@ -2059,25 +2059,21 @@ static void viafb_init_proc(struct proc_dir_entry **viafb_entry) if (viafb_entry) { entry = create_proc_entry("dvp0", 0, 
*viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_dvp0_proc_read; entry->write_proc = viafb_dvp0_proc_write; } entry = create_proc_entry("dvp1", 0, *viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_dvp1_proc_read; entry->write_proc = viafb_dvp1_proc_write; } entry = create_proc_entry("dfph", 0, *viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_dfph_proc_read; entry->write_proc = viafb_dfph_proc_write; } entry = create_proc_entry("dfpl", 0, *viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_dfpl_proc_read; entry->write_proc = viafb_dfpl_proc_write; } @@ -2086,7 +2082,6 @@ static void viafb_init_proc(struct proc_dir_entry **viafb_entry) viaparinfo->chip_info->lvds_chip_info2.lvds_chip_name) { entry = create_proc_entry("vt1636", 0, *viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_vt1636_proc_read; entry->write_proc = viafb_vt1636_proc_write; } diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 7578c1ab9e0b..8630615e57fe 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -146,7 +146,6 @@ int afs_proc_init(void) proc_afs = proc_mkdir("fs/afs", NULL); if (!proc_afs) goto error_dir; - proc_afs->owner = THIS_MODULE; p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); if (!p) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 877e4d9a1159..7f19fefd3d45 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -404,7 +404,6 @@ cifs_proc_init(void) if (proc_fs_cifs == NULL) return; - proc_fs_cifs->owner = THIS_MODULE; proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); #ifdef CONFIG_CIFS_STATS diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 6a73de84bcef..dd824d9b0b1a 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -90,7 +90,6 @@ void jfs_proc_init(void) if (!(base = proc_mkdir("fs/jfs", NULL))) return; - base->owner = THIS_MODULE; for (i = 0; i < NPROCENT; i++) proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 574158ae2398..2277421656e7 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1606,8 +1606,6 @@ int __init nfs_fs_proc_init(void) if (!proc_fs_nfs) goto error_0; - proc_fs_nfs->owner = THIS_MODULE; - /* a file of servers with which we're dealing */ p = proc_create("servers", S_IFREG|S_IRUGO, proc_fs_nfs, &nfs_server_list_fops); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e11dc22c6511..d78ade305541 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,11 +58,8 @@ static void proc_delete_inode(struct inode *inode) /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; - if (de) { - if (de->owner) - module_put(de->owner); + if (de) de_put(de); - } if (PROC_I(inode)->sysctl) sysctl_head_put(PROC_I(inode)->sysctl); clear_inode(inode); @@ -449,12 +446,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, { struct inode * inode; - if (!try_module_get(de->owner)) - goto out_mod; - inode = iget_locked(sb, ino); if (!inode) - goto out_ino; + return NULL; if (inode->i_state & I_NEW) { inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->fd = 0; @@ -485,16 +479,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, } } unlock_new_inode(inode); - } else { - module_put(de->owner); + } else de_put(de); - } return inode; - -out_ino: - module_put(de->owner); -out_mod: - return NULL; } int proc_fill_super(struct 
super_block *s) diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index d153946d6d15..4a9e0f65ae60 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -152,7 +152,6 @@ void proc_tty_register_driver(struct tty_driver *driver) if (!ent) return; ent->read_proc = driver->ops->read_proc; - ent->owner = driver->owner; ent->data = driver; driver->proc_entry = ent; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index d5066400638a..9229e5514a4e 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -492,7 +492,6 @@ int reiserfs_proc_info_init(struct super_block *sb) spin_lock_init(&__PINFO(sb).lock); REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); if (REISERFS_SB(sb)->procdir) { - REISERFS_SB(sb)->procdir->owner = THIS_MODULE; REISERFS_SB(sb)->procdir->data = sb; add_file(sb, "version", show_version); add_file(sb, "super", show_super); @@ -556,9 +555,7 @@ int reiserfs_proc_info_global_init(void) { if (proc_info_root == NULL) { proc_info_root = proc_mkdir(proc_info_root_name, NULL); - if (proc_info_root) { - proc_info_root->owner = THIS_MODULE; - } else { + if (!proc_info_root) { reiserfs_warning(NULL, "cannot create /proc/%s", proc_info_root_name); return 1; diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 62b73668b602..f7c9c75a2775 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -230,6 +230,6 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg) automatically be dstroyed when the interface is destroyed. */ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, read_proc_t *read_proc, - void *data, struct module *owner); + void *data); #endif /* __LINUX_IPMI_SMI_H */ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index b8bdb96eff78..fbfa3d44d33d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -41,9 +41,6 @@ enum { * while parent/subdir create the directory structure (every * /proc file has a parent, but "subdir" is NULL for all * non-directory entries). - * - * "owner" is used to protect module - * from unloading while proc_dir_entry is in use */ typedef int (read_proc_t)(char *page, char **start, off_t off, @@ -70,7 +67,6 @@ struct proc_dir_entry { * somewhere. 
*/ const struct file_operations *proc_fops; - struct module *owner; struct proc_dir_entry *next, *parent, *subdir; void *data; read_proc_t *read_proc; diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index 162199a2d74f..fd8e0847b254 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -281,7 +281,6 @@ int __init atalk_proc_init(void) atalk_proc_dir = proc_mkdir("atalk", init_net.proc_net); if (!atalk_proc_dir) goto out; - atalk_proc_dir->owner = THIS_MODULE; p = proc_create("interface", S_IRUGO, atalk_proc_dir, &atalk_seq_interface_fops); diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index 4990541ef5da..1a0f5ccea9c4 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -281,7 +281,6 @@ int mpc_proc_init(void) printk(KERN_ERR "Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME); return -ENOMEM; } - p->owner = THIS_MODULE; return 0; } diff --git a/net/atm/proc.c b/net/atm/proc.c index 49487b313f22..e7b3b273907d 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -476,7 +476,6 @@ int __init atm_proc_init(void) atm_proc_root, e->proc_fops); if (!dirent) goto err_out_remove; - dirent->owner = THIS_MODULE; e->dirent = dirent; } ret = 0; diff --git a/net/can/bcm.c b/net/can/bcm.c index b7c7d4651136..95d7f32643ae 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1604,10 +1604,6 @@ static int __init bcm_module_init(void) /* create /proc/net/can-bcm directory */ proc_dir = proc_mkdir("can-bcm", init_net.proc_net); - - if (proc_dir) - proc_dir->owner = THIS_MODULE; - return 0; } diff --git a/net/can/proc.c b/net/can/proc.c index 520fef5e5398..1463653dbe34 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -473,8 +473,6 @@ void can_init_proc(void) return; } - can_dir->owner = THIS_MODULE; - /* own procfs entries from the AF_CAN core */ pde_version = can_create_proc_readentry(CAN_PROC_VERSION, 0644, can_proc_read_version, NULL); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 32d419f5ac98..3779c1438c11 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3806,7 +3806,6 @@ static int __init pg_init(void) pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); if (!pg_proc_dir) return -ENODEV; - pg_proc_dir->owner = THIS_MODULE; pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); if (pe == NULL) { diff --git a/net/irda/irproc.c b/net/irda/irproc.c index 88e80a312732..8ff1861649e8 100644 --- a/net/irda/irproc.c +++ b/net/irda/irproc.c @@ -70,7 +70,6 @@ void __init irda_proc_register(void) proc_irda = proc_mkdir("irda", init_net.proc_net); if (proc_irda == NULL) return; - proc_irda->owner = THIS_MODULE; for (i = 0; i < ARRAY_SIZE(irda_dirs); i++) d = proc_create(irda_dirs[i].name, 0, proc_irda, diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index b58bd7c6cdf8..d208b3396d94 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -236,7 +236,6 @@ int __init llc_proc_init(void) llc_proc_dir = proc_mkdir("llc", init_net.proc_net); if (!llc_proc_dir) goto out; - llc_proc_dir->owner = THIS_MODULE; p = proc_create("socket", S_IRUGO, llc_proc_dir, &llc_seq_socket_fops); if (!p) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index cb198af8887c..8eb3e61cb701 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -106,12 +106,8 @@ static __init int sctp_proc_init(void) goto out_nomem; #ifdef CONFIG_PROC_FS if (!proc_net_sctp) { - struct proc_dir_entry *ent; - ent = proc_mkdir("sctp", init_net.proc_net); - if (ent) { - ent->owner = THIS_MODULE; - proc_net_sctp = ent; - } else + proc_net_sctp = 
proc_mkdir("sctp", init_net.proc_net); + if (!proc_net_sctp) goto out_free_percpu; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 4735caad26ed..20029a79a5de 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -313,7 +313,6 @@ static int create_cache_proc_entries(struct cache_detail *cd) cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); if (cd->proc_ent == NULL) goto out_nomem; - cd->proc_ent->owner = cd->owner; cd->channel_ent = cd->content_ent = NULL; p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR, @@ -321,7 +320,6 @@ static int create_cache_proc_entries(struct cache_detail *cd) cd->flush_ent = p; if (p == NULL) goto out_nomem; - p->owner = cd->owner; if (cd->cache_request || cd->cache_parse) { p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, @@ -329,7 +327,6 @@ static int create_cache_proc_entries(struct cache_detail *cd) cd->channel_ent = p; if (p == NULL) goto out_nomem; - p->owner = cd->owner; } if (cd->cache_show) { p = proc_create_data("content", S_IFREG|S_IRUSR|S_IWUSR, @@ -337,7 +334,6 @@ static int create_cache_proc_entries(struct cache_detail *cd) cd->content_ent = p; if (p == NULL) goto out_nomem; - p->owner = cd->owner; } return 0; out_nomem: diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 085372ef4feb..1ef6e46d9da2 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -262,14 +262,8 @@ void rpc_proc_init(void) { dprintk("RPC: registering /proc/net/rpc\n"); - if (!proc_net_rpc) { - struct proc_dir_entry *ent; - ent = proc_mkdir("rpc", init_net.proc_net); - if (ent) { - ent->owner = THIS_MODULE; - proc_net_rpc = ent; - } - } + if (!proc_net_rpc) + proc_net_rpc = proc_mkdir("rpc", init_net.proc_net); } void diff --git a/sound/core/info.c b/sound/core/info.c index 70fa87189f36..35df614f6c55 100644 --- a/sound/core/info.c +++ b/sound/core/info.c @@ -154,11 +154,6 @@ EXPORT_SYMBOL(snd_seq_root); struct snd_info_entry *snd_oss_root; #endif -static inline void snd_info_entry_prepare(struct proc_dir_entry *de) -{ - de->owner = THIS_MODULE; -} - static void snd_remove_proc_entry(struct proc_dir_entry *parent, struct proc_dir_entry *de) { @@ -522,32 +517,11 @@ static const struct file_operations snd_info_entry_operations = .release = snd_info_entry_release, }; -/** - * snd_create_proc_entry - create a procfs entry - * @name: the name of the proc file - * @mode: the file permission bits, S_Ixxx - * @parent: the parent proc-directory entry - * - * Creates a new proc file entry with the given name and permission - * on the given directory. - * - * Returns the pointer of new instance or NULL on failure. - */ -static struct proc_dir_entry *snd_create_proc_entry(const char *name, mode_t mode, - struct proc_dir_entry *parent) -{ - struct proc_dir_entry *p; - p = create_proc_entry(name, mode, parent); - if (p) - snd_info_entry_prepare(p); - return p; -} - int __init snd_info_init(void) { struct proc_dir_entry *p; - p = snd_create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, NULL); + p = create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, NULL); if (p == NULL) return -ENOMEM; snd_proc_root = p; @@ -974,12 +948,11 @@ int snd_info_register(struct snd_info_entry * entry) return -ENXIO; root = entry->parent == NULL ? 
snd_proc_root : entry->parent->p; mutex_lock(&info_mutex); - p = snd_create_proc_entry(entry->name, entry->mode, root); + p = create_proc_entry(entry->name, entry->mode, root); if (!p) { mutex_unlock(&info_mutex); return -ENOMEM; } - p->owner = entry->module; if (!S_ISDIR(entry->mode)) p->proc_fops = &snd_info_entry_operations; p->size = entry->size; -- cgit v1.2.3-71-gd317 From 77e465867080c2d1e0c410e96dcdcd51e8584a6f Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Tue, 31 Mar 2009 00:32:40 +0400 Subject: reiserfs: fix build breakage Fix this build error when REISERFS_FS_POSIX_ACL is not set: fs/reiserfs/inode.c: In function 'reiserfs_new_inode': fs/reiserfs/inode.c:1919: warning: passing argument 1 of 'reiserfs_inherit_default_acl' from incompatible pointer type fs/reiserfs/inode.c:1919: warning: passing argument 2 of 'reiserfs_inherit_default_acl' from incompatible pointer type fs/reiserfs/inode.c:1919: warning: passing argument 3 of 'reiserfs_inherit_default_acl' from incompatible pointer type fs/reiserfs/inode.c:1919: error: too many arguments to function 'reiserfs_inherit_default_acl' due to a missing transaction-handle argument in the non-acl compatibility function. Signed-off-by: Alexander Beregalov Acked-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- include/linux/reiserfs_acl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index 52240e02de02..8cc65757e47a 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -80,7 +80,8 @@ static inline int reiserfs_acl_chmod(struct inode *inode) } static inline int -reiserfs_inherit_default_acl(const struct inode *dir, struct dentry *dentry, +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, + const struct inode *dir, struct dentry *dentry, struct inode *inode) { return 0; -- cgit v1.2.3-71-gd317 From e180a6b7759a99a28cbcce3547c4c80822cb6c2a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:29 -0600 Subject: param: fix charp parameters set via sysfs Impact: fix crash on reading from /sys/module/.../ieee80211_default_rc_algo The module_param type "charp" simply sets a char * pointer in the module to the parameter in the commandline string: this is why we keep the (mangled) module command line around. But when set via sysfs (as about 11 charp parameters can be) this memory is freed on the way out of the write(). Future reads hit random mem. So we kstrdup instead: we have to check we're not in early commandline parsing, and we have to note when we've used it so we can reliably kfree the parameter when it's next overwritten, and also on module unload. (Thanks to Randy Dunlap for CONFIG_SYSFS=n fixes) Reported-by: Sitsofe Wheeler Diagnosed-by: Frederic Weisbecker Tested-by: Frederic Weisbecker Tested-by: Christof Schmitt Signed-off-by: Rusty Russell --- include/linux/module.h | 4 ++++ include/linux/moduleparam.h | 10 ++++++++++ kernel/module.c | 14 ++++++++------ kernel/params.c | 26 +++++++++++++++++++++++++- 4 files changed, 47 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 145a75528cc1..08e5e75d6122 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -248,6 +248,10 @@ struct module const unsigned long *crcs; unsigned int num_syms; + /* Kernel parameters. */ + struct kernel_param *kp; + unsigned int num_kp; + /* GPL-only exported symbols. 
*/ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index e4af3399ef48..a4f0b931846c 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -138,6 +138,16 @@ extern int parse_args(const char *name, unsigned num, int (*unknown)(char *param, char *val)); +/* Called by module remove. */ +#ifdef CONFIG_SYSFS +extern void destroy_params(const struct kernel_param *params, unsigned num); +#else +static inline void destroy_params(const struct kernel_param *params, + unsigned num) +{ +} +#endif /* !CONFIG_SYSFS */ + /* All the helper functions */ /* The macros to do compile-time type checking stolen from Jakub Jelinek, who IIRC came up with this idea for the 2.4 module init code. */ diff --git a/kernel/module.c b/kernel/module.c index f77ac320d0b5..b862fdb6a372 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1491,6 +1491,9 @@ static void free_module(struct module *mod) /* Module unload stuff */ module_unload_free(mod); + /* Free any allocated parameters. */ + destroy_params(mod->kp, mod->num_kp); + /* release any pointers to mcount in this module */ ftrace_release(mod->module_core, mod->core_size); @@ -1898,8 +1901,7 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_kp, num_mcount; - struct kernel_param *kp; + unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2144,8 +2146,8 @@ static noinline struct module *load_module(void __user *umod, /* Now we've got everything in the final locations, we can * find optional sections. */ - kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), - &num_kp); + mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", + sizeof(*mod->kp), &mod->num_kp); mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", sizeof(*mod->syms), &mod->num_syms); mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); @@ -2291,11 +2293,11 @@ static noinline struct module *load_module(void __user *umod, */ list_add_rcu(&mod->list, &modules); - err = parse_args(mod->name, mod->args, kp, num_kp, NULL); + err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, kp, num_kp); + err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); if (err < 0) goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); diff --git a/kernel/params.c b/kernel/params.c index a1e3025b19a9..de273ec85bd2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,6 +24,9 @@ #include #include +/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */ +#define KPARAM_KMALLOCED 0x80000000 + #if 0 #define DEBUGP printk #else @@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - *(char **)kp->arg = (char *)val; + if (kp->perm & KPARAM_KMALLOCED) + kfree(*(char **)kp->arg); + + /* This is a hack. We can't need to strdup in early boot, and we + * don't need to; this mangled commandline is preserved. 
*/ + if (slab_is_available()) { + kp->perm |= KPARAM_KMALLOCED; + *(char **)kp->arg = kstrdup(val, GFP_KERNEL); + if (!kp->arg) + return -ENOMEM; + } else + *(const char **)kp->arg = val; + return 0; } @@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod) } #endif +void destroy_params(const struct kernel_param *params, unsigned num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].perm & KPARAM_KMALLOCED) + kfree(*(char **)params[i].arg); +} + static void __init kernel_add_sysfs_param(const char *name, struct kernel_param *kparam, unsigned int name_skip) -- cgit v1.2.3-71-gd317 From e610499e2656e61975affd0af56b26eb73964c84 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:31 -0600 Subject: module: __module_address Impact: New API, cleanup ksplice wants to know the bounds of a module, not just the module text. It makes sense to have __module_address. We then implement is_module_address and __module_text_address in terms of this (and change is_module_text_address() to bool while we're at it). Also, add proper kerneldoc for them all. Cc: Anders Kaseorg Cc: Jeff Arnold Cc: Tim Abbott Signed-off-by: Rusty Russell --- include/linux/module.h | 20 +++++++++---- kernel/module.c | 76 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 08e5e75d6122..fd1241e1416f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -365,7 +365,9 @@ static inline int module_is_live(struct module *mod) /* Is this address in a module? (second is with no locks, for oops) */ struct module *module_text_address(unsigned long addr); struct module *__module_text_address(unsigned long addr); -int is_module_address(unsigned long addr); +struct module *__module_address(unsigned long addr); +bool is_module_address(unsigned long addr); +bool is_module_text_address(unsigned long addr); static inline int within_module_core(unsigned long addr, struct module *mod) { @@ -494,21 +496,29 @@ search_module_extables(unsigned long addr) return NULL; } -/* Is this address in a module? */ static inline struct module *module_text_address(unsigned long addr) { return NULL; } -/* Is this address in a module? (don't take a lock, we're oopsing) */ +static inline struct module *__module_address(unsigned long addr) +{ + return NULL; +} + static inline struct module *__module_text_address(unsigned long addr) { return NULL; } -static inline int is_module_address(unsigned long addr) +static inline bool is_module_address(unsigned long addr) { - return 0; + return false; +} + +static inline bool is_module_text_address(unsigned long addr) +{ + return false; } /* Get/put a kernel symbol (calls should be symmetric) */ diff --git a/kernel/module.c b/kernel/module.c index 2f0fddf3c114..bd15a94f91c1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -76,7 +76,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_text_address */ +/* Bounds of module allocation, for speeding __module_address */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -2745,29 +2745,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) } /* - * Is this a valid module address? + * is_module_address - is this address inside a module? + * @addr: the address to check. 
+ * + * See is_module_text_address() if you simply want to see if the address + * is code (not data). */ -int is_module_address(unsigned long addr) +bool is_module_address(unsigned long addr) { - struct module *mod; + bool ret; preempt_disable(); - - list_for_each_entry_rcu(mod, &modules, list) { - if (within_module_core(addr, mod)) { - preempt_enable(); - return 1; - } - } - + ret = __module_address(addr) != NULL; preempt_enable(); - return 0; + return ret; } - -/* Is this a valid kernel address? */ -__notrace_funcgraph struct module *__module_text_address(unsigned long addr) +/* + * __module_address - get the module which contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. + */ +__notrace_funcgraph struct module *__module_address(unsigned long addr) { struct module *mod; @@ -2775,12 +2777,50 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr) return NULL; list_for_each_entry_rcu(mod, &modules, list) - if (within(addr, mod->module_init, mod->init_text_size) - || within(addr, mod->module_core, mod->core_text_size)) + if (within_module_core(addr, mod) + || within_module_init(addr, mod)) return mod; return NULL; } +/* + * is_module_text_address - is this address inside module code? + * @addr: the address to check. + * + * See is_module_address() if you simply want to see if the address is + * anywhere in a module. See kernel_text_address() for testing if an + * address corresponds to kernel or module code. + */ +bool is_module_text_address(unsigned long addr) +{ + bool ret; + + preempt_disable(); + ret = __module_text_address(addr) != NULL; + preempt_enable(); + + return ret; +} + +/* + * __module_text_address - get the module whose code contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. + */ +struct module *__module_text_address(unsigned long addr) +{ + struct module *mod = __module_address(addr); + if (mod) { + /* Make sure it's within the text section. */ + if (!within(addr, mod->module_init, mod->init_text_size) + && !within(addr, mod->module_core, mod->core_text_size)) + mod = NULL; + } + return mod; +} + struct module *module_text_address(unsigned long addr) { struct module *mod; -- cgit v1.2.3-71-gd317 From a6e6abd575fcbe6572ebc7a70ad616406d206fa8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:31 -0600 Subject: module: remove module_text_address() Impact: Replace and remove risky (non-EXPORTed) API module_text_address() returns a pointer to the module, which given locking improvements in module.c, is useless except to test for NULL: 1) If the module can't go away, use __module_text_address. 2) Otherwise, just use is_module_text_address(). 
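For illustration, the two replacement patterns look roughly like this (hypothetical caller; the variable and message names are made up and are not part of this patch):

	unsigned long addr = (unsigned long)some_code_pointer;	/* hypothetical */

	/* 1) The module is pinned (e.g. we hold a reference), so it cannot be
	 *    freed while we inspect it: look up the struct module itself. */
	struct module *mod = __module_text_address(addr);
	if (mod)
		printk(KERN_INFO "addr lies in module %s\n", mod->name);

	/* 2) No such guarantee: only ask the yes/no question. */
	if (is_module_text_address(addr))
		printk(KERN_INFO "addr is module text\n");
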
Cc: linux-mtd@lists.infradead.org Signed-off-by: Rusty Russell --- drivers/mtd/nand/nand_base.c | 4 ++-- include/linux/module.h | 7 ------- kernel/extable.c | 6 +++--- kernel/module.c | 17 ++++------------- 4 files changed, 9 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0c3afccde8a2..5f71371eb1b0 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2720,14 +2720,14 @@ int nand_scan_tail(struct mtd_info *mtd) return chip->scan_bbt(mtd); } -/* module_text_address() isn't exported, and it's mostly a pointless +/* is_module_text_address() isn't exported, and it's mostly a pointless test if this is a module _anyway_ -- they'd have to try _really_ hard to call us from in-kernel code if the core NAND support is modular. */ #ifdef MODULE #define caller_is_module() (1) #else #define caller_is_module() \ - module_text_address((unsigned long)__builtin_return_address(0)) + is_module_text_address((unsigned long)__builtin_return_address(0)) #endif /** diff --git a/include/linux/module.h b/include/linux/module.h index fd1241e1416f..69761ce0dbf0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -362,8 +362,6 @@ static inline int module_is_live(struct module *mod) return mod->state != MODULE_STATE_GOING; } -/* Is this address in a module? (second is with no locks, for oops) */ -struct module *module_text_address(unsigned long addr); struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); @@ -496,11 +494,6 @@ search_module_extables(unsigned long addr) return NULL; } -static inline struct module *module_text_address(unsigned long addr) -{ - return NULL; -} - static inline struct module *__module_address(unsigned long addr) { return NULL; diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82ba..384f0da8a03e 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -58,14 +58,14 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return __module_text_address(addr) != NULL; + return is_module_text_address(addr); } int kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return module_text_address(addr) != NULL; + return is_module_text_address(addr); } /* @@ -81,5 +81,5 @@ int func_ptr_is_kernel_text(void *ptr) addr = (unsigned long) dereference_function_descriptor(ptr); if (core_kernel_text(addr)) return 1; - return module_text_address(addr) != NULL; + return is_module_text_address(addr); } diff --git a/kernel/module.c b/kernel/module.c index bd15a94f91c1..8ddca629e079 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -908,8 +908,10 @@ void symbol_put_addr(void *addr) if (core_kernel_text((unsigned long)addr)) return; - if (!(modaddr = module_text_address((unsigned long)addr))) - BUG(); + /* module_text_address is safe here: we're supposed to have reference + * to module from symbol_get, so it can't go away. */ + modaddr = __module_text_address((unsigned long)addr); + BUG_ON(!modaddr); module_put(modaddr); } EXPORT_SYMBOL_GPL(symbol_put_addr); @@ -2821,17 +2823,6 @@ struct module *__module_text_address(unsigned long addr) return mod; } -struct module *module_text_address(unsigned long addr) -{ - struct module *mod; - - preempt_disable(); - mod = __module_text_address(addr); - preempt_enable(); - - return mod; -} - /* Don't grab lock, we're oopsing. 
*/ void print_modules(void) { -- cgit v1.2.3-71-gd317 From 75a66614db21007bcc8c37f9c5d5b922981387b9 Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Fri, 5 Dec 2008 19:03:58 -0500 Subject: Ksplice: Add functions for walking kallsyms symbols Impact: New API kallsyms_lookup_name only returns the first match that it finds. Ksplice needs information about all symbols with a given name in order to correctly resolve local symbols. kallsyms_on_each_symbol provides a generic mechanism for iterating over the kallsyms table. Cc: Jeff Arnold Cc: Tim Abbott Signed-off-by: Anders Kaseorg Signed-off-by: Rusty Russell --- include/linux/kallsyms.h | 15 +++++++++++++++ include/linux/module.h | 12 ++++++++++++ kernel/kallsyms.c | 19 +++++++++++++++++++ kernel/module.c | 19 +++++++++++++++++++ 4 files changed, 65 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index f3fe34391d8e..792274269f2b 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -13,10 +13,17 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) +struct module; + #ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); +/* Call a function on each kallsyms symbol in the core kernel */ +int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), + void *data); + extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); @@ -43,6 +50,14 @@ static inline unsigned long kallsyms_lookup_name(const char *name) return 0; } +static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, + unsigned long), + void *data) +{ + return 0; +} + static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) diff --git a/include/linux/module.h b/include/linux/module.h index 69761ce0dbf0..c3d3fc4ffb18 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -387,6 +387,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, /* Look for this name: can be of form module:name. 
*/ unsigned long module_kallsyms_lookup_name(const char *name); +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data); + extern void __module_put_and_exit(struct module *mod, long code) __attribute__((noreturn)); #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code); @@ -566,6 +570,14 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name) return 0; } +static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, + unsigned long), + void *data) +{ + return 0; +} + static inline int register_module_notifier(struct notifier_block * nb) { /* no events will happen anyway, so this can always succeed */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7b8b0f21a5b1..374faf9bfdc7 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name) return module_kallsyms_lookup_name(name); } +int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), + void *data) +{ + char namebuf[KSYM_NAME_LEN]; + unsigned long i; + unsigned int off; + int ret; + + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { + off = kallsyms_expand_symbol(off, namebuf); + ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + if (ret != 0) + return ret; + } + return module_kallsyms_on_each_symbol(fn, data); +} +EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol); + static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) diff --git a/kernel/module.c b/kernel/module.c index 8ddca629e079..dd4389be9152 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2612,6 +2612,25 @@ unsigned long module_kallsyms_lookup_name(const char *name) preempt_enable(); return ret; } + +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + struct module *mod; + unsigned int i; + int ret; + + list_for_each_entry(mod, &modules, list) { + for (i = 0; i < mod->num_symtab; i++) { + ret = fn(data, mod->strtab + mod->symtab[i].st_name, + mod, mod->symtab[i].st_value); + if (ret != 0) + return ret; + } + } + return 0; +} #endif /* CONFIG_KALLSYMS */ static char *module_flags(struct module *mod, char *buf) -- cgit v1.2.3-71-gd317 From c6b37801911d7f4663c99cad8aa230bc934cea82 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Fri, 5 Dec 2008 19:03:59 -0500 Subject: module: Export symbols needed for Ksplice Impact: Expose some module.c symbols Ksplice uses several functions from module.c in order to resolve symbols and implement dependency handling. Calling these functions requires holding module_mutex, so it is exported. (This is just the module part of a bigger add-exports patch from Tim). Cc: Anders Kaseorg Cc: Jeff Arnold Signed-off-by: Tim Abbott Signed-off-by: Rusty Russell --- include/linux/module.h | 28 ++++++++++++++++++++++++++++ kernel/module.c | 43 +++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index c3d3fc4ffb18..d246da0b0f8c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -354,6 +354,8 @@ struct module #define MODULE_ARCH_INIT {} #endif +extern struct mutex module_mutex; + /* FIXME: It'd be nice to isolate modules during init, too, so they aren't used before they (may) fail. 
But presently too much code (IDE & SCSI) require entry into the module during init.*/ @@ -379,6 +381,31 @@ static inline int within_module_init(unsigned long addr, struct module *mod) addr < (unsigned long)mod->module_init + mod->init_size; } +/* Search for module by name: must hold module_mutex. */ +struct module *find_module(const char *name); + +struct symsearch { + const struct kernel_symbol *start, *stop; + const unsigned long *crcs; + enum { + NOT_GPL_ONLY, + GPL_ONLY, + WILL_BE_GPL_ONLY, + } licence; + bool unused; +}; + +/* Search for an exported symbol by name. */ +const struct kernel_symbol *find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn); + +/* Walk the exported symbol table */ +bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, + unsigned int symnum, void *data), void *data); + /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if symnum out of range. */ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, @@ -452,6 +479,7 @@ static inline void __module_get(struct module *module) #define symbol_put_addr(p) do { } while(0) #endif /* CONFIG_MODULE_UNLOAD */ +int use_module(struct module *a, struct module *b); /* This is a #define so the string doesn't get put in every .o file */ #define module_name(mod) \ diff --git a/kernel/module.c b/kernel/module.c index dd4389be9152..5fd00766a4dc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -68,7 +68,8 @@ /* List of modules, protected by module_mutex or preempt_disable * (delete uses stop_machine/add uses RCU list operations). */ -static DEFINE_MUTEX(module_mutex); +DEFINE_MUTEX(module_mutex); +EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); /* Waiting for a module to finish initializing? */ @@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[]; #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -struct symsearch { - const struct kernel_symbol *start, *stop; - const unsigned long *crcs; - enum { - NOT_GPL_ONLY, - GPL_ONLY, - WILL_BE_GPL_ONLY, - } licence; - bool unused; -}; - static bool each_symbol_in_section(const struct symsearch *arr, unsigned int arrsize, struct module *owner, @@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr, } /* Returns true as soon as fn returns true, otherwise false. 
*/ -static bool each_symbol(bool (*fn)(const struct symsearch *arr, - struct module *owner, - unsigned int symnum, void *data), - void *data) +bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, + unsigned int symnum, void *data), void *data) { struct module *mod; const struct symsearch arr[] = { @@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, } return false; } +EXPORT_SYMBOL_GPL(each_symbol); struct find_symbol_arg { /* Input */ @@ -330,11 +319,11 @@ static bool find_symbol_in_section(const struct symsearch *syms, /* Find a symbol and return it, along with, (optional) crc and * (optional) module which owns it */ -static const struct kernel_symbol *find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) +const struct kernel_symbol *find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) { struct find_symbol_arg fsa; @@ -353,9 +342,10 @@ static const struct kernel_symbol *find_symbol(const char *name, DEBUGP("Failed to find symbol %s\n", name); return NULL; } +EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. */ -static struct module *find_module(const char *name) +struct module *find_module(const char *name) { struct module *mod; @@ -365,6 +355,7 @@ static struct module *find_module(const char *name) } return NULL; } +EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP @@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b) } /* Module a uses b */ -static int use_module(struct module *a, struct module *b) +int use_module(struct module *a, struct module *b) { struct module_use *use; int no_warn, err; @@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b) no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 1; } +EXPORT_SYMBOL_GPL(use_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -951,10 +943,11 @@ static inline void module_unload_free(struct module *mod) { } -static inline int use_module(struct module *a, struct module *b) +int use_module(struct module *a, struct module *b) { return strong_try_module_get(b) == 0; } +EXPORT_SYMBOL_GPL(use_module); static inline void module_unload_init(struct module *mod) { @@ -2803,6 +2796,7 @@ __notrace_funcgraph struct module *__module_address(unsigned long addr) return mod; return NULL; } +EXPORT_SYMBOL_GPL(__module_address); /* * is_module_text_address - is this address inside module code? @@ -2841,6 +2835,7 @@ struct module *__module_text_address(unsigned long addr) } return mod; } +EXPORT_SYMBOL_GPL(__module_text_address); /* Don't grab lock, we're oopsing. */ void print_modules(void) -- cgit v1.2.3-71-gd317 From acae05156551fd7528fbb616271e672789388e3c Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 8 Feb 2009 10:42:01 -0800 Subject: module: create a request_module_nowait() There seems to be a common pattern in the kernel where drivers want to call request_module() from inside a module_init() function. Currently this would deadlock. As a result, several drivers go through hoops like scheduling things via kevent, or creating custom work queues (because kevent can deadlock on them). This patch changes this to use a request_module_nowait() function macro instead, which just fires the modprobe off but doesn't wait for it, and thus avoids the original deadlock entirely. 
On my laptop this already results in one less kernel thread running.. (Includes Jiri's patch to use enum umh_wait) Signed-off-by: Arjan van de Ven Signed-off-by: Rusty Russell (bool-ified) Cc: Jiri Slaby --- include/linux/kmod.h | 11 ++++++++--- kernel/kmod.c | 10 ++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 92213a9194e1..d5fa565086d1 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -29,10 +29,15 @@ #ifdef CONFIG_MODULES /* modprobe exit status on success, -ve on error. Return value * usually useless though. */ -extern int request_module(const char * name, ...) __attribute__ ((format (printf, 1, 2))); -#define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x))) +extern int __request_module(bool wait, const char *name, ...) \ + __attribute__((format(printf, 2, 3))); +#define request_module(mod...) __request_module(true, mod) +#define request_module_nowait(mod...) __request_module(false, mod) +#define try_then_request_module(x, mod...) \ + ((x) ?: (__request_module(false, mod), (x))) #else -static inline int request_module(const char * name, ...) { return -ENOSYS; } +static inline int request_module(const char *name, ...) { return -ENOSYS; } +static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; } #define try_then_request_module(x, mod...) (x) #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index f0c8f545180d..b750675251e5 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq; char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; /** - * request_module - try to load a kernel module + * __request_module - try to load a kernel module + * @wait: wait (or not) for the operation to complete * @fmt: printf style format string for the name of the module * @...: arguments as specified in the format string * @@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; * If module auto-loading support is disabled then this function * becomes a no-operation. */ -int request_module(const char *fmt, ...) +int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; @@ -108,11 +109,12 @@ int request_module(const char *fmt, ...) return -ENOMEM; } - ret = call_usermodehelper(modprobe_path, argv, envp, 1); + ret = call_usermodehelper(modprobe_path, argv, envp, + wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); return ret; } -EXPORT_SYMBOL(request_module); +EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ struct subprocess_info { -- cgit v1.2.3-71-gd317 From 66f92cf9d415e96a5bdd6c64de8dd8418595d2fc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:36 -0600 Subject: strstarts: helper function for !strncmp(str, prefix, strlen(prefix)) Impact: minor new API ksplice added a "starts_with" function, which seems like a common need. When people open-code it they seem to use fixed numbers rather than strlen, so it's quite a readability win (also, strncmp() almost always wants != 0 on it). So here's strstarts(). 
Cc: Anders Kaseorg Cc: Jeff Arnold Cc: Tim Abbott Signed-off-by: Rusty Russell --- include/linux/string.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index d18fc198aa2f..76ec218bb30f 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -114,5 +114,14 @@ extern bool sysfs_streq(const char *s1, const char *s2); extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); +/** + * strstarts - does @str start with @prefix? + * @str: string to examine + * @prefix: prefix to look for. + */ +static inline bool strstarts(const char *str, const char *prefix) +{ + return strncmp(str, prefix, strlen(prefix)) == 0; +} #endif #endif /* _LINUX_STRING_H_ */ -- cgit v1.2.3-71-gd317 From eea1bf384e05b5ab747f8530c4fba9e9e6907fff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: Fix is_mddev_idle test (again). There are two problems with is_mddev_idle. 1/ sync_io is 'atomic_t' and hence 'int'. curr_events and all the rest are 'long'. So if sync_io were to wrap on a 64bit host, the value of curr_events would go very negative suddenly, and take a very long time to return to positive. So do all calculations as 'int'. That gives us plenty of precision for what we need. 2/ To initialise rdev->last_events we simply call is_mddev_idle, on the assumption that it will make sure that last_events is in a suitable range. It used to do this, but now it does not. So now we need to be more explicit about initialisation. Signed-off-by: NeilBrown --- drivers/md/md.c | 16 ++++++++-------- include/linux/raid/md_k.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 03b4cd0a6344..a99c50e217c0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5716,19 +5716,19 @@ int unregister_md_personality(struct mdk_personality *p) return 0; } -static int is_mddev_idle(mddev_t *mddev) +static int is_mddev_idle(mddev_t *mddev, int init) { mdk_rdev_t * rdev; int idle; - long curr_events; + int curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = part_stat_read(&disk->part0, sectors[0]) + - part_stat_read(&disk->part0, sectors[1]) - - atomic_read(&disk->sync_io); + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. @@ -5751,7 +5751,7 @@ static int is_mddev_idle(mddev_t *mddev) * always make curr_events less than last_events. 
* */ - if (curr_events - rdev->last_events > 4096) { + if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } @@ -5994,7 +5994,7 @@ void md_do_sync(mddev_t *mddev) "(but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); - is_mddev_idle(mddev); /* this also initializes IO event counters */ + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { @@ -6096,7 +6096,7 @@ void md_do_sync(mddev_t *mddev) if (currspeed > speed_min(mddev)) { if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev)) { + !is_mddev_idle(mddev, 0)) { msleep(500); goto repeat; } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 9743e4dbc918..4aedb9fe2bd8 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -51,7 +51,7 @@ struct mdk_rdev_s sector_t size; /* Device size (in blocks) */ mddev_t *mddev; /* RAID array if running */ - long last_events; /* IO event timestamp */ + int last_events; /* IO event timestamp */ struct block_device *bdev; /* block device handle */ -- cgit v1.2.3-71-gd317 From ef740c372dfd80e706dbf955d4e4aedda6c0c148 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move headers out of include/linux/raid/ Move the headers with the local structures for the disciplines and bitmap.h into drivers/md/ so that they are more easily grepable for hacking and not far away. md.h is left where it is for now as there are some uses from the outside. Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 2 +- drivers/md/bitmap.h | 288 +++++++++++++++++++++++++++++ drivers/md/linear.c | 2 +- drivers/md/linear.h | 31 ++++ drivers/md/md.c | 2 +- drivers/md/multipath.c | 2 +- drivers/md/multipath.h | 42 +++++ drivers/md/raid0.c | 2 +- drivers/md/raid0.h | 30 +++ drivers/md/raid1.c | 4 +- drivers/md/raid1.h | 134 ++++++++++++++ drivers/md/raid10.c | 4 +- drivers/md/raid10.h | 123 +++++++++++++ drivers/md/raid5.c | 5 +- drivers/md/raid5.h | 402 +++++++++++++++++++++++++++++++++++++++++ drivers/md/raid6.h | 2 +- include/linux/raid/bitmap.h | 288 ----------------------------- include/linux/raid/linear.h | 31 ---- include/linux/raid/multipath.h | 42 ----- include/linux/raid/raid0.h | 30 --- include/linux/raid/raid1.h | 134 -------------- include/linux/raid/raid10.h | 123 ------------- include/linux/raid/raid5.h | 402 ----------------------------------------- 23 files changed, 1062 insertions(+), 1063 deletions(-) create mode 100644 drivers/md/bitmap.h create mode 100644 drivers/md/linear.h create mode 100644 drivers/md/multipath.h create mode 100644 drivers/md/raid0.h create mode 100644 drivers/md/raid1.h create mode 100644 drivers/md/raid10.h create mode 100644 drivers/md/raid5.h delete mode 100644 include/linux/raid/bitmap.h delete mode 100644 include/linux/raid/linear.h delete mode 100644 include/linux/raid/multipath.h delete mode 100644 include/linux/raid/raid0.h delete mode 100644 include/linux/raid/raid1.h delete mode 100644 include/linux/raid/raid10.h delete mode 100644 include/linux/raid/raid5.h (limited to 'include/linux') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 27f978dfe6a3..7666117738c7 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include "bitmap.h" /* debug macros */ diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h new file mode 100644 index 000000000000..e98900671ca9 --- 
/dev/null +++ b/drivers/md/bitmap.h @@ -0,0 +1,288 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +#define BITMAP_MINOR 39 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared aswell, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? 
*/ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? (this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ + BITMAP_HOSTENDIAN = 0x8000, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + + __u8 pad[256 - 64]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. 
+ */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * count of dirty bits on the page + */ + unsigned int count:31; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. + */ + unsigned long syncchunk; + + __u64 events_cleared; + int need_sync; + + /* bitmap spinlock */ + spinlock_t lock; + + long offset; /* offset from superblock if file is NULL */ + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + int last_page_size; /* bytes in the last page */ + + unsigned long flags; + + int allclean; + + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long daemon_sleep; /* how many seconds between updates? 
*/ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_flush(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); + +void bitmap_print_sb(struct bitmap *bitmap); +void bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); + +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); + +void bitmap_unplug(struct bitmap *bitmap); +void bitmap_daemon_work(struct bitmap *bitmap); +#endif + +#endif diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09658b218474..3603ffa9edc5 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,7 +16,7 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include +#include "linear.h" /* * find which device holds a particular offset diff --git a/drivers/md/linear.h b/drivers/md/linear.h new file mode 100644 index 000000000000..f38b9c586afb --- /dev/null +++ b/drivers/md/linear.h @@ -0,0 +1,31 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +#include + +struct dev_info { + mdk_rdev_t *rdev; + sector_t num_sectors; + sector_t start_sector; +}; + +typedef struct dev_info dev_info_t; + +struct linear_private_data +{ + struct linear_private_data *prev; /* earlier version */ + dev_info_t **hash_table; + sector_t spacing; + sector_t array_sectors; + int sector_shift; /* shift before dividing + * by spacing + */ + dev_info_t disks[0]; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 3efc0bceada2..9a3214c8585f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,7 +34,6 @@ #include #include -#include #include #include /* for invalidate_bdev */ #include @@ -45,6 +44,7 @@ #include #include #include +#include "bitmap.h" /* 63 partitions with the alternate major number (mdp) */ #define MdpMinorShift 6 diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index f6d08f241671..547df09a7af3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,7 +19,7 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include +#include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h new file mode 100644 index 000000000000..6f53fc177a47 --- /dev/null +++ b/drivers/md/multipath.h @@ -0,0 +1,42 @@ +#ifndef _MULTIPATH_H +#define _MULTIPATH_H + +#include + +struct multipath_info { + mdk_rdev_t *rdev; +}; + +struct multipath_private_data { + mddev_t *mddev; + struct multipath_info *multipaths; + int raid_disks; + int working_disks; + spinlock_t device_lock; + struct list_head retry_list; + + mempool_t *pool; +}; + +typedef struct multipath_private_data multipath_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' MULTIPATH buffer head. + * it contains information about what kind of IO operations were started + * for this MULTIPATH operation, and about their status: + */ + +struct multipath_bh { + mddev_t *mddev; + struct bio *master_bio; + struct bio bio; + int path; + struct list_head retry_list; +}; +#endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c605ba805586..ef09ed04864e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,7 +18,7 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include +#include "raid0.h" static void raid0_unplug(struct request_queue *q) { diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h new file mode 100644 index 000000000000..fd42aa87c391 --- /dev/null +++ b/drivers/md/raid0.h @@ -0,0 +1,30 @@ +#ifndef _RAID0_H +#define _RAID0_H + +#include + +struct strip_zone +{ + sector_t zone_start; /* Zone offset in md_dev (in sectors) */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ + sector_t sectors; /* Zone size in sectors */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t **dev; /* Devices attached to the zone */ +}; + +struct raid0_private_data +{ + struct strip_zone **hash_table; /* Table of indexes into strip_zone */ + struct strip_zone *strip_zone; + mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ + int nr_strip_zones; + + sector_t spacing; + int sector_shift; /* shift this before divide by spacing */ +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2466425d9ca..bff32285f8bb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -33,8 +33,8 @@ #include "dm-bio-list.h" #include -#include -#include +#include "raid1.h" +#include "bitmap.h" #define DEBUG 0 #if DEBUG diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h new file mode 100644 index 000000000000..0a9ba7c3302e --- /dev/null +++ b/drivers/md/raid1.h @@ -0,0 +1,134 @@ +#ifndef _RAID1_H +#define _RAID1_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +/* + * memory pools need a pointer to the mddev, so they can force an unplug + * when memory is tight, and a count of the number of drives that the + * pool was allocated for, so they know how much to allocate and free. + * mddev->raid_disks cannot be used, as it can change while a pool is active + * These two datums are stored in a kmalloced struct. 
+ */ + +struct pool_info { + mddev_t *mddev; + int raid_disks; +}; + + +typedef struct r1bio_s r1bio_t; + +struct r1_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + int last_used; + sector_t next_seq_sect; + spinlock_t device_lock; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + /* queue of writes that have been unplugged */ + struct bio_list flushing_bio_list; + + /* for use when syncing mirrors: */ + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + struct pool_info *poolinfo; + + struct page *tmppage; + + mempool_t *r1bio_pool; + mempool_t *r1buf_pool; +}; + +typedef struct r1_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID1 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct r1bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + atomic_t behind_remaining; /* number of write-behind ios remaining + * in this BehindIO request + */ + sector_t sector; + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_disk; + + struct list_head retry_list; + struct bitmap_update *bitmap_update; + /* + * if the IO is in WRITE direction, then multiple bios are used. + * We choose the number when they are allocated. + */ + struct bio *bios[0]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r1bio.state */ +#define R1BIO_Uptodate 0 +#define R1BIO_IsSync 1 +#define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +#define R1BIO_Barrier 4 +#define R1BIO_BarrierRetry 5 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, providing + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 6 + +#endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7301631abe04..f03dd70d12a5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -20,8 +20,8 @@ #include "dm-bio-list.h" #include -#include -#include +#include "raid10.h" +#include "bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. 
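The raid1.h and raid10.h headers above both carry the comment that mddev->private is the one point where the RAID code gives up C type safety: generic md code only sees an opaque pointer, and each personality casts it back to its own conf_t through a mddev_to_conf() macro. As a standalone illustration of that accessor pattern (a sketch only, not kernel code; toy_dev, toy_conf and toy_to_conf are made-up stand-ins for mddev_t, conf_t and mddev_to_conf):

/*
 * Sketch of the opaque-private-pointer pattern described in the
 * headers above. All names here are hypothetical stand-ins, not
 * taken from the kernel sources being moved.
 */
#include <stdio.h>

struct toy_dev {		/* plays the role of mddev_t */
	void *private;		/* opaque; owned by one personality */
};

struct toy_conf {		/* plays the role of a personality's conf_t */
	int raid_disks;
};

/* same shape as: #define mddev_to_conf(mddev) ((conf_t *) mddev->private) */
#define toy_to_conf(dev)	((struct toy_conf *) (dev)->private)

int main(void)
{
	struct toy_conf conf = { .raid_disks = 4 };
	struct toy_dev dev = { .private = &conf };

	printf("raid_disks = %d\n", toy_to_conf(&dev)->raid_disks);
	return 0;
}

Keeping the pointer opaque is what lets the core md code stay independent of each personality's private layout; only the personality that set mddev->private ever casts it back.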
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h new file mode 100644 index 000000000000..e9091cfeb286 --- /dev/null +++ b/drivers/md/raid10.h @@ -0,0 +1,123 @@ +#ifndef _RAID10_H +#define _RAID10_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +typedef struct r10bio_s r10bio_t; + +struct r10_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + spinlock_t device_lock; + + /* geometry */ + int near_copies; /* number of copies layed out raid0 style */ + int far_copies; /* number of copies layed out + * at large strides across drives + */ + int far_offset; /* far_copies are offset by 1 stripe + * instead of many + */ + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. + */ + + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; + struct page *tmppage; +}; + +typedef struct r10_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID10 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + */ + struct { + struct bio *bio; + sector_t addr; + int devnum; + } devs[0]; +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. 
To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r10bio.state */ +#define R10BIO_Uptodate 0 +#define R10BIO_IsSync 1 +#define R10BIO_IsRecover 2 +#define R10BIO_Degraded 3 +#endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5ba080d303b..f75698b1f63d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -44,10 +44,9 @@ */ #include -#include "raid6.h" - -#include #include +#include "raid6.h" +#include "bitmap.h" /* * Stripe cache diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h new file mode 100644 index 000000000000..40f1d0335c74 --- /dev/null +++ b/drivers/md/raid5.h @@ -0,0 +1,402 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +/* + * + * Each stripe contains one buffer per disc. Each buffer can be in + * one of a number of states stored in "flags". Changes between + * these states happen *almost* exclusively under a per-stripe + * spinlock. Some very specific changes can happen in bi_end_io, and + * these are not protected by the spin lock. + * + * The flag bits that are used to represent these states are: + * R5_UPTODATE and R5_LOCKED + * + * State Empty == !UPTODATE, !LOCK + * We have no data, and there is no active request + * State Want == !UPTODATE, LOCK + * A read request is being submitted for this block + * State Dirty == UPTODATE, LOCK + * Some new data is in this buffer, and it is being written out + * State Clean == UPTODATE, !LOCK + * We have valid data which is the same as on disc + * + * The possible state transitions are: + * + * Empty -> Want - on read or write to get old data for parity calc + * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) + * Empty -> Clean - on compute_block when computing a block for failed drive + * Want -> Empty - on failed read + * Want -> Clean - on successful completion of read request + * Dirty -> Clean - on successful completion of write request + * Dirty -> Clean - on failed write + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) + * + * The Want->Empty, Want->Clean, Dirty->Clean, transitions + * all happen in b_end_io at interrupt time. + * Each sets the Uptodate bit before releasing the Lock bit. + * This leaves one multi-stage transition: + * Want->Dirty->Clean + * This is safe because thinking that a Clean buffer is actually dirty + * will at worst delay some action, and the stripe will be scheduled + * for attention after the transition is complete. + * + * There is one possibility that is not covered by these states. That + * is if one drive has failed and there is a spare being rebuilt. We + * can't distinguish between a clean block that has been generated + * from parity calculations, and a clean block that has been + * successfully written to the spare ( or to parity when resyncing). + * To distingush these states we have a stripe bit STRIPE_INSYNC that + * is set whenever a write is scheduled to the spare, or to the parity + * disc if there is no spare. A sync request clears this bit, and + * when we find it set with no buffers locked, we know the sync is + * complete. + * + * Buffers for the md device that arrive via make_request are attached + * to the appropriate stripe in one of two lists linked on b_reqnext. + * One list (bh_read) for read requests, one (bh_write) for write. 
+ * There should never be more than one buffer on the two lists + * together, but we are not guaranteed of that so we allow for more. + * + * If a buffer is on the read list when the associated cache buffer is + * Uptodate, the data is copied into the read buffer and it's b_end_io + * routine is called. This may happen in the end_request routine only + * if the buffer has just successfully been read. end_request should + * remove the buffers from the list and then set the Uptodate bit on + * the buffer. Other threads may do this only if they first check + * that the Uptodate bit is set. Once they have checked that they may + * take buffers off the read queue. + * + * When a buffer on the write list is committed for write it is copied + * into the cache buffer, which is then marked dirty, and moved onto a + * third list, the written list (bh_written). Once both the parity + * block and the cached buffer are successfully written, any buffer on + * a written list can be returned with b_end_io. + * + * The write list and read list both act as fifos. The read list is + * protected by the device_lock. The write and written lists are + * protected by the stripe lock. The device_lock, which can be + * claimed while the stipe lock is held, is only for list + * manipulations and will only be held for a very short time. It can + * be claimed from interrupts. + * + * + * Stripes in the stripe cache can be on one of two lists (or on + * neither). The "inactive_list" contains stripes which are not + * currently being used for any request. They can freely be reused + * for another stripe. The "handle_list" contains stripes that need + * to be handled in some way. Both of these are fifo queues. Each + * stripe is also (potentially) linked to a hash bucket in the hash + * table so that it can be found by sector number. Stripes that are + * not hashed must be on the inactive_list, and will normally be at + * the front. All stripes start life this way. + * + * The inactive_list, handle_list and hash bucket lists are all protected by the + * device_lock. + * - stripes on the inactive_list never have their stripe_lock held. + * - stripes have a reference counter. If count==0, they are on a list. + * - If a stripe might need handling, STRIPE_HANDLE is set. + * - When refcount reaches zero, then if STRIPE_HANDLE it is put on + * handle_list else inactive_list + * + * This, combined with the fact that STRIPE_HANDLE is only ever + * cleared while a stripe has a non-zero count means that if the + * refcount is 0 and STRIPE_HANDLE is set, then it is on the + * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then + * the stripe is on inactive_list. + * + * The possible transitions are: + * activate an unhashed/inactive stripe (get_active_stripe()) + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev + * activate a hashed, possibly active stripe (get_active_stripe()) + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev + * attach a request to an active stripe (add_stripe_bh()) + * lockdev attach-buffer unlockdev + * handle a stripe (handle_stripe()) + * lockstripe clrSTRIPE_HANDLE ... + * (lockdev check-buffers unlockdev) .. + * change-state .. 
+ * record io/ops needed unlockstripe schedule io/ops + * release an active stripe (release_stripe()) + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev + * + * The refcount counts each thread that have activated the stripe, + * plus raid5d if it is handling it, plus one for each active request + * on a cached buffer, and plus one if the stripe is undergoing stripe + * operations. + * + * Stripe operations are performed outside the stripe lock, + * the stripe operations are: + * -copying data between the stripe cache and user application buffers + * -computing blocks to save a disk access, or to recover a missing block + * -updating the parity on a write operation (reconstruct write and + * read-modify-write) + * -checking parity correctness + * -running i/o to disk + * These operations are carried out by raid5_run_ops which uses the async_tx + * api to (optionally) offload operations to dedicated hardware engines. + * When requesting an operation handle_stripe sets the pending bit for the + * operation and increments the count. raid5_run_ops is then run whenever + * the count is non-zero. + * There are some critical dependencies between the operations that prevent some + * from being requested while another is in flight. + * 1/ Parity check operations destroy the in cache version of the parity block, + * so we prevent parity dependent operations like writes and compute_blocks + * from starting while a check is in progress. Some dma engines can perform + * the check without damaging the parity block, in these cases the parity + * block is re-marked up to date (assuming the check was successful) and is + * not re-read from disk. + * 2/ When a write operation is requested we immediately lock the affected + * blocks, and mark them as not up to date. This causes new read requests + * to be held off, as well as parity checks and compute block operations. + * 3/ Once a compute block operation has been requested handle_stripe treats + * that block as if it is up to date. raid5_run_ops guaruntees that any + * operation that is dependent on the compute block result is initiated after + * the compute block completes. + */ + +/* + * Operations state - intermediate states that are visible outside of sh->lock + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. 
For simple operations like biofill and + * compute that only have an _idle and _run state they are indicated with + * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) + */ +/** + * enum check_states - handles syncing / repairing a stripe + * @check_state_idle - check operations are quiesced + * @check_state_run - check operation is running + * @check_state_result - set outside lock when check result is valid + * @check_state_compute_run - check failed and we are repairing + * @check_state_compute_result - set outside lock when compute result is valid + */ +enum check_states { + check_state_idle = 0, + check_state_run, /* parity check */ + check_state_check_result, + check_state_compute_run, /* parity repair */ + check_state_compute_result, +}; + +/** + * enum reconstruct_states - handles writing or expanding a stripe + */ +enum reconstruct_states { + reconstruct_state_idle = 0, + reconstruct_state_prexor_drain_run, /* prexor-write */ + reconstruct_state_drain_run, /* write */ + reconstruct_state_run, /* expand */ + reconstruct_state_prexor_drain_result, + reconstruct_state_drain_result, + reconstruct_state_result, +}; + +struct stripe_head { + struct hlist_node hash; + struct list_head lru; /* inactive_list or handle_list */ + struct raid5_private_data *raid_conf; + sector_t sector; /* sector of this row */ + int pd_idx; /* parity disk index */ + unsigned long state; /* state flags */ + atomic_t count; /* nr of active thread/requests */ + spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ + enum check_states check_state; + enum reconstruct_states reconstruct_state; + /* stripe_operations + * @target - STRIPE_OP_COMPUTE_BLK target + */ + struct stripe_operations { + int target; + u32 zero_sum_result; + } ops; + struct r5dev { + struct bio req; + struct bio_vec vec; + struct page *page; + struct bio *toread, *read, *towrite, *written; + sector_t sector; /* sector of this page */ + unsigned long flags; + } dev[1]; /* allocated with extra space depending of RAID geometry */ +}; + +/* stripe_head_state - collects and tracks the dynamic state of a stripe_head + * for handle_stripe. 
It is only valid under spin_lock(sh->lock); + */ +struct stripe_head_state { + int syncing, expanding, expanded; + int locked, uptodate, to_read, to_write, failed, written; + int to_fill, compute, req_compute, non_overwrite; + int failed_num; + unsigned long ops_request; +}; + +/* r6_state - extra state data only relevant to r6 */ +struct r6_state { + int p_failed, q_failed, qd_idx, failed_num[2]; +}; + +/* Flags */ +#define R5_UPTODATE 0 /* page contains current data */ +#define R5_LOCKED 1 /* IO has been submitted on "req" */ +#define R5_OVERWRITE 2 /* towrite covers whole page */ +/* and some that are internal to handle_stripe */ +#define R5_Insync 3 /* rdev && rdev->in_sync at start */ +#define R5_Wantread 4 /* want to schedule a read */ +#define R5_Wantwrite 5 +#define R5_Overlap 7 /* There is a pending overlapping request on this block */ +#define R5_ReadError 8 /* seen a read error here recently */ +#define R5_ReWrite 9 /* have tried to over-write the readerror */ + +#define R5_Expanded 10 /* This block now has post-expand data */ +#define R5_Wantcompute 11 /* compute_block in progress treat as + * uptodate + */ +#define R5_Wantfill 12 /* dev->toread contains a bio that needs + * filling + */ +#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ +/* + * Write method + */ +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 +/* not a write method, but a compute_parity mode */ +#define CHECK_PARITY 3 + +/* + * Stripe state + */ +#define STRIPE_HANDLE 2 +#define STRIPE_SYNCING 3 +#define STRIPE_INSYNC 4 +#define STRIPE_PREREAD_ACTIVE 5 +#define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 +#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ +#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ +#define STRIPE_BIOFILL_RUN 14 +#define STRIPE_COMPUTE_RUN 15 +/* + * Operation request flags + */ +#define STRIPE_OP_BIOFILL 0 +#define STRIPE_OP_COMPUTE_BLK 1 +#define STRIPE_OP_PREXOR 2 +#define STRIPE_OP_BIODRAIN 3 +#define STRIPE_OP_POSTXOR 4 +#define STRIPE_OP_CHECK 5 + +/* + * Plugging: + * + * To improve write throughput, we need to delay the handling of some + * stripes until there has been a chance that several write requests + * for the one stripe have all been collected. + * In particular, any write request that would require pre-reading + * is put on a "delayed" queue until there are no stripes currently + * in a pre-read phase. Further, if the "delayed" queue is empty when + * a stripe is put on it then we "plug" the queue and do not process it + * until an unplug call is made. (the unplug_io_fn() is called). + * + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add + * it to the count of prereading stripes. + * When write is initiated, or the stripe refcnt == 0 (just in case) we + * clear the PREREAD_ACTIVE flag and decrement the count + * Whenever the 'handle' queue is empty and the device is not plugged, we + * move any strips from delayed to handle and clear the DELAYED flag and set + * PREREAD_ACTIVE. + * In stripe_handle, if we find pre-reading is necessary, we do it if + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. + * HANDLE gets cleared if stripe_handle leave nothing locked. 
+ */ + + +struct disk_info { + mdk_rdev_t *rdev; +}; + +struct raid5_private_data { + struct hlist_head *stripe_hashtbl; + mddev_t *mddev; + struct disk_info *spare; + int chunk_size, level, algorithm; + int max_degraded; + int raid_disks; + int max_nr_stripes; + + /* used during an expand */ + sector_t expand_progress; /* MaxSector when no expand happening */ + sector_t expand_lo; /* from here up to expand_progress it out-of-bounds + * as we haven't flushed the metadata yet + */ + int previous_raid_disks; + + struct list_head handle_list; /* stripes needing handling */ + struct list_head hold_list; /* preread ready stripes */ + struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ + struct bio *retry_read_aligned; /* currently retrying aligned bios */ + struct bio *retry_read_aligned_list; /* aligned bios retry list */ + atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t active_aligned_reads; + atomic_t pending_full_writes; /* full write backlog */ + int bypass_count; /* bypassed prereads */ + int bypass_threshold; /* preread nice */ + struct list_head *last_hold; /* detect hold_list promotions */ + + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; + struct kmem_cache *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + struct page *spare_page; /* Used when checking P/Q in raid6 */ + + /* + * Free stripes pool + */ + atomic_t active_stripes; + struct list_head inactive_list; + wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_overlap; + int inactive_blocked; /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + int pool_size; /* number of disks in stripeheads in pool */ + spinlock_t device_lock; + struct disk_info *disks; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +#endif diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h index 98dcde88470e..f6c13af65002 100644 --- a/drivers/md/raid6.h +++ b/drivers/md/raid6.h @@ -19,7 +19,7 @@ #define RAID6_USE_EMPTY_ZERO_PAGE 0 #include -#include +#include "raid5.h" typedef raid5_conf_t raid6_conf_t; /* Same configuration */ diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h deleted file mode 100644 index e98900671ca9..000000000000 --- a/include/linux/raid/bitmap.h +++ /dev/null @@ -1,288 +0,0 @@ -/* - * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 - * - * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. - */ -#ifndef BITMAP_H -#define BITMAP_H 1 - -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_HOSTENDIAN 3 - -#define BITMAP_MINOR 39 - -/* - * in-memory bitmap: - * - * Use 16 bit block counters to track pending writes to each "chunk". 
- * The 2 high order bits are special-purpose, the first is a flag indicating - * whether a resync is needed. The second is a flag indicating whether a - * resync is active. - * This means that the counter is actually 14 bits: - * - * +--------+--------+------------------------------------------------+ - * | resync | resync | counter | - * | needed | active | | - * | (0-1) | (0-1) | (0-16383) | - * +--------+--------+------------------------------------------------+ - * - * The "resync needed" bit is set when: - * a '1' bit is read from storage at startup. - * a write request fails on some drives - * a resync is aborted on a chunk with 'resync active' set - * It is cleared (and resync-active set) when a resync starts across all drives - * of the chunk. - * - * - * The "resync active" bit is set when: - * a resync is started on all drives, and resync_needed is set. - * resync_needed will be cleared (as long as resync_active wasn't already set). - * It is cleared when a resync completes. - * - * The counter counts pending write requests, plus the on-disk bit. - * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared aswell, thus setting the counter to 0. - * When we set a bit, or in the counter (to start a write), if the fields is - * 0, we first set the disk bit and set the counter to 1. - * - * If the counter is 0, the on-disk bit is clear and the stipe is clean - * Anything that dirties the stipe pushes the counter to 2 (at least) - * and sets the on-disk bit (lazily). - * If a periodic sweep find the counter at 2, it is decremented to 1. - * If the sweep find the counter at 1, the on-disk bit is cleared and the - * counter goes to zero. - * - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block - * counters as a fallback when "page" memory cannot be allocated: - * - * Normal case (page memory allocated): - * - * page pointer (32-bit) - * - * [ ] ------+ - * | - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) - * c1 c2 c2048 - * - * Hijacked case (page memory allocation failed): - * - * hijacked page pointer (32-bit) - * - * [ ][ ] (no page memory allocated) - * counter #1 (16-bit) counter #2 (16-bit) - * - */ - -#ifdef __KERNEL__ - -#define PAGE_BITS (PAGE_SIZE << 3) -#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) - -typedef __u16 bitmap_counter_t; -#define COUNTER_BITS 16 -#define COUNTER_BIT_SHIFT 4 -#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) -#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) - -#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) -#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) -#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) -#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) -#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) -#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) - -/* how many counters per page? */ -#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) -/* same, except a shift value for more efficient bitops */ -#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) -/* same, except a mask value for more efficient bitops */ -#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) - -#define BITMAP_BLOCK_SIZE 512 -#define BITMAP_BLOCK_SHIFT 9 - -/* how many blocks per chunk? 
(this is variable) */ -#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) - -/* when hijacked, the counters and bits represent even larger "chunks" */ -/* there will be 1024 chunks represented by each counter in the page pointers */ -#define PAGEPTR_BLOCK_RATIO(bitmap) \ - (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) -#define PAGEPTR_BLOCK_SHIFT(bitmap) \ - (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) -#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) - -/* - * on-disk bitmap: - * - * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap - * file a page at a time. There's a superblock at the start of the file. - */ - -/* map chunks (bits) to file pages - offset by the size of the superblock */ -#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) - -#endif - -/* - * bitmap structures: - */ - -#define BITMAP_MAGIC 0x6d746962 - -/* use these for bitmap->flags and bitmap->sb->state bit-fields */ -enum bitmap_state { - BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ - BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ - BITMAP_HOSTENDIAN = 0x8000, -}; - -/* the superblock at the front of the bitmap file -- little endian */ -typedef struct bitmap_super_s { - __le32 magic; /* 0 BITMAP_MAGIC */ - __le32 version; /* 4 the bitmap major for now, could change... */ - __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ - __le64 events; /* 24 event counter for the bitmap (1)*/ - __le64 events_cleared;/*32 event counter when last bit cleared (2) */ - __le64 sync_size; /* 40 the size of the md device's sync range(3) */ - __le32 state; /* 48 bitmap state information */ - __le32 chunksize; /* 52 the bitmap chunk size in bytes */ - __le32 daemon_sleep; /* 56 seconds between disk flushes */ - __le32 write_behind; /* 60 number of outstanding write-behind writes */ - - __u8 pad[256 - 64]; /* set to zero */ -} bitmap_super_t; - -/* notes: - * (1) This event counter is updated before the eventcounter in the md superblock - * When a bitmap is loaded, it is only accepted if this event counter is equal - * to, or one greater than, the event counter in the superblock. - * (2) This event counter is updated when the other one is *if*and*only*if* the - * array is not degraded. As bits are not cleared when the array is degraded, - * this represents the last time that any bits were cleared. - * If a device is being added that has an event count with this value or - * higher, it is accepted as conforming to the bitmap. - * (3)This is the number of sectors represented by the bitmap, and is the range that - * resync happens across. For raid1 and raid5/6 it is the size of individual - * devices. For raid10 it is the size of the array. 
- */ - -#ifdef __KERNEL__ - -/* the in-memory bitmap is represented by bitmap_pages */ -struct bitmap_page { - /* - * map points to the actual memory page - */ - char *map; - /* - * in emergencies (when map cannot be alloced), hijack the map - * pointer and use it as two counters itself - */ - unsigned int hijacked:1; - /* - * count of dirty bits on the page - */ - unsigned int count:31; -}; - -/* keep track of bitmap file pages that have pending writes on them */ -struct page_list { - struct list_head list; - struct page *page; -}; - -/* the main bitmap structure - one per mddev */ -struct bitmap { - struct bitmap_page *bp; - unsigned long pages; /* total number of pages in the bitmap */ - unsigned long missing_pages; /* number of pages not yet allocated */ - - mddev_t *mddev; /* the md device that the bitmap is for */ - - int counter_bits; /* how many bits per block counter */ - - /* bitmap chunksize -- how much data does each bit represent? */ - unsigned long chunksize; - unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ - unsigned long chunks; /* total number of data chunks for the array */ - - /* We hold a count on the chunk currently being synced, and drop - * it when the last block is started. If the resync is aborted - * midway, we need to be able to drop that count, so we remember - * the counted chunk.. - */ - unsigned long syncchunk; - - __u64 events_cleared; - int need_sync; - - /* bitmap spinlock */ - spinlock_t lock; - - long offset; /* offset from superblock if file is NULL */ - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap file superblock */ - struct page **filemap; /* list of cache pages for the file */ - unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file */ - int last_page_size; /* bytes in the last page */ - - unsigned long flags; - - int allclean; - - unsigned long max_write_behind; /* write-behind mode */ - atomic_t behind_writes; - - /* - * the bitmap daemon - periodically wakes up and sweeps the bitmap - * file, cleaning up bits and flushing out pages to disk as necessary - */ - unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long daemon_sleep; /* how many seconds between updates? 
*/ - unsigned long last_end_sync; /* when we lasted called end_sync to - * update bitmap with resync progress */ - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; - wait_queue_head_t overflow_wait; - -}; - -/* the bitmap API */ - -/* these are used only by md/bitmap */ -int bitmap_create(mddev_t *mddev); -void bitmap_flush(mddev_t *mddev); -void bitmap_destroy(mddev_t *mddev); - -void bitmap_print_sb(struct bitmap *bitmap); -void bitmap_update_sb(struct bitmap *bitmap); - -int bitmap_setallbits(struct bitmap *bitmap); -void bitmap_write_all(struct bitmap *bitmap); - -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); - -/* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind); -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); -void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); - -void bitmap_unplug(struct bitmap *bitmap); -void bitmap_daemon_work(struct bitmap *bitmap); -#endif - -#endif diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h deleted file mode 100644 index f38b9c586afb..000000000000 --- a/include/linux/raid/linear.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -#include - -struct dev_info { - mdk_rdev_t *rdev; - sector_t num_sectors; - sector_t start_sector; -}; - -typedef struct dev_info dev_info_t; - -struct linear_private_data -{ - struct linear_private_data *prev; /* earlier version */ - dev_info_t **hash_table; - sector_t spacing; - sector_t array_sectors; - int sector_shift; /* shift before dividing - * by spacing - */ - dev_info_t disks[0]; -}; - - -typedef struct linear_private_data linear_conf_t; - -#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) - -#endif diff --git a/include/linux/raid/multipath.h b/include/linux/raid/multipath.h deleted file mode 100644 index 6f53fc177a47..000000000000 --- a/include/linux/raid/multipath.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _MULTIPATH_H -#define _MULTIPATH_H - -#include - -struct multipath_info { - mdk_rdev_t *rdev; -}; - -struct multipath_private_data { - mddev_t *mddev; - struct multipath_info *multipaths; - int raid_disks; - int working_disks; - spinlock_t device_lock; - struct list_head retry_list; - - mempool_t *pool; -}; - -typedef struct multipath_private_data multipath_conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) - -/* - * this is our 'private' 'collective' MULTIPATH buffer head. 
- * it contains information about what kind of IO operations were started - * for this MULTIPATH operation, and about their status: - */ - -struct multipath_bh { - mddev_t *mddev; - struct bio *master_bio; - struct bio bio; - int path; - struct list_head retry_list; -}; -#endif diff --git a/include/linux/raid/raid0.h b/include/linux/raid/raid0.h deleted file mode 100644 index fd42aa87c391..000000000000 --- a/include/linux/raid/raid0.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -#include - -struct strip_zone -{ - sector_t zone_start; /* Zone offset in md_dev (in sectors) */ - sector_t dev_start; /* Zone offset in real dev (in sectors) */ - sector_t sectors; /* Zone size in sectors */ - int nb_dev; /* # of devices attached to the zone */ - mdk_rdev_t **dev; /* Devices attached to the zone */ -}; - -struct raid0_private_data -{ - struct strip_zone **hash_table; /* Table of indexes into strip_zone */ - struct strip_zone *strip_zone; - mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ - int nr_strip_zones; - - sector_t spacing; - int sector_shift; /* shift this before divide by spacing */ -}; - -typedef struct raid0_private_data raid0_conf_t; - -#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) - -#endif diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h deleted file mode 100644 index 0a9ba7c3302e..000000000000 --- a/include/linux/raid/raid1.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -typedef struct mirror_info mirror_info_t; - -struct mirror_info { - mdk_rdev_t *rdev; - sector_t head_position; -}; - -/* - * memory pools need a pointer to the mddev, so they can force an unplug - * when memory is tight, and a count of the number of drives that the - * pool was allocated for, so they know how much to allocate and free. - * mddev->raid_disks cannot be used, as it can change while a pool is active - * These two datums are stored in a kmalloced struct. - */ - -struct pool_info { - mddev_t *mddev; - int raid_disks; -}; - - -typedef struct r1bio_s r1bio_t; - -struct r1_private_data_s { - mddev_t *mddev; - mirror_info_t *mirrors; - int raid_disks; - int last_used; - sector_t next_seq_sect; - spinlock_t device_lock; - - struct list_head retry_list; - /* queue pending writes and submit them on unplug */ - struct bio_list pending_bio_list; - /* queue of writes that have been unplugged */ - struct bio_list flushing_bio_list; - - /* for use when syncing mirrors: */ - - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - sector_t next_resync; - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - wait_queue_head_t wait_barrier; - - struct pool_info *poolinfo; - - struct page *tmppage; - - mempool_t *r1bio_pool; - mempool_t *r1buf_pool; -}; - -typedef struct r1_private_data_s conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* - * this is our 'private' RAID1 bio. 
- * - * it contains information about what kind of IO operations were started - * for this RAID1 operation, and about their status: - */ - -struct r1bio_s { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - atomic_t behind_remaining; /* number of write-behind ios remaining - * in this BehindIO request - */ - sector_t sector; - int sectors; - unsigned long state; - mddev_t *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_disk; - - struct list_head retry_list; - struct bitmap_update *bitmap_update; - /* - * if the IO is in WRITE direction, then multiple bios are used. - * We choose the number when they are allocated. - */ - struct bio *bios[0]; - /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) - -/* bits for r1bio.state */ -#define R1BIO_Uptodate 0 -#define R1BIO_IsSync 1 -#define R1BIO_Degraded 2 -#define R1BIO_BehindIO 3 -#define R1BIO_Barrier 4 -#define R1BIO_BarrierRetry 5 -/* For write-behind requests, we call bi_end_io when - * the last non-write-behind device completes, providing - * any write was successful. Otherwise we call when - * any write-behind write succeeds, otherwise we call - * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... - */ -#define R1BIO_Returned 6 - -#endif diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h deleted file mode 100644 index e9091cfeb286..000000000000 --- a/include/linux/raid/raid10.h +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef _RAID10_H -#define _RAID10_H - -#include - -typedef struct mirror_info mirror_info_t; - -struct mirror_info { - mdk_rdev_t *rdev; - sector_t head_position; -}; - -typedef struct r10bio_s r10bio_t; - -struct r10_private_data_s { - mddev_t *mddev; - mirror_info_t *mirrors; - int raid_disks; - spinlock_t device_lock; - - /* geometry */ - int near_copies; /* number of copies layed out raid0 style */ - int far_copies; /* number of copies layed out - * at large strides across drives - */ - int far_offset; /* far_copies are offset by 1 stripe - * instead of many - */ - int copies; /* near_copies * far_copies. - * must be <= raid_disks - */ - sector_t stride; /* distance between far copies. - * This is size / far_copies unless - * far_offset, in which case it is - * 1 stripe. - */ - - int chunk_shift; /* shift from chunks to sectors */ - sector_t chunk_mask; - - struct list_head retry_list; - /* queue pending writes and submit them on unplug */ - struct bio_list pending_bio_list; - - - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - sector_t next_resync; - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - wait_queue_head_t wait_barrier; - - mempool_t *r10bio_pool; - mempool_t *r10buf_pool; - struct page *tmppage; -}; - -typedef struct r10_private_data_s conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. 
- */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* - * this is our 'private' RAID10 bio. - * - * it contains information about what kind of IO operations were started - * for this RAID10 operation, and about their status: - */ - -struct r10bio_s { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - sector_t sector; /* virtual sector number */ - int sectors; - unsigned long state; - mddev_t *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_slot; - - struct list_head retry_list; - /* - * if the IO is in WRITE direction, then multiple bios are used, - * one for each copy. - * When resyncing we also use one for each copy. - * When reconstructing, we use 2 bios, one for read, one for write. - * We choose the number when they are allocated. - */ - struct { - struct bio *bio; - sector_t addr; - int devnum; - } devs[0]; -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) - -/* bits for r10bio.state */ -#define R10BIO_Uptodate 0 -#define R10BIO_IsSync 1 -#define R10BIO_IsRecover 2 -#define R10BIO_Degraded 3 -#endif diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h deleted file mode 100644 index 3b2672792457..000000000000 --- a/include/linux/raid/raid5.h +++ /dev/null @@ -1,402 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#include -#include - -/* - * - * Each stripe contains one buffer per disc. Each buffer can be in - * one of a number of states stored in "flags". Changes between - * these states happen *almost* exclusively under a per-stripe - * spinlock. Some very specific changes can happen in bi_end_io, and - * these are not protected by the spin lock. - * - * The flag bits that are used to represent these states are: - * R5_UPTODATE and R5_LOCKED - * - * State Empty == !UPTODATE, !LOCK - * We have no data, and there is no active request - * State Want == !UPTODATE, LOCK - * A read request is being submitted for this block - * State Dirty == UPTODATE, LOCK - * Some new data is in this buffer, and it is being written out - * State Clean == UPTODATE, !LOCK - * We have valid data which is the same as on disc - * - * The possible state transitions are: - * - * Empty -> Want - on read or write to get old data for parity calc - * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) - * Empty -> Clean - on compute_block when computing a block for failed drive - * Want -> Empty - on failed read - * Want -> Clean - on successful completion of read request - * Dirty -> Clean - on successful completion of write request - * Dirty -> Clean - on failed write - * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) - * - * The Want->Empty, Want->Clean, Dirty->Clean, transitions - * all happen in b_end_io at interrupt time. - * Each sets the Uptodate bit before releasing the Lock bit. - * This leaves one multi-stage transition: - * Want->Dirty->Clean - * This is safe because thinking that a Clean buffer is actually dirty - * will at worst delay some action, and the stripe will be scheduled - * for attention after the transition is complete. 
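The four buffer states above are never stored directly; they are read back out of the R5_UPTODATE and R5_LOCKED bits. A stand-alone sketch of that decoding, using the same bit numbers as the R5_* defines later in this header (the helper name is made up for illustration):

#include <stdio.h>

#define R5_UPTODATE	0	/* same bit numbers as in this header */
#define R5_LOCKED	1

static const char *r5_buffer_state(unsigned long flags)
{
	int uptodate = !!(flags & (1UL << R5_UPTODATE));
	int locked   = !!(flags & (1UL << R5_LOCKED));

	if (!uptodate)
		return locked ? "Want" : "Empty";
	return locked ? "Dirty" : "Clean";
}

int main(void)
{
	/* UPTODATE set, LOCKED clear: valid data, same as on disc. */
	printf("%s\n", r5_buffer_state(1UL << R5_UPTODATE));	/* Clean */
	return 0;
}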
- * - * There is one possibility that is not covered by these states. That - * is if one drive has failed and there is a spare being rebuilt. We - * can't distinguish between a clean block that has been generated - * from parity calculations, and a clean block that has been - * successfully written to the spare ( or to parity when resyncing). - * To distingush these states we have a stripe bit STRIPE_INSYNC that - * is set whenever a write is scheduled to the spare, or to the parity - * disc if there is no spare. A sync request clears this bit, and - * when we find it set with no buffers locked, we know the sync is - * complete. - * - * Buffers for the md device that arrive via make_request are attached - * to the appropriate stripe in one of two lists linked on b_reqnext. - * One list (bh_read) for read requests, one (bh_write) for write. - * There should never be more than one buffer on the two lists - * together, but we are not guaranteed of that so we allow for more. - * - * If a buffer is on the read list when the associated cache buffer is - * Uptodate, the data is copied into the read buffer and it's b_end_io - * routine is called. This may happen in the end_request routine only - * if the buffer has just successfully been read. end_request should - * remove the buffers from the list and then set the Uptodate bit on - * the buffer. Other threads may do this only if they first check - * that the Uptodate bit is set. Once they have checked that they may - * take buffers off the read queue. - * - * When a buffer on the write list is committed for write it is copied - * into the cache buffer, which is then marked dirty, and moved onto a - * third list, the written list (bh_written). Once both the parity - * block and the cached buffer are successfully written, any buffer on - * a written list can be returned with b_end_io. - * - * The write list and read list both act as fifos. The read list is - * protected by the device_lock. The write and written lists are - * protected by the stripe lock. The device_lock, which can be - * claimed while the stipe lock is held, is only for list - * manipulations and will only be held for a very short time. It can - * be claimed from interrupts. - * - * - * Stripes in the stripe cache can be on one of two lists (or on - * neither). The "inactive_list" contains stripes which are not - * currently being used for any request. They can freely be reused - * for another stripe. The "handle_list" contains stripes that need - * to be handled in some way. Both of these are fifo queues. Each - * stripe is also (potentially) linked to a hash bucket in the hash - * table so that it can be found by sector number. Stripes that are - * not hashed must be on the inactive_list, and will normally be at - * the front. All stripes start life this way. - * - * The inactive_list, handle_list and hash bucket lists are all protected by the - * device_lock. - * - stripes on the inactive_list never have their stripe_lock held. - * - stripes have a reference counter. If count==0, they are on a list. - * - If a stripe might need handling, STRIPE_HANDLE is set. - * - When refcount reaches zero, then if STRIPE_HANDLE it is put on - * handle_list else inactive_list - * - * This, combined with the fact that STRIPE_HANDLE is only ever - * cleared while a stripe has a non-zero count means that if the - * refcount is 0 and STRIPE_HANDLE is set, then it is on the - * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then - * the stripe is on inactive_list. 
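Stated as a rule: with the reference count at zero, STRIPE_HANDLE decides between handle_list and inactive_list; the transition listing that follows spells this out call by call. A toy model of the release step, with placeholder names rather than the driver's own:

struct toy_stripe {
	int		count;			/* atomic_t in the real code */
	unsigned long	state;			/* holds STRIPE_HANDLE */
	int		on_handle_list;		/* stand-ins for the list heads */
	int		on_inactive_list;
};

#define TOY_STRIPE_HANDLE	2		/* same bit number as STRIPE_HANDLE */

static void toy_release_stripe(struct toy_stripe *sh)
{
	/* device_lock is held around this in the real release path */
	if (--sh->count == 0) {
		if (sh->state & (1UL << TOY_STRIPE_HANDLE))
			sh->on_handle_list = 1;		/* needs handling */
		else
			sh->on_inactive_list = 1;	/* free for reuse */
	}
}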
- * - * The possible transitions are: - * activate an unhashed/inactive stripe (get_active_stripe()) - * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev - * activate a hashed, possibly active stripe (get_active_stripe()) - * lockdev check-hash if(!cnt++)unlink-stripe unlockdev - * attach a request to an active stripe (add_stripe_bh()) - * lockdev attach-buffer unlockdev - * handle a stripe (handle_stripe()) - * lockstripe clrSTRIPE_HANDLE ... - * (lockdev check-buffers unlockdev) .. - * change-state .. - * record io/ops needed unlockstripe schedule io/ops - * release an active stripe (release_stripe()) - * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev - * - * The refcount counts each thread that have activated the stripe, - * plus raid5d if it is handling it, plus one for each active request - * on a cached buffer, and plus one if the stripe is undergoing stripe - * operations. - * - * Stripe operations are performed outside the stripe lock, - * the stripe operations are: - * -copying data between the stripe cache and user application buffers - * -computing blocks to save a disk access, or to recover a missing block - * -updating the parity on a write operation (reconstruct write and - * read-modify-write) - * -checking parity correctness - * -running i/o to disk - * These operations are carried out by raid5_run_ops which uses the async_tx - * api to (optionally) offload operations to dedicated hardware engines. - * When requesting an operation handle_stripe sets the pending bit for the - * operation and increments the count. raid5_run_ops is then run whenever - * the count is non-zero. - * There are some critical dependencies between the operations that prevent some - * from being requested while another is in flight. - * 1/ Parity check operations destroy the in cache version of the parity block, - * so we prevent parity dependent operations like writes and compute_blocks - * from starting while a check is in progress. Some dma engines can perform - * the check without damaging the parity block, in these cases the parity - * block is re-marked up to date (assuming the check was successful) and is - * not re-read from disk. - * 2/ When a write operation is requested we immediately lock the affected - * blocks, and mark them as not up to date. This causes new read requests - * to be held off, as well as parity checks and compute block operations. - * 3/ Once a compute block operation has been requested handle_stripe treats - * that block as if it is up to date. raid5_run_ops guaruntees that any - * operation that is dependent on the compute block result is initiated after - * the compute block completes. - */ - -/* - * Operations state - intermediate states that are visible outside of sh->lock - * In general _idle indicates nothing is running, _run indicates a data - * processing operation is active, and _result means the data processing result - * is stable and can be acted upon. 
For simple operations like biofill and - * compute that only have an _idle and _run state they are indicated with - * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) - */ -/** - * enum check_states - handles syncing / repairing a stripe - * @check_state_idle - check operations are quiesced - * @check_state_run - check operation is running - * @check_state_result - set outside lock when check result is valid - * @check_state_compute_run - check failed and we are repairing - * @check_state_compute_result - set outside lock when compute result is valid - */ -enum check_states { - check_state_idle = 0, - check_state_run, /* parity check */ - check_state_check_result, - check_state_compute_run, /* parity repair */ - check_state_compute_result, -}; - -/** - * enum reconstruct_states - handles writing or expanding a stripe - */ -enum reconstruct_states { - reconstruct_state_idle = 0, - reconstruct_state_prexor_drain_run, /* prexor-write */ - reconstruct_state_drain_run, /* write */ - reconstruct_state_run, /* expand */ - reconstruct_state_prexor_drain_result, - reconstruct_state_drain_result, - reconstruct_state_result, -}; - -struct stripe_head { - struct hlist_node hash; - struct list_head lru; /* inactive_list or handle_list */ - struct raid5_private_data *raid_conf; - sector_t sector; /* sector of this row */ - int pd_idx; /* parity disk index */ - unsigned long state; /* state flags */ - atomic_t count; /* nr of active thread/requests */ - spinlock_t lock; - int bm_seq; /* sequence number for bitmap flushes */ - int disks; /* disks in stripe */ - enum check_states check_state; - enum reconstruct_states reconstruct_state; - /* stripe_operations - * @target - STRIPE_OP_COMPUTE_BLK target - */ - struct stripe_operations { - int target; - u32 zero_sum_result; - } ops; - struct r5dev { - struct bio req; - struct bio_vec vec; - struct page *page; - struct bio *toread, *read, *towrite, *written; - sector_t sector; /* sector of this page */ - unsigned long flags; - } dev[1]; /* allocated with extra space depending of RAID geometry */ -}; - -/* stripe_head_state - collects and tracks the dynamic state of a stripe_head - * for handle_stripe. 
It is only valid under spin_lock(sh->lock); - */ -struct stripe_head_state { - int syncing, expanding, expanded; - int locked, uptodate, to_read, to_write, failed, written; - int to_fill, compute, req_compute, non_overwrite; - int failed_num; - unsigned long ops_request; -}; - -/* r6_state - extra state data only relevant to r6 */ -struct r6_state { - int p_failed, q_failed, qd_idx, failed_num[2]; -}; - -/* Flags */ -#define R5_UPTODATE 0 /* page contains current data */ -#define R5_LOCKED 1 /* IO has been submitted on "req" */ -#define R5_OVERWRITE 2 /* towrite covers whole page */ -/* and some that are internal to handle_stripe */ -#define R5_Insync 3 /* rdev && rdev->in_sync at start */ -#define R5_Wantread 4 /* want to schedule a read */ -#define R5_Wantwrite 5 -#define R5_Overlap 7 /* There is a pending overlapping request on this block */ -#define R5_ReadError 8 /* seen a read error here recently */ -#define R5_ReWrite 9 /* have tried to over-write the readerror */ - -#define R5_Expanded 10 /* This block now has post-expand data */ -#define R5_Wantcompute 11 /* compute_block in progress treat as - * uptodate - */ -#define R5_Wantfill 12 /* dev->toread contains a bio that needs - * filling - */ -#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ -/* - * Write method - */ -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 -/* not a write method, but a compute_parity mode */ -#define CHECK_PARITY 3 - -/* - * Stripe state - */ -#define STRIPE_HANDLE 2 -#define STRIPE_SYNCING 3 -#define STRIPE_INSYNC 4 -#define STRIPE_PREREAD_ACTIVE 5 -#define STRIPE_DELAYED 6 -#define STRIPE_DEGRADED 7 -#define STRIPE_BIT_DELAY 8 -#define STRIPE_EXPANDING 9 -#define STRIPE_EXPAND_SOURCE 10 -#define STRIPE_EXPAND_READY 11 -#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ -#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ -#define STRIPE_BIOFILL_RUN 14 -#define STRIPE_COMPUTE_RUN 15 -/* - * Operation request flags - */ -#define STRIPE_OP_BIOFILL 0 -#define STRIPE_OP_COMPUTE_BLK 1 -#define STRIPE_OP_PREXOR 2 -#define STRIPE_OP_BIODRAIN 3 -#define STRIPE_OP_POSTXOR 4 -#define STRIPE_OP_CHECK 5 - -/* - * Plugging: - * - * To improve write throughput, we need to delay the handling of some - * stripes until there has been a chance that several write requests - * for the one stripe have all been collected. - * In particular, any write request that would require pre-reading - * is put on a "delayed" queue until there are no stripes currently - * in a pre-read phase. Further, if the "delayed" queue is empty when - * a stripe is put on it then we "plug" the queue and do not process it - * until an unplug call is made. (the unplug_io_fn() is called). - * - * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add - * it to the count of prereading stripes. - * When write is initiated, or the stripe refcnt == 0 (just in case) we - * clear the PREREAD_ACTIVE flag and decrement the count - * Whenever the 'handle' queue is empty and the device is not plugged, we - * move any strips from delayed to handle and clear the DELAYED flag and set - * PREREAD_ACTIVE. - * In stripe_handle, if we find pre-reading is necessary, we do it if - * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. - * HANDLE gets cleared if stripe_handle leave nothing locked. 
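The policy boils down to two tests, sketched below with illustrative names (not the driver's functions): whether a stripe that needs a pre-read is handled now or parked on the delayed queue, and when the delayed queue is promoted back to the handle list.

#include <stdbool.h>
#include <stdio.h>

/* Handle now only if PREREAD_ACTIVE is already set on the stripe;
 * otherwise it gets STRIPE_DELAYED and waits for an unplug. */
static bool park_on_delayed_queue(bool needs_preread, bool preread_active)
{
	return needs_preread && !preread_active;
}

/* Delayed stripes move back to the handle list only once the handle
 * list is empty and the device has been unplugged. */
static bool promote_delayed(bool handle_list_empty, bool plugged)
{
	return handle_list_empty && !plugged;
}

int main(void)
{
	printf("%d %d\n", park_on_delayed_queue(true, false),	/* 1: park it */
			  promote_delayed(true, false));	/* 1: promote */
	return 0;
}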
- */ - - -struct disk_info { - mdk_rdev_t *rdev; -}; - -struct raid5_private_data { - struct hlist_head *stripe_hashtbl; - mddev_t *mddev; - struct disk_info *spare; - int chunk_size, level, algorithm; - int max_degraded; - int raid_disks; - int max_nr_stripes; - - /* used during an expand */ - sector_t expand_progress; /* MaxSector when no expand happening */ - sector_t expand_lo; /* from here up to expand_progress it out-of-bounds - * as we haven't flushed the metadata yet - */ - int previous_raid_disks; - - struct list_head handle_list; /* stripes needing handling */ - struct list_head hold_list; /* preread ready stripes */ - struct list_head delayed_list; /* stripes that have plugged requests */ - struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ - struct bio *retry_read_aligned; /* currently retrying aligned bios */ - struct bio *retry_read_aligned_list; /* aligned bios retry list */ - atomic_t preread_active_stripes; /* stripes with scheduled io */ - atomic_t active_aligned_reads; - atomic_t pending_full_writes; /* full write backlog */ - int bypass_count; /* bypassed prereads */ - int bypass_threshold; /* preread nice */ - struct list_head *last_hold; /* detect hold_list promotions */ - - atomic_t reshape_stripes; /* stripes with pending writes for reshape */ - /* unfortunately we need two cache names as we temporarily have - * two caches. - */ - int active_name; - char cache_name[2][20]; - struct kmem_cache *slab_cache; /* for allocating stripes */ - - int seq_flush, seq_write; - int quiesce; - - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - struct page *spare_page; /* Used when checking P/Q in raid6 */ - - /* - * Free stripes pool - */ - atomic_t active_stripes; - struct list_head inactive_list; - wait_queue_head_t wait_for_stripe; - wait_queue_head_t wait_for_overlap; - int inactive_blocked; /* release of inactive stripes blocked, - * waiting for 25% to be free - */ - int pool_size; /* number of disks in stripeheads in pool */ - spinlock_t device_lock; - struct disk_info *disks; -}; - -typedef struct raid5_private_data raid5_conf_t; - -#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif -- cgit v1.2.3-71-gd317 From 8b2b5c217c20b5460218ab8731295f2e46c7dd29 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move LEVEL_* definition from md_k.h to md_u.h .. as they are part of the user-space interface. Also move MdpMinorShift into there so we can remove duplication. Lastly move mdp_major in. It is less obviously part of the user-space interface, but do_mounts_md.c uses it, and it is acting a bit like user-space. Signed-off-by: NeilBrown --- drivers/md/md.c | 3 --- include/linux/raid/md.h | 2 -- include/linux/raid/md_k.h | 10 ---------- include/linux/raid/md_u.h | 17 +++++++++++++++++ init/do_mounts_md.c | 2 -- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9a3214c8585f..96336b050b59 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -46,9 +46,6 @@ #include #include "bitmap.h" -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 - #define DEBUG 0 #define dprintk(x...) 
((void)(DEBUG && printk(x))) diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 82bea14cae1a..8bfaf6b1d309 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -52,8 +52,6 @@ */ #define MD_PATCHLEVEL_VERSION 3 -extern int mdp_major; - extern int register_md_personality(struct mdk_personality *p); extern int unregister_md_personality(struct mdk_personality *p); extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4aedb9fe2bd8..758ec2842d9a 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -20,16 +20,6 @@ #ifdef CONFIG_BLOCK -#define LEVEL_MULTIPATH (-4) -#define LEVEL_LINEAR (-1) -#define LEVEL_FAULTY (-5) - -/* we need a value for 'no level specified' and 0 - * means 'raid0', so we need something else. This is - * for internal use only - */ -#define LEVEL_NONE (-1000000) - #define MaxSector (~(sector_t)0) typedef struct mddev_s mddev_t; diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h index 7192035fc4b0..2f824aa889f3 100644 --- a/include/linux/raid/md_u.h +++ b/include/linux/raid/md_u.h @@ -46,6 +46,12 @@ #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +/* 63 partitions with the alternate major number (mdp) */ +#define MdpMinorShift 6 +#ifdef __KERNEL__ +extern int mdp_major; +#endif + typedef struct mdu_version_s { int major; int minor; @@ -85,6 +91,17 @@ typedef struct mdu_array_info_s { } mdu_array_info_t; +/* non-obvious values for 'level' */ +#define LEVEL_MULTIPATH (-4) +#define LEVEL_LINEAR (-1) +#define LEVEL_FAULTY (-5) + +/* we need a value for 'no level specified' and 0 + * means 'raid0', so we need something else. This is + * for internal use only + */ +#define LEVEL_NONE (-1000000) + typedef struct mdu_disk_info_s { /* * configuration/status of one particular disk diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 9bdddbcb3d6a..23a15fb57e15 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -112,8 +112,6 @@ static int __init md_setup(char *str) return 1; } -#define MdpMinorShift 6 - static void __init md_setup_drive(void) { int minor, i, ent, partitioned; -- cgit v1.2.3-71-gd317 From 92022950c6b1bb3da90b2976b20271cdfd98b8a3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move most content from md.h to md_k.h The extern function definitions are kernel-internal definitions, so they belong in md_k.h The MD_*_VERSION values could reasonably go in a number of places, but md_u.h seems most reasonable. This leaves almost nothing in md.h. It will go soon. Signed-off-by: NeilBrown --- include/linux/raid/md.h | 40 ---------------------------------------- include/linux/raid/md_k.h | 22 ++++++++++++++++++++++ include/linux/raid/md_u.h | 18 ++++++++++++++++++ 3 files changed, 40 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 8bfaf6b1d309..71c4fd19c317 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -34,46 +34,6 @@ #ifdef CONFIG_MD -/* - * Different major versions are not compatible. - * Different minor versions are only downward compatible. - * Different patchlevel versions are downward and upward compatible. - */ -#define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 90 -/* - * MD_PATCHLEVEL_VERSION indicates kernel functionality. 
- * >=1 means different superblock formats are selectable using SET_ARRAY_INFO - * and major_version/minor_version accordingly - * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT - * in the super status byte - * >=3 means that bitmap superblock version 4 is supported, which uses - * little-ending representation rather than host-endian - */ -#define MD_PATCHLEVEL_VERSION 3 - -extern int register_md_personality(struct mdk_personality *p); -extern int unregister_md_personality(struct mdk_personality *p); -extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), - mddev_t *mddev, const char *name); -extern void md_unregister_thread(mdk_thread_t *thread); -extern void md_wakeup_thread(mdk_thread_t *thread); -extern void md_check_recovery(mddev_t *mddev); -extern void md_write_start(mddev_t *mddev, struct bio *bi); -extern void md_write_end(mddev_t *mddev); -extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); - -extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, - sector_t sector, int size, struct page *page); -extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw); -extern void md_do_sync(mddev_t *mddev); -extern void md_new_event(mddev_t *mddev); -extern int md_allow_write(mddev_t *mddev); -extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); - #endif /* CONFIG_MD */ #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 758ec2842d9a..4c5e2d00ff5e 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -390,3 +390,25 @@ static inline void safe_put_page(struct page *p) #endif /* CONFIG_BLOCK */ #endif + +extern int register_md_personality(struct mdk_personality *p); +extern int unregister_md_personality(struct mdk_personality *p); +extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), + mddev_t *mddev, const char *name); +extern void md_unregister_thread(mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_check_recovery(mddev_t *mddev); +extern void md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_end(mddev_t *mddev); +extern void md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); + +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); +extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h index 2f824aa889f3..fb1abb3367e9 100644 --- a/include/linux/raid/md_u.h +++ b/include/linux/raid/md_u.h @@ -15,6 +15,24 @@ #ifndef _MD_U_H #define _MD_U_H +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. + */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +/* + * MD_PATCHLEVEL_VERSION indicates kernel functionality. 
+ * >=1 means different superblock formats are selectable using SET_ARRAY_INFO + * and major_version/minor_version accordingly + * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT + * in the super status byte + * >=3 means that bitmap superblock version 4 is supported, which uses + * little-ending representation rather than host-endian + */ +#define MD_PATCHLEVEL_VERSION 3 + /* ioctls */ /* status */ -- cgit v1.2.3-71-gd317 From bff61975b3d6c18ee31457cc5b4d73042f44915f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move lots of #include lines out of .h files and into .c This makes the includes more explicit, and is preparation for moving md_k.h to drivers/md/md.h Remove include/raid/md.h as its only remaining use was to #include other files. Signed-off-by: NeilBrown --- crypto/xor.c | 2 +- drivers/md/bitmap.c | 3 ++- drivers/md/faulty.c | 5 ++++- drivers/md/linear.c | 4 ++++ drivers/md/linear.h | 2 -- drivers/md/md.c | 6 +++++- drivers/md/multipath.c | 4 ++++ drivers/md/multipath.h | 2 -- drivers/md/raid0.c | 3 +++ drivers/md/raid0.h | 2 -- drivers/md/raid1.c | 5 ++++- drivers/md/raid1.h | 2 -- drivers/md/raid10.c | 5 ++++- drivers/md/raid10.h | 2 -- drivers/md/raid5.c | 8 ++++++-- drivers/md/raid5.h | 1 - drivers/md/raid6.h | 6 +----- fs/compat_ioctl.c | 2 +- include/linux/raid/md.h | 39 --------------------------------------- include/linux/raid/md_k.h | 3 --- include/linux/raid/xor.h | 2 -- init/do_mounts.h | 1 + init/do_mounts_md.c | 3 ++- 23 files changed, 42 insertions(+), 70 deletions(-) delete mode 100644 include/linux/raid/md.h (limited to 'include/linux') diff --git a/crypto/xor.c b/crypto/xor.c index b2e6db075e49..996b6ee57d9e 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -18,8 +18,8 @@ #define BH_TRACE 0 #include -#include #include +#include #include /* The xor routines to use. */ diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 7666117738c7..1df012e9d73d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -16,6 +16,7 @@ * wait if count gets too high, wake when it drops to half. */ +#include #include #include #include @@ -26,7 +27,7 @@ #include #include #include -#include +#include #include "bitmap.h" /* debug macros */ diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 86d9adf90e79..cc5d2cf08dfc 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -62,7 +62,10 @@ #define ModeShift 5 #define MaxFault 50 -#include +#include +#include +#include +#include static void faulty_fail(struct bio *bio, int error) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3603ffa9edc5..c43c3b60ef09 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,6 +16,10 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include +#include +#include +#include #include "linear.h" /* diff --git a/drivers/md/linear.h b/drivers/md/linear.h index f38b9c586afb..bf8179587f95 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -1,8 +1,6 @@ #ifndef _LINEAR_H #define _LINEAR_H -#include - struct dev_info { mdk_rdev_t *rdev; sector_t num_sectors; diff --git a/drivers/md/md.c b/drivers/md/md.c index 96336b050b59..11d6e0e1045a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,8 +33,9 @@ */ #include -#include +#include #include +#include #include /* for invalidate_bdev */ #include #include @@ -44,6 +45,9 @@ #include #include #include +#include +#include +#include #include "bitmap.h" #define DEBUG 0 diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 547df09a7af3..148b3cd058bf 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,6 +19,10 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include +#include +#include +#include #include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index 6f53fc177a47..6fa70b400cda 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h @@ -1,8 +1,6 @@ #ifndef _MULTIPATH_H #define _MULTIPATH_H -#include - struct multipath_info { mdk_rdev_t *rdev; }; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ef09ed04864e..64e4c77a1568 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,6 +18,9 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include +#include +#include #include "raid0.h" static void raid0_unplug(struct request_queue *q) diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index fd42aa87c391..824b12eb1d4f 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -1,8 +1,6 @@ #ifndef _RAID0_H #define _RAID0_H -#include - struct strip_zone { sector_t zone_start; /* Zone offset in md_dev (in sectors) */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bff32285f8bb..253b09c86eca 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -31,8 +31,11 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include +#include +#include +#include +#include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 0a9ba7c3302e..1620eea3d57c 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,8 +1,6 @@ #ifndef _RAID1_H #define _RAID1_H -#include - typedef struct mirror_info mirror_info_t; struct mirror_info { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f03dd70d12a5..186e1b199d46 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -18,8 +18,11 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include +#include +#include +#include +#include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index e9091cfeb286..244dbe507a54 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -1,8 +1,6 @@ #ifndef _RAID10_H #define _RAID10_H -#include - typedef struct mirror_info mirror_info_t; struct mirror_info { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f75698b1f63d..816157e7d8e0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -43,8 +43,12 @@ * miss any bits. 
*/ +#include +#include #include #include +#include +#include "raid5.h" #include "raid6.h" #include "bitmap.h" @@ -1467,7 +1471,7 @@ static void copy_data(int frombio, struct bio *bio, static void compute_parity6(struct stripe_head *sh, int method) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ @@ -2795,7 +2799,7 @@ static bool handle_stripe5(struct stripe_head *sh) static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; struct bio *return_bi = NULL; int i, pd_idx = sh->pd_idx; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 40f1d0335c74..0ed22dff56e0 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -1,7 +1,6 @@ #ifndef _RAID5_H #define _RAID5_H -#include #include /* diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h index f6c13af65002..66e6b0c6734f 100644 --- a/drivers/md/raid6.h +++ b/drivers/md/raid6.h @@ -17,11 +17,7 @@ /* Set to 1 to use kernel-wide empty_zero_page */ #define RAID6_USE_EMPTY_ZERO_PAGE 0 - -#include -#include "raid5.h" - -typedef raid5_conf_t raid6_conf_t; /* Same configuration */ +#include /* Additional compute_parity mode -- updates the parity w/o LOCKING */ #define UPDATE_PARITY 4 diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 45e59d3c7f1f..141c03829153 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h deleted file mode 100644 index 71c4fd19c317..000000000000 --- a/include/linux/raid/md.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - md.h : Multiple Devices driver for Linux - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - Copyright (C) 1994-96 Marc ZYNGIER - or - - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_H -#define _MD_H - -#include -#include - -/* - * 'md_p.h' holds the 'physical' layout of RAID devices - * 'md_u.h' holds the user <=> kernel API - * - * 'md_k.h' holds kernel internal definitions - */ - -#include -#include -#include - -#ifdef CONFIG_MD - -#endif /* CONFIG_MD */ -#endif - diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4c5e2d00ff5e..e78b3c1d55fd 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -15,9 +15,6 @@ #ifndef _MD_K_H #define _MD_K_H -/* and dm-bio-list.h is not under include/linux because.... ??? 
*/ -#include "../../../drivers/md/dm-bio-list.h" - #ifdef CONFIG_BLOCK #define MaxSector (~(sector_t)0) diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 3e120587eada..5a210959e3f8 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -1,8 +1,6 @@ #ifndef _XOR_H #define _XOR_H -#include - #define MAX_XOR_BLOCKS 4 extern void xor_blocks(unsigned int count, unsigned int bytes, diff --git a/init/do_mounts.h b/init/do_mounts.h index 9aa968d54329..f5b978a9bb92 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 23a15fb57e15..69aebbf8fd2d 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -1,5 +1,6 @@ #include -#include +#include +#include #include "do_mounts.h" -- cgit v1.2.3-71-gd317 From 43b2e5d86d8bdd77386226db0bc961529492c043 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move md_k.h from include/linux/raid/ to drivers/md/ It really is nicer to keep related code together.. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 2 +- drivers/md/faulty.c | 2 +- drivers/md/linear.c | 2 +- drivers/md/md.c | 2 +- drivers/md/md.h | 411 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/multipath.c | 2 +- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- include/linux/raid/md_k.h | 411 ---------------------------------------------- 11 files changed, 420 insertions(+), 420 deletions(-) create mode 100644 drivers/md/md.h delete mode 100644 include/linux/raid/md_k.h (limited to 'include/linux') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1df012e9d73d..623292a5473e 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include "md.h" #include "bitmap.h" /* debug macros */ diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index cc5d2cf08dfc..7b66b9fca29d 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -64,7 +64,7 @@ #define MaxFault 50 #include #include -#include +#include "md.h" #include diff --git a/drivers/md/linear.c b/drivers/md/linear.c index c43c3b60ef09..f2488343ed4a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -18,8 +18,8 @@ #include #include -#include #include +#include "md.h" #include "linear.h" /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 11d6e0e1045a..aad0ac54bf90 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -45,9 +45,9 @@ #include #include #include -#include #include #include +#include "md.h" #include "bitmap.h" #define DEBUG 0 diff --git a/drivers/md/md.h b/drivers/md/md.h new file mode 100644 index 000000000000..e78b3c1d55fd --- /dev/null +++ b/drivers/md/md.h @@ -0,0 +1,411 @@ +/* + md_k.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#ifndef _MD_K_H +#define _MD_K_H + +#ifdef CONFIG_BLOCK + +#define MaxSector (~(sector_t)0) + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +/* + * options passed in raidrun: + */ + +/* Currently this must fit in an 'int' */ +#define MAX_CHUNK_SIZE (1<<30) + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct list_head same_set; /* RAID devices within the same set */ + + sector_t size; /* Device size (in blocks) */ + mddev_t *mddev; /* RAID array if running */ + int last_events; /* IO event timestamp */ + + struct block_device *bdev; /* block device handle */ + + struct page *sb_page; + int sb_loaded; + __u64 sb_events; + sector_t data_offset; /* start of data in array */ + sector_t sb_start; /* offset of the super block (in 512byte sectors) */ + int sb_size; /* bytes in the superblock */ + int preferred_minor; /* autorun support */ + + struct kobject kobj; + + /* A device can be in one of three states based on two flags: + * Not working: faulty==1 in_sync==0 + * Fully working: faulty==0 in_sync==1 + * Working, but not + * in sync with array + * faulty==0 in_sync==0 + * + * It can never have faulty==1, in_sync==1 + * This reduces the burden of testing multiple flags in many cases + */ + + unsigned long flags; +#define Faulty 1 /* device is known to have a fault */ +#define In_sync 2 /* device is in_sync with rest of array */ +#define WriteMostly 4 /* Avoid reading if at all possible */ +#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ +#define AllReserved 6 /* If whole device is reserved for + * one array */ +#define AutoDetected 7 /* added by auto-detect */ +#define Blocked 8 /* An error occured on an externally + * managed array, don't allow writes + * until it is cleared */ +#define StateChanged 9 /* Faulty or Blocked has changed during + * interrupt, so it needs to be + * notified by the thread */ + wait_queue_head_t blocked_wait; + + int desc_nr; /* descriptor index in the superblock */ + int raid_disk; /* role of device in array */ + int saved_raid_disk; /* role that device used to have in the + * array and could again if we did a partial + * resync from the bitmap + */ + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ + + atomic_t nr_pending; /* number of pending requests. + * only maintained for arrays that + * support hot removal + */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ + atomic_t corrected_errors; /* number of corrected read errors, + * for reporting to userspace and storing + * in superblock. 
+ */ + struct work_struct del_work; /* used for delayed sysfs removal */ + + struct sysfs_dirent *sysfs_state; /* handle for 'state' + * sysfs entry */ +}; + +struct mddev_s +{ + void *private; + struct mdk_personality *pers; + dev_t unit; + int md_minor; + struct list_head disks; + unsigned long flags; +#define MD_CHANGE_DEVS 0 /* Some device status has changed */ +#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ +#define MD_CHANGE_PENDING 2 /* superblock update in progress */ + + int ro; + + struct gendisk *gendisk; + + struct kobject kobj; + int hold_active; +#define UNTIL_IOCTL 1 +#define UNTIL_STOP 2 + + /* Superblock information */ + int major_version, + minor_version, + patch_version; + int persistent; + int external; /* metadata is + * managed externally */ + char metadata_type[17]; /* externally set*/ + int chunk_size; + time_t ctime, utime; + int level, layout; + char clevel[16]; + int raid_disks; + int max_disks; + sector_t size; /* used size of component devices */ + sector_t array_sectors; /* exported array size */ + __u64 events; + + char uuid[16]; + + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout, new_chunk; + + struct mdk_thread_s *thread; /* management thread */ + struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ + sector_t curr_resync; /* last block scheduled */ + unsigned long resync_mark; /* a recent timestamp */ + sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t curr_mark_cnt; /* blocks scheduled now */ + + sector_t resync_max_sectors; /* may be set by personality */ + + sector_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; + /* if zero, use the system-wide default */ + int sync_speed_min; + int sync_speed_max; + + /* resync even though the same disks are shared among md-devices */ + int parallel_resync; + + int ok_start_degraded; + /* recovery/resync flags + * NEEDED: we might need to start a resync/recover + * RUNNING: a thread is running, or about to be started + * SYNC: actually doing a resync, not a recovery + * RECOVER: doing recovery, or need to try it. + * INTR: resync needs to be aborted for some reason + * DONE: thread is done and is waiting to be reaped + * REQUEST: user-space has requested a sync (used with SYNC) + * CHECK: user-space request for for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC or RESHAPE are set, then it is a recovery. 
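In other words, the kind of sync_thread can be read straight off the recovery word: SYNC means a resync, RESHAPE means a reshape, neither means a plain recovery. A stand-alone illustration using the same bit numbers as the defines that follow (the function itself is hypothetical):

#include <stdio.h>

#define MD_RECOVERY_SYNC	1	/* bit numbers as defined below */
#define MD_RECOVERY_RESHAPE	8

static const char *recovery_kind(unsigned long recovery)
{
	if (recovery & (1UL << MD_RECOVERY_SYNC))
		return "resync";
	if (recovery & (1UL << MD_RECOVERY_RESHAPE))
		return "reshape";
	return "recovery";		/* neither SYNC nor RESHAPE set */
}

int main(void)
{
	printf("%s\n", recovery_kind(1UL << MD_RECOVERY_RESHAPE));	/* reshape */
	return 0;
}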
+ */ +#define MD_RECOVERY_RUNNING 0 +#define MD_RECOVERY_SYNC 1 +#define MD_RECOVERY_RECOVER 2 +#define MD_RECOVERY_INTR 3 +#define MD_RECOVERY_DONE 4 +#define MD_RECOVERY_NEEDED 5 +#define MD_RECOVERY_REQUESTED 6 +#define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 +#define MD_RECOVERY_FROZEN 9 + + unsigned long recovery; + int recovery_disabled; /* if we detect that recovery + * will always fail, set this + * so we don't loop trying */ + + int in_sync; /* know to not need resync */ + struct mutex reconfig_mutex; + atomic_t active; /* general refcount */ + atomic_t openers; /* number of active opens */ + + int changed; /* true if we might need to reread partition info */ + int degraded; /* whether md should consider + * adding a spare + */ + int barriers_work; /* initialised to true, cleared as soon + * as a barrier request to slave + * fails. Only supported + */ + struct bio *biolist; /* bios that need to be retried + * because BIO_RW_BARRIER is not supported + */ + + atomic_t recovery_active; /* blocks scheduled, but not written */ + wait_queue_head_t recovery_wait; + sector_t recovery_cp; + sector_t resync_min; /* user requested sync + * starts here */ + sector_t resync_max; /* resync should pause + * when it gets here */ + + struct sysfs_dirent *sysfs_state; /* handle for 'array_state' + * file in sysfs. + */ + struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ + + struct work_struct del_work; /* used for delayed sysfs removal */ + + spinlock_t write_lock; + wait_queue_head_t sb_wait; /* for waiting on superblock updates */ + atomic_t pending_writes; /* number of active superblock writes */ + + unsigned int safemode; /* if set, update "clean" superblock + * when no writes pending. + */ + unsigned int safemode_delay; + struct timer_list safemode_timer; + atomic_t writes_pending; + struct request_queue *queue; /* for plugging ... */ + + atomic_t write_behind; /* outstanding async IO */ + unsigned int max_write_behind; /* 0 = sync */ + + struct bitmap *bitmap; /* the bitmap for the device */ + struct file *bitmap_file; /* the bitmap file */ + long bitmap_offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + */ + long default_bitmap_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. 
+ */ + + struct list_head all_mddevs; +}; + + +static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +} + +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); +} + +struct mdk_personality +{ + char *name; + int level; + struct list_head list; + struct module *owner; + int (*make_request)(struct request_queue *q, struct bio *bio); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + void (*status)(struct seq_file *seq, mddev_t *mddev); + /* error_handler must set ->faulty and clear ->in_sync + * if appropriate, and should abort recovery if needed + */ + void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_remove_disk) (mddev_t *mddev, int number); + int (*spare_active) (mddev_t *mddev); + sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); + int (*resize) (mddev_t *mddev, sector_t sectors); + int (*check_reshape) (mddev_t *mddev); + int (*start_reshape) (mddev_t *mddev); + int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + /* quiesce moves between quiescence states + * 0 - fully active + * 1 - no new requests allowed + * others - reserved + */ + void (*quiesce) (mddev_t *mddev, int state); +}; + + +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(mddev_t *, char *); + ssize_t (*store)(mddev_t *, const char *, size_t); +}; + + +static inline char * mdname (mddev_t * mddev) +{ + return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; +} + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. 
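Typical use of the iterators defined just below, shown as a kernel-style fragment rather than a buildable unit (the loop body is hypothetical): 'tmp' is the extra cursor that makes it safe to unlink the current entry, exactly as list_for_each_entry_safe() requires.

	mdk_rdev_t *rdev, *tmp;

	rdev_for_each(rdev, tmp, mddev) {
		if (test_bit(Faulty, &rdev->flags) &&
		    atomic_read(&rdev->nr_pending) == 0) {
			/* 'tmp' already points at the next entry, so
			 * unlinking 'rdev' does not break the walk. */
			list_del_init(&rdev->same_set);
		}
	}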
+ */ +#define rdev_for_each_list(rdev, tmp, head) \ + list_for_each_entry_safe(rdev, tmp, head, same_set) + +/* + * iterates through the 'same array disks' ringlist + */ +#define rdev_for_each(rdev, tmp, mddev) \ + list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) + +#define rdev_for_each_rcu(rdev, mddev) \ + list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) + +typedef struct mdk_thread_s { + void (*run) (mddev_t *mddev); + mddev_t *mddev; + wait_queue_head_t wqueue; + unsigned long flags; + struct task_struct *tsk; + unsigned long timeout; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +#define __wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, cmd); \ +} while (0) + +static inline void safe_put_page(struct page *p) +{ + if (p) put_page(p); +} + +#endif /* CONFIG_BLOCK */ +#endif + + +extern int register_md_personality(struct mdk_personality *p); +extern int unregister_md_personality(struct mdk_personality *p); +extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), + mddev_t *mddev, const char *name); +extern void md_unregister_thread(mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_check_recovery(mddev_t *mddev); +extern void md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_end(mddev_t *mddev); +extern void md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); + +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); +extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 148b3cd058bf..0ed1005afb58 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -21,8 +21,8 @@ #include #include -#include #include +#include "md.h" #include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 64e4c77a1568..3d06df86da87 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -19,8 +19,8 @@ */ #include -#include #include +#include "md.h" #include "raid0.h" static void raid0_unplug(struct request_queue *q) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 253b09c86eca..051ecfa61514 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -33,8 +33,8 @@ #include #include -#include #include +#include "md.h" #include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 186e1b199d46..fea61e3dcd95 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -20,8 +20,8 @@ #include #include -#include #include +#include "md.h" #include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" diff 
--git a/drivers/md/raid5.c b/drivers/md/raid5.c index 816157e7d8e0..849478e9afdc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -44,10 +44,10 @@ */ #include -#include #include #include #include +#include "md.h" #include "raid5.h" #include "raid6.h" #include "bitmap.h" diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h deleted file mode 100644 index e78b3c1d55fd..000000000000 --- a/include/linux/raid/md_k.h +++ /dev/null @@ -1,411 +0,0 @@ -/* - md_k.h : kernel internal structure of the Linux MD driver - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_K_H -#define _MD_K_H - -#ifdef CONFIG_BLOCK - -#define MaxSector (~(sector_t)0) - -typedef struct mddev_s mddev_t; -typedef struct mdk_rdev_s mdk_rdev_t; - -/* - * options passed in raidrun: - */ - -/* Currently this must fit in an 'int' */ -#define MAX_CHUNK_SIZE (1<<30) - -/* - * MD's 'extended' device - */ -struct mdk_rdev_s -{ - struct list_head same_set; /* RAID devices within the same set */ - - sector_t size; /* Device size (in blocks) */ - mddev_t *mddev; /* RAID array if running */ - int last_events; /* IO event timestamp */ - - struct block_device *bdev; /* block device handle */ - - struct page *sb_page; - int sb_loaded; - __u64 sb_events; - sector_t data_offset; /* start of data in array */ - sector_t sb_start; /* offset of the super block (in 512byte sectors) */ - int sb_size; /* bytes in the superblock */ - int preferred_minor; /* autorun support */ - - struct kobject kobj; - - /* A device can be in one of three states based on two flags: - * Not working: faulty==1 in_sync==0 - * Fully working: faulty==0 in_sync==1 - * Working, but not - * in sync with array - * faulty==0 in_sync==0 - * - * It can never have faulty==1, in_sync==1 - * This reduces the burden of testing multiple flags in many cases - */ - - unsigned long flags; -#define Faulty 1 /* device is known to have a fault */ -#define In_sync 2 /* device is in_sync with rest of array */ -#define WriteMostly 4 /* Avoid reading if at all possible */ -#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ -#define AllReserved 6 /* If whole device is reserved for - * one array */ -#define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occured on an externally - * managed array, don't allow writes - * until it is cleared */ -#define StateChanged 9 /* Faulty or Blocked has changed during - * interrupt, so it needs to be - * notified by the thread */ - wait_queue_head_t blocked_wait; - - int desc_nr; /* descriptor index in the superblock */ - int raid_disk; /* role of device in array */ - int saved_raid_disk; /* role that device used to have in the - * array and could again if we did a partial - * resync from the bitmap - */ - sector_t recovery_offset;/* If this device has been partially - * recovered, this is where we were - * up to. - */ - - atomic_t nr_pending; /* number of pending requests. - * only maintained for arrays that - * support hot removal - */ - atomic_t read_errors; /* number of consecutive read errors that - * we have tried to ignore. 
- */ - atomic_t corrected_errors; /* number of corrected read errors, - * for reporting to userspace and storing - * in superblock. - */ - struct work_struct del_work; /* used for delayed sysfs removal */ - - struct sysfs_dirent *sysfs_state; /* handle for 'state' - * sysfs entry */ -}; - -struct mddev_s -{ - void *private; - struct mdk_personality *pers; - dev_t unit; - int md_minor; - struct list_head disks; - unsigned long flags; -#define MD_CHANGE_DEVS 0 /* Some device status has changed */ -#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ -#define MD_CHANGE_PENDING 2 /* superblock update in progress */ - - int ro; - - struct gendisk *gendisk; - - struct kobject kobj; - int hold_active; -#define UNTIL_IOCTL 1 -#define UNTIL_STOP 2 - - /* Superblock information */ - int major_version, - minor_version, - patch_version; - int persistent; - int external; /* metadata is - * managed externally */ - char metadata_type[17]; /* externally set*/ - int chunk_size; - time_t ctime, utime; - int level, layout; - char clevel[16]; - int raid_disks; - int max_disks; - sector_t size; /* used size of component devices */ - sector_t array_sectors; /* exported array size */ - __u64 events; - - char uuid[16]; - - /* If the array is being reshaped, we need to record the - * new shape and an indication of where we are up to. - * This is written to the superblock. - * If reshape_position is MaxSector, then no reshape is happening (yet). - */ - sector_t reshape_position; - int delta_disks, new_level, new_layout, new_chunk; - - struct mdk_thread_s *thread; /* management thread */ - struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ - sector_t curr_resync; /* last block scheduled */ - unsigned long resync_mark; /* a recent timestamp */ - sector_t resync_mark_cnt;/* blocks written at resync_mark */ - sector_t curr_mark_cnt; /* blocks scheduled now */ - - sector_t resync_max_sectors; /* may be set by personality */ - - sector_t resync_mismatches; /* count of sectors where - * parity/replica mismatch found - */ - - /* allow user-space to request suspension of IO to regions of the array */ - sector_t suspend_lo; - sector_t suspend_hi; - /* if zero, use the system-wide default */ - int sync_speed_min; - int sync_speed_max; - - /* resync even though the same disks are shared among md-devices */ - int parallel_resync; - - int ok_start_degraded; - /* recovery/resync flags - * NEEDED: we might need to start a resync/recover - * RUNNING: a thread is running, or about to be started - * SYNC: actually doing a resync, not a recovery - * RECOVER: doing recovery, or need to try it. - * INTR: resync needs to be aborted for some reason - * DONE: thread is done and is waiting to be reaped - * REQUEST: user-space has requested a sync (used with SYNC) - * CHECK: user-space request for for check-only, no repair - * RESHAPE: A reshape is happening - * - * If neither SYNC or RESHAPE are set, then it is a recovery. 
- */ -#define MD_RECOVERY_RUNNING 0 -#define MD_RECOVERY_SYNC 1 -#define MD_RECOVERY_RECOVER 2 -#define MD_RECOVERY_INTR 3 -#define MD_RECOVERY_DONE 4 -#define MD_RECOVERY_NEEDED 5 -#define MD_RECOVERY_REQUESTED 6 -#define MD_RECOVERY_CHECK 7 -#define MD_RECOVERY_RESHAPE 8 -#define MD_RECOVERY_FROZEN 9 - - unsigned long recovery; - int recovery_disabled; /* if we detect that recovery - * will always fail, set this - * so we don't loop trying */ - - int in_sync; /* know to not need resync */ - struct mutex reconfig_mutex; - atomic_t active; /* general refcount */ - atomic_t openers; /* number of active opens */ - - int changed; /* true if we might need to reread partition info */ - int degraded; /* whether md should consider - * adding a spare - */ - int barriers_work; /* initialised to true, cleared as soon - * as a barrier request to slave - * fails. Only supported - */ - struct bio *biolist; /* bios that need to be retried - * because BIO_RW_BARRIER is not supported - */ - - atomic_t recovery_active; /* blocks scheduled, but not written */ - wait_queue_head_t recovery_wait; - sector_t recovery_cp; - sector_t resync_min; /* user requested sync - * starts here */ - sector_t resync_max; /* resync should pause - * when it gets here */ - - struct sysfs_dirent *sysfs_state; /* handle for 'array_state' - * file in sysfs. - */ - struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ - - struct work_struct del_work; /* used for delayed sysfs removal */ - - spinlock_t write_lock; - wait_queue_head_t sb_wait; /* for waiting on superblock updates */ - atomic_t pending_writes; /* number of active superblock writes */ - - unsigned int safemode; /* if set, update "clean" superblock - * when no writes pending. - */ - unsigned int safemode_delay; - struct timer_list safemode_timer; - atomic_t writes_pending; - struct request_queue *queue; /* for plugging ... */ - - atomic_t write_behind; /* outstanding async IO */ - unsigned int max_write_behind; /* 0 = sync */ - - struct bitmap *bitmap; /* the bitmap for the device */ - struct file *bitmap_file; /* the bitmap file */ - long bitmap_offset; /* offset from superblock of - * start of bitmap. May be - * negative, but not '0' - */ - long default_bitmap_offset; /* this is the offset to use when - * hot-adding a bitmap. It should - * eventually be settable by sysfs. 
- */ - - struct list_head all_mddevs; -}; - - -static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) -{ - int faulty = test_bit(Faulty, &rdev->flags); - if (atomic_dec_and_test(&rdev->nr_pending) && faulty) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); -} - -static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -{ - atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); -} - -struct mdk_personality -{ - char *name; - int level; - struct list_head list; - struct module *owner; - int (*make_request)(struct request_queue *q, struct bio *bio); - int (*run)(mddev_t *mddev); - int (*stop)(mddev_t *mddev); - void (*status)(struct seq_file *seq, mddev_t *mddev); - /* error_handler must set ->faulty and clear ->in_sync - * if appropriate, and should abort recovery if needed - */ - void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); - int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); - int (*hot_remove_disk) (mddev_t *mddev, int number); - int (*spare_active) (mddev_t *mddev); - sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); - int (*resize) (mddev_t *mddev, sector_t sectors); - int (*check_reshape) (mddev_t *mddev); - int (*start_reshape) (mddev_t *mddev); - int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved - */ - void (*quiesce) (mddev_t *mddev, int state); -}; - - -struct md_sysfs_entry { - struct attribute attr; - ssize_t (*show)(mddev_t *, char *); - ssize_t (*store)(mddev_t *, const char *, size_t); -}; - - -static inline char * mdname (mddev_t * mddev) -{ - return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; -} - -/* - * iterates through some rdev ringlist. It's safe to remove the - * current 'rdev'. Dont touch 'tmp' though. 
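For illustration only (a hypothetical helper, not part of md_k.h): a typical walk over an array's member devices with the rdev_for_each() iterator defined just below, counting how many are currently in sync.

    static inline int example_count_in_sync(mddev_t *mddev)
    {
        mdk_rdev_t *rdev, *tmp;
        int n = 0;

        /* Safe against removal of the current rdev; 'tmp' is scratch. */
        rdev_for_each(rdev, tmp, mddev)
            if (test_bit(In_sync, &rdev->flags))
                n++;
        return n;
    }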
- */ -#define rdev_for_each_list(rdev, tmp, head) \ - list_for_each_entry_safe(rdev, tmp, head, same_set) - -/* - * iterates through the 'same array disks' ringlist - */ -#define rdev_for_each(rdev, tmp, mddev) \ - list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) - -#define rdev_for_each_rcu(rdev, mddev) \ - list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) - -typedef struct mdk_thread_s { - void (*run) (mddev_t *mddev); - mddev_t *mddev; - wait_queue_head_t wqueue; - unsigned long flags; - struct task_struct *tsk; - unsigned long timeout; -} mdk_thread_t; - -#define THREAD_WAKEUP 0 - -#define __wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - wait_queue_t __wait; \ - init_waitqueue_entry(&__wait, current); \ - \ - add_wait_queue(&wq, &__wait); \ - for (;;) { \ - set_current_state(TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&wq, &__wait); \ -} while (0) - -#define wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - if (condition) \ - break; \ - __wait_event_lock_irq(wq, condition, lock, cmd); \ -} while (0) - -static inline void safe_put_page(struct page *p) -{ - if (p) put_page(p); -} - -#endif /* CONFIG_BLOCK */ -#endif - - -extern int register_md_personality(struct mdk_personality *p); -extern int unregister_md_personality(struct mdk_personality *p); -extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), - mddev_t *mddev, const char *name); -extern void md_unregister_thread(mdk_thread_t *thread); -extern void md_wakeup_thread(mdk_thread_t *thread); -extern void md_check_recovery(mddev_t *mddev); -extern void md_write_start(mddev_t *mddev, struct bio *bi); -extern void md_write_end(mddev_t *mddev); -extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); - -extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, - sector_t sector, int size, struct page *page); -extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw); -extern void md_do_sync(mddev_t *mddev); -extern void md_new_event(mddev_t *mddev); -extern int md_allow_write(mddev_t *mddev); -extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); -- cgit v1.2.3-71-gd317 From f701d589aa34d7531183c9ac6f7713ba14212b02 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 15:09:39 +1100 Subject: md/raid6: move raid6 data processing to raid6_pq.ko Move the raid6 data processing routines into a standalone module (raid6_pq) to prepare them to be called from async_tx wrappers and other non-md drivers/modules. This precludes a circular dependency of raid456 needing the async modules for data processing while those modules in turn depend on raid456 for the base level synchronous raid6 routines. 
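A rough sketch of what the split enables (hypothetical client code): any module that selects MD_RAID6_PQ can call the exported syndrome and recovery routines directly, without pulling in raid456.

    #include <linux/raid/pq.h>

    /*
     * ptrs[] carries 'disks' pointers: the data blocks first, then P at
     * ptrs[disks-2] and Q at ptrs[disks-1], per the gen_syndrome() convention.
     */
    static void example_gen_pq(int disks, size_t bytes, void **ptrs)
    {
        raid6_call.gen_syndrome(disks, bytes, ptrs);
    }

    static void example_recover_two(int disks, size_t bytes,
                                    int faila, int failb, void **ptrs)
    {
        raid6_2data_recov(disks, bytes, faila, failb, ptrs);
    }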
To support this move: 1/ The exportable definitions in raid6.h move to include/linux/raid/pq.h 2/ The raid6_call, recovery calls, and table symbols are exported 3/ Extra #ifdef __KERNEL__ statements to enable the userspace raid6test to compile Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/Kconfig | 4 ++ drivers/md/Makefile | 4 +- drivers/md/mktables.c | 14 ++++- drivers/md/raid5.c | 12 +--- drivers/md/raid5.h | 2 + drivers/md/raid6.h | 126 ---------------------------------------- drivers/md/raid6algos.c | 19 +++++- drivers/md/raid6altivec.uc | 2 +- drivers/md/raid6int.uc | 2 +- drivers/md/raid6mmx.c | 2 +- drivers/md/raid6recov.c | 11 ++-- drivers/md/raid6sse1.c | 2 +- drivers/md/raid6sse2.c | 2 +- drivers/md/raid6test/Makefile | 2 +- drivers/md/raid6test/test.c | 2 +- include/linux/raid/pq.h | 132 ++++++++++++++++++++++++++++++++++++++++++ 16 files changed, 185 insertions(+), 153 deletions(-) delete mode 100644 drivers/md/raid6.h create mode 100644 include/linux/raid/pq.h (limited to 'include/linux') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2281b5098e95..449d0b9cac14 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -121,6 +121,7 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD + select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR ---help--- @@ -180,6 +181,9 @@ config MD_RAID5_RESHAPE If unsure, say Y. +config MD_RAID6_PQ + tristate + config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 3b118da575ee..45cc5951d928 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -9,7 +9,8 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o dm-mirror-y += dm-raid1.o md-mod-y += md.o bitmap.o -raid456-y += raid5.o raid6algos.o raid6recov.o raid6tables.o \ +raid456-y += raid5.o +raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6altivec1.o raid6altivec2.o raid6altivec4.o \ @@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o +obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c index b61d5767aae7..3b1500843bba 100644 --- a/drivers/md/mktables.c +++ b/drivers/md/mktables.c @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; - printf("#include \"raid6.h\"\n"); + printf("#include \n"); /* Compute multiplication table */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -76,6 +76,9 @@ int main(int argc, char *argv[]) printf("\t},\n"); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfmul);\n"); + printf("#endif\n"); /* Compute power-of-2 table (exponent) */ v = 1; @@ -92,6 +95,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexp);\n"); + printf("#endif\n"); /* Compute inverse table x^-1 == x^254 */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -104,6 +110,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfinv);\n"); + printf("#endif\n"); /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ 
printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -115,6 +124,9 @@ int main(int argc, char *argv[]) (j == 7) ? '\n' : ' '); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexi);\n"); + printf("#endif\n"); return 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e1ee181b79bb..1f1b054ff0b6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -45,11 +45,11 @@ #include #include +#include #include #include #include "md.h" #include "raid5.h" -#include "raid6.h" #include "bitmap.h" /* @@ -94,11 +94,6 @@ #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -#endif - /* * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits @@ -5153,11 +5148,6 @@ static struct mdk_personality raid4_personality = static int __init raid5_init(void) { - int e; - - e = raid6_select_algo(); - if ( e ) - return e; register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index c172371481c7..2934ee0a39c6 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -269,6 +269,8 @@ struct r6_state { #define READ_MODIFY_WRITE 2 /* not a write method, but a compute_parity mode */ #define CHECK_PARITY 3 +/* Additional compute_parity mode -- updates the parity w/o LOCKING */ +#define UPDATE_PARITY 4 /* * Stripe state diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h deleted file mode 100644 index 8a9c823bab9e..000000000000 --- a/drivers/md/raid6.h +++ /dev/null @@ -1,126 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2003 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -#ifndef LINUX_RAID_RAID6_H -#define LINUX_RAID_RAID6_H - -#ifdef __KERNEL__ - -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 -#include - -/* Additional compute_parity mode -- updates the parity w/o LOCKING */ -#define UPDATE_PARITY 4 - -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else -extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif - -#else /* ! 
__KERNEL__ */ -/* Used for testing in user space */ - -#include -#include -#include -#include -#include -#include - -/* Not standard, but glibc defines it */ -#define BITS_PER_LONG __WORDSIZE - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; - -#ifndef PAGE_SIZE -# define PAGE_SIZE 4096 -#endif -extern const char raid6_empty_zero_page[PAGE_SIZE]; - -#define __init -#define __exit -#define __attribute_const__ __attribute__((const)) -#define noinline __attribute__((noinline)) - -#define preempt_enable() -#define preempt_disable() -#define cpu_has_feature(x) 1 -#define enable_kernel_altivec() -#define disable_kernel_altivec() - -#endif /* __KERNEL__ */ - -/* Routine choices */ -struct raid6_calls { - void (*gen_syndrome)(int, size_t, void **); - int (*valid)(void); /* Returns 1 if this routine set is usable */ - const char *name; /* Name of this routine set */ - int prefer; /* Has special performance attribute */ -}; - -/* Selected algorithm */ -extern struct raid6_calls raid6_call; - -/* Algorithm list */ -extern const struct raid6_calls * const raid6_algos[]; -int raid6_select_algo(void); - -/* Return values from chk_syndrome */ -#define RAID6_OK 0 -#define RAID6_P_BAD 1 -#define RAID6_Q_BAD 2 -#define RAID6_PQ_BAD 3 - -/* Galois field tables */ -extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); -extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); -extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); -extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); - -/* Recovery routines */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); -void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); - -/* Some definitions to allow code to be compiled for testing in userspace */ -#ifndef __KERNEL__ - -# define jiffies raid6_jiffies() -# define printk printf -# define GFP_KERNEL 0 -# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) -# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE) - -static inline void cpu_relax(void) -{ - /* Nothing */ -} - -#undef HZ -#define HZ 1000 -static inline uint32_t raid6_jiffies(void) -{ - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec*1000 + tv.tv_usec/1000; -} - -#endif /* ! __KERNEL__ */ - -#endif /* LINUX_RAID_RAID6_H */ diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 1f6a3c82ee0c..866215ac7f25 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c @@ -16,13 +16,20 @@ * Algorithm list and algorithm selection for RAID-6 */ -#include "raid6.h" +#include #ifndef __KERNEL__ #include #include +#else +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +EXPORT_SYMBOL(raid6_empty_zero_page); +#endif #endif struct raid6_calls raid6_call; +EXPORT_SYMBOL_GPL(raid6_call); /* Various routine sets */ extern const struct raid6_calls raid6_intx1; @@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = { #else /* Need more time to be stable in userspace */ #define RAID6_TIME_JIFFIES_LG2 9 +#define time_before(x, y) ((x) < (y)) #endif /* Try to pick the best algorithm */ @@ -152,3 +160,12 @@ int __init raid6_select_algo(void) return best ? 
0 : -EINVAL; } + +static void raid6_exit(void) +{ + do { } while (0); +} + +subsys_initcall(raid6_select_algo); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc index 217580667e0c..699dfeee4944 100644 --- a/drivers/md/raid6altivec.uc +++ b/drivers/md/raid6altivec.uc @@ -22,7 +22,7 @@ * bracked this with preempt_disable/enable or in a lock) */ -#include "raid6.h" +#include #ifdef CONFIG_ALTIVEC diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc index 32a0bac3eb3d..f9bf9cba357f 100644 --- a/drivers/md/raid6int.uc +++ b/drivers/md/raid6int.uc @@ -18,7 +18,7 @@ * This file is postprocessed using unroll.pl */ -#include "raid6.h" +#include /* * This is the C data type to use diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c index 804cb50ecc19..e7f6c13132bf 100644 --- a/drivers/md/raid6mmx.c +++ b/drivers/md/raid6mmx.c @@ -18,7 +18,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" /* Shared with raid6sse1.c */ diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c index 7a98b8652582..2609f00e0d61 100644 --- a/drivers/md/raid6recov.c +++ b/drivers/md/raid6recov.c @@ -18,7 +18,7 @@ * the syndrome.) */ -#include "raid6.h" +#include /* Recover two failed data blocks. */ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, @@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, p++; q++; } } - - - +EXPORT_SYMBOL_GPL(raid6_2data_recov); /* Recover failure of one data block plus the P block */ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) @@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) q++; dq++; } } +EXPORT_SYMBOL_GPL(raid6_datap_recov); - -#ifndef __KERNEL__ /* Testing only */ +#ifndef __KERNEL__ +/* Testing only */ /* Recover two failed blocks. */ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c index 15c588905225..b274dd5eab8f 100644 --- a/drivers/md/raid6sse1.c +++ b/drivers/md/raid6sse1.c @@ -23,7 +23,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" /* Defined in raid6mmx.c */ diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c index 2e92e96275be..6ed6c6c0389f 100644 --- a/drivers/md/raid6sse2.c +++ b/drivers/md/raid6sse2.c @@ -19,7 +19,7 @@ #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" static const struct raid6_sse_constants { diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile index 78e0396adf2a..58ffdf4f5161 100644 --- a/drivers/md/raid6test/Makefile +++ b/drivers/md/raid6test/Makefile @@ -5,7 +5,7 @@ CC = gcc OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. -g $(OPTFLAGS) +CFLAGS = -I.. 
-I ../../../include -g $(OPTFLAGS) LD = ld PERL = perl AR = ar diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c index 559cc41b2585..7a930318b17d 100644 --- a/drivers/md/raid6test/test.c +++ b/drivers/md/raid6test/test.c @@ -17,7 +17,7 @@ #include #include #include -#include "raid6.h" +#include #define NDISKS 16 /* Including P and Q */ diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h new file mode 100644 index 000000000000..d92480f8285c --- /dev/null +++ b/include/linux/raid/pq.h @@ -0,0 +1,132 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2003 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +#ifndef LINUX_RAID_RAID6_H +#define LINUX_RAID_RAID6_H + +#ifdef __KERNEL__ + +/* Set to 1 to use kernel-wide empty_zero_page */ +#define RAID6_USE_EMPTY_ZERO_PAGE 0 +#include + +/* We need a pre-zeroed page... if we don't want to use the kernel-provided + one define it here */ +#if RAID6_USE_EMPTY_ZERO_PAGE +# define raid6_empty_zero_page empty_zero_page +#else +extern const char raid6_empty_zero_page[PAGE_SIZE]; +#endif + +#else /* ! __KERNEL__ */ +/* Used for testing in user space */ + +#include +#include +#include +#include +#include +#include + +/* Not standard, but glibc defines it */ +#define BITS_PER_LONG __WORDSIZE + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif +extern const char raid6_empty_zero_page[PAGE_SIZE]; + +#define __init +#define __exit +#define __attribute_const__ __attribute__((const)) +#define noinline __attribute__((noinline)) + +#define preempt_enable() +#define preempt_disable() +#define cpu_has_feature(x) 1 +#define enable_kernel_altivec() +#define disable_kernel_altivec() + +#define EXPORT_SYMBOL(sym) +#define MODULE_LICENSE(licence) +#define subsys_initcall(x) +#define module_exit(x) +#endif /* __KERNEL__ */ + +/* Routine choices */ +struct raid6_calls { + void (*gen_syndrome)(int, size_t, void **); + int (*valid)(void); /* Returns 1 if this routine set is usable */ + const char *name; /* Name of this routine set */ + int prefer; /* Has special performance attribute */ +}; + +/* Selected algorithm */ +extern struct raid6_calls raid6_call; + +/* Algorithm list */ +extern const struct raid6_calls * const raid6_algos[]; +int raid6_select_algo(void); + +/* Return values from chk_syndrome */ +#define RAID6_OK 0 +#define RAID6_P_BAD 1 +#define RAID6_Q_BAD 2 +#define RAID6_PQ_BAD 3 + +/* Galois field tables */ +extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); +extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); +extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); +extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); + +/* Recovery routines */ +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); +void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); + +/* Some definitions to allow code 
to be compiled for testing in userspace */ +#ifndef __KERNEL__ + +# define jiffies raid6_jiffies() +# define printk printf +# define GFP_KERNEL 0 +# define __get_free_pages(x, y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \ + PROT_READ|PROT_WRITE, \ + MAP_PRIVATE|MAP_ANONYMOUS,\ + 0, 0)) +# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE) + +static inline void cpu_relax(void) +{ + /* Nothing */ +} + +#undef HZ +#define HZ 1000 +static inline uint32_t raid6_jiffies(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec*1000 + tv.tv_usec/1000; +} + +#endif /* ! __KERNEL__ */ + +#endif /* LINUX_RAID_RAID6_H */ -- cgit v1.2.3-71-gd317 From 853116a10544206b6b2cf42ebc9d78fba2668888 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 14 Jan 2009 23:03:17 -0800 Subject: regulator: add get_status() Based on previous LKML discussions: * Update docs for regulator sysfs class attributes to highlight the fact that all current attributes are intended to be control inputs, including notably "state" and "opmode" which previously implied otherwise. * Define a new regulator driver get_status() method, which is the first method reporting regulator outputs instead of inputs. It can report on/off and error status; or instead of simply "on", report the actual operating mode. For the moment, this is a sysfs-only interface, not accessible to regulator clients. Such clients can use the current notification interfaces to detect errors, if the regulator reports them. Signed-off-by: David Brownell Signed-off-by: Liam Girdwood --- Documentation/ABI/testing/sysfs-class-regulator | 57 +++++++++++++++++++++---- drivers/regulator/core.c | 46 ++++++++++++++++++++ include/linux/regulator/driver.h | 17 ++++++++ 3 files changed, 111 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-regulator b/Documentation/ABI/testing/sysfs-class-regulator index 873ef1fc1569..e091fa873792 100644 --- a/Documentation/ABI/testing/sysfs-class-regulator +++ b/Documentation/ABI/testing/sysfs-class-regulator @@ -4,8 +4,8 @@ KernelVersion: 2.6.26 Contact: Liam Girdwood Description: Some regulator directories will contain a field called - state. This reports the regulator enable status, for - regulators which can report that value. + state. This reports the regulator enable control, for + regulators which can report that input value. This will be one of the following strings: @@ -14,16 +14,54 @@ Description: 'unknown' 'enabled' means the regulator output is ON and is supplying - power to the system. + power to the system (assuming no error prevents it). 'disabled' means the regulator output is OFF and is not - supplying power to the system.. + supplying power to the system (unless some non-Linux + control has enabled it). 'unknown' means software cannot determine the state, or the reported state is invalid. NOTE: this field can be used in conjunction with microvolts - and microamps to determine regulator output levels. + or microamps to determine configured regulator output levels. + + +What: /sys/class/regulator/.../status +Description: + Some regulator directories will contain a field called + "status". This reports the current regulator status, for + regulators which can report that output value. + + This will be one of the following strings: + + off + on + error + fast + normal + idle + standby + + "off" means the regulator is not supplying power to the + system. 
+ + "on" means the regulator is supplying power to the system, + and the regulator can't report a detailed operation mode. + + "error" indicates an out-of-regulation status such as being + disabled due to thermal shutdown, or voltage being unstable + because of problems with the input power supply. + + "fast", "normal", "idle", and "standby" are all detailed + regulator operation modes (described elsewhere). They + imply "on", but provide more detail. + + Note that regulator status is a function of many inputs, + not limited to control inputs from Linux. For example, + the actual load presented may trigger "error" status; or + a regulator may be enabled by another user, even though + Linux did not enable it. What: /sys/class/regulator/.../type @@ -58,7 +96,7 @@ Description: Some regulator directories will contain a field called microvolts. This holds the regulator output voltage setting measured in microvolts (i.e. E-6 Volts), for regulators - which can report that voltage. + which can report the control input for voltage. NOTE: This value should not be used to determine the regulator output voltage level as this value is the same regardless of @@ -73,7 +111,7 @@ Description: Some regulator directories will contain a field called microamps. This holds the regulator output current limit setting measured in microamps (i.e. E-6 Amps), for regulators - which can report that current. + which can report the control input for a current limit. NOTE: This value should not be used to determine the regulator output current level as this value is the same regardless of @@ -87,7 +125,7 @@ Contact: Liam Girdwood Description: Some regulator directories will contain a field called opmode. This holds the current regulator operating mode, - for regulators which can report it. + for regulators which can report that control input value. The opmode value can be one of the following strings: @@ -101,7 +139,8 @@ Description: NOTE: This value should not be used to determine the regulator output operating mode as this value is the same regardless of - whether the regulator is enabled or disabled. + whether the regulator is enabled or disabled. A "status" + attribute may be available to determine the actual mode. 
What: /sys/class/regulator/.../min_microvolts diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index f511a406fcaa..0ff95c3ccf5b 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -312,6 +312,47 @@ static ssize_t regulator_state_show(struct device *dev, } static DEVICE_ATTR(state, 0444, regulator_state_show, NULL); +static ssize_t regulator_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct regulator_dev *rdev = dev_get_drvdata(dev); + int status; + char *label; + + status = rdev->desc->ops->get_status(rdev); + if (status < 0) + return status; + + switch (status) { + case REGULATOR_STATUS_OFF: + label = "off"; + break; + case REGULATOR_STATUS_ON: + label = "on"; + break; + case REGULATOR_STATUS_ERROR: + label = "error"; + break; + case REGULATOR_STATUS_FAST: + label = "fast"; + break; + case REGULATOR_STATUS_NORMAL: + label = "normal"; + break; + case REGULATOR_STATUS_IDLE: + label = "idle"; + break; + case REGULATOR_STATUS_STANDBY: + label = "standby"; + break; + default: + return -ERANGE; + } + + return sprintf(buf, "%s\n", label); +} +static DEVICE_ATTR(status, 0444, regulator_status_show, NULL); + static ssize_t regulator_min_uA_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1744,6 +1785,11 @@ static int add_regulator_attributes(struct regulator_dev *rdev) if (status < 0) return status; } + if (ops->get_status) { + status = device_create_file(dev, &dev_attr_status); + if (status < 0) + return status; + } /* some attributes are type-specific */ if (rdev->desc->type == REGULATOR_CURRENT) { diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 2dae05705f13..6e957aae7629 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -21,6 +21,17 @@ struct regulator_dev; struct regulator_init_data; +enum regulator_status { + REGULATOR_STATUS_OFF, + REGULATOR_STATUS_ON, + REGULATOR_STATUS_ERROR, + /* fast/normal/idle/standby are flavors of "on" */ + REGULATOR_STATUS_FAST, + REGULATOR_STATUS_NORMAL, + REGULATOR_STATUS_IDLE, + REGULATOR_STATUS_STANDBY, +}; + /** * struct regulator_ops - regulator operations. * @@ -72,6 +83,12 @@ struct regulator_ops { int (*set_mode) (struct regulator_dev *, unsigned int mode); unsigned int (*get_mode) (struct regulator_dev *); + /* report regulator status ... most other accessors report + * control inputs, this reports results of combining inputs + * from Linux (and other sources) with the actual load. + */ + int (*get_status)(struct regulator_dev *); + /* get most efficient regulator operating mode for load */ unsigned int (*get_optimum_mode) (struct regulator_dev *, int input_uV, int output_uV, int load_uA); -- cgit v1.2.3-71-gd317 From b136fb4463d13eea129bf090a8a465bba6bf0003 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 19 Jan 2009 18:20:58 +0000 Subject: Regulator: Push lock out of _notifier_call_chain + add voltage change event. Regulator: Push lock out of _notifier_call_chain and into caller functions (side effect of fixing deadlock in regulator_force_disable) + Add a voltage changed event. 
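A minimal sketch of the calling convention this change establishes, mirroring the wm8350 hunk below: the driver, not the core, now takes rdev->mutex around event injection (the function name is illustrative, and a later patch in this series makes struct regulator_dev visible so drivers can do this outside core.c).

    #include <linux/regulator/consumer.h>
    #include <linux/regulator/driver.h>

    static void example_report_under_voltage(struct regulator_dev *rdev)
    {
        mutex_lock(&rdev->mutex);
        regulator_notifier_call_chain(rdev,
                                      REGULATOR_EVENT_UNDER_VOLTAGE,
                                      NULL);
        mutex_unlock(&rdev->mutex);
    }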
Signed-off-by: Jonathan Cameron Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 15 ++++++++++----- drivers/regulator/wm8350-regulator.c | 2 ++ include/linux/regulator/consumer.h | 2 ++ 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 0ff95c3ccf5b..96c877dd9daf 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1284,6 +1284,7 @@ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) ret = rdev->desc->ops->set_voltage(rdev, min_uV, max_uV); out: + _notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE, NULL); mutex_unlock(&rdev->mutex); return ret; } @@ -1584,20 +1585,23 @@ int regulator_unregister_notifier(struct regulator *regulator, } EXPORT_SYMBOL_GPL(regulator_unregister_notifier); -/* notify regulator consumers and downstream regulator consumers */ +/* notify regulator consumers and downstream regulator consumers. + * Note mutex must be held by caller. + */ static void _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { struct regulator_dev *_rdev; /* call rdev chain first */ - mutex_lock(&rdev->mutex); blocking_notifier_call_chain(&rdev->notifier, event, NULL); - mutex_unlock(&rdev->mutex); /* now notify regulator we supply */ - list_for_each_entry(_rdev, &rdev->supply_list, slist) - _notifier_call_chain(_rdev, event, data); + list_for_each_entry(_rdev, &rdev->supply_list, slist) { + mutex_lock(&_rdev->mutex); + _notifier_call_chain(_rdev, event, data); + mutex_unlock(&_rdev->mutex); + } } /** @@ -1744,6 +1748,7 @@ EXPORT_SYMBOL_GPL(regulator_bulk_free); * * Called by regulator drivers to notify clients a regulator event has * occurred. We also notify regulator clients downstream. + * Note lock must be held by caller. */ int regulator_notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index 5056e23e4414..afad611fbb80 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -1293,6 +1293,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data) { struct regulator_dev *rdev = (struct regulator_dev *)data; + mutex_lock(&rdev->mutex); if (irq == WM8350_IRQ_CS1 || irq == WM8350_IRQ_CS2) regulator_notifier_call_chain(rdev, REGULATOR_EVENT_REGULATION_OUT, @@ -1301,6 +1302,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data) regulator_notifier_call_chain(rdev, REGULATOR_EVENT_UNDER_VOLTAGE, wm8350); + mutex_unlock(&rdev->mutex); } static int wm8350_regulator_probe(struct platform_device *pdev) diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 801bf77ff4e2..533f4e26db96 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -88,6 +88,7 @@ * FAIL Regulator output has failed. * OVER_TEMP Regulator over temp. * FORCE_DISABLE Regulator shut down by software. + * VOLTAGE_CHANGE Regulator voltage changed. * * NOTE: These events can be OR'ed together when passed into handler. 
*/ @@ -98,6 +99,7 @@ #define REGULATOR_EVENT_FAIL 0x08 #define REGULATOR_EVENT_OVER_TEMP 0x10 #define REGULATOR_EVENT_FORCE_DISABLE 0x20 +#define REGULATOR_EVENT_VOLTAGE_CHANGE 0x40 struct regulator; -- cgit v1.2.3-71-gd317 From 0527100fd11d9710c7e153d791da78824b7b46fa Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 19 Jan 2009 13:37:02 +0000 Subject: regulator: Pass regulator init data as explict argument when registering Rather than having the regulator init data read from the platform_data member of the struct device that is registered for the regulator make the init data an explict argument passed in when registering. This allows drivers to use the platform data for their own purposes if they wish. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/bq24022.c | 2 +- drivers/regulator/core.c | 5 +++-- drivers/regulator/da903x.c | 3 ++- drivers/regulator/pcf50633-regulator.c | 3 ++- drivers/regulator/wm8350-regulator.c | 2 +- drivers/regulator/wm8400-regulator.c | 2 +- include/linux/regulator/driver.h | 3 ++- 7 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/bq24022.c b/drivers/regulator/bq24022.c index c175e38a4cd5..6804333492eb 100644 --- a/drivers/regulator/bq24022.c +++ b/drivers/regulator/bq24022.c @@ -105,7 +105,7 @@ static int __init bq24022_probe(struct platform_device *pdev) ret = gpio_direction_output(pdata->gpio_iset2, 0); ret = gpio_direction_output(pdata->gpio_nce, 1); - bq24022 = regulator_register(&bq24022_desc, &pdev->dev, pdata); + bq24022 = regulator_register(&bq24022_desc, &pdev->dev, NULL, pdata); if (IS_ERR(bq24022)) { dev_dbg(&pdev->dev, "couldn't register regulator\n"); ret = PTR_ERR(bq24022); diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 96c877dd9daf..f17362ac9c61 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1879,17 +1879,18 @@ static int add_regulator_attributes(struct regulator_dev *rdev) * regulator_register - register regulator * @regulator_desc: regulator to register * @dev: struct device for the regulator + * @init_data: platform provided init data, passed through by driver * @driver_data: private regulator data * * Called by regulator drivers to register a regulator. * Returns 0 on success. 
*/ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - struct device *dev, void *driver_data) + struct device *dev, struct regulator_init_data *init_data, + void *driver_data) { static atomic_t regulator_no = ATOMIC_INIT(0); struct regulator_dev *rdev; - struct regulator_init_data *init_data = dev->platform_data; int ret, i; if (regulator_desc == NULL) diff --git a/drivers/regulator/da903x.c b/drivers/regulator/da903x.c index fe77730a7edb..72b15495183c 100644 --- a/drivers/regulator/da903x.c +++ b/drivers/regulator/da903x.c @@ -471,7 +471,8 @@ static int __devinit da903x_regulator_probe(struct platform_device *pdev) if (ri->desc.id == DA9030_ID_LDO1 || ri->desc.id == DA9030_ID_LDO15) ri->desc.ops = &da9030_regulator_ldo1_15_ops; - rdev = regulator_register(&ri->desc, &pdev->dev, ri); + rdev = regulator_register(&ri->desc, &pdev->dev, + pdev->dev.platform_data, ri); if (IS_ERR(rdev)) { dev_err(&pdev->dev, "failed to register regulator %s\n", ri->desc.name); diff --git a/drivers/regulator/pcf50633-regulator.c b/drivers/regulator/pcf50633-regulator.c index 4cc85ec6e120..cd761d85c8fd 100644 --- a/drivers/regulator/pcf50633-regulator.c +++ b/drivers/regulator/pcf50633-regulator.c @@ -284,7 +284,8 @@ static int __devinit pcf50633_regulator_probe(struct platform_device *pdev) /* Already set by core driver */ pcf = platform_get_drvdata(pdev); - rdev = regulator_register(®ulators[pdev->id], &pdev->dev, pcf); + rdev = regulator_register(®ulators[pdev->id], &pdev->dev, + pdev->dev.platform_data, pcf); if (IS_ERR(rdev)) return PTR_ERR(rdev); diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index afad611fbb80..93e0ce5a5c23 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -1335,9 +1335,9 @@ static int wm8350_regulator_probe(struct platform_device *pdev) break; } - /* register regulator */ rdev = regulator_register(&wm8350_reg[pdev->id], &pdev->dev, + pdev->dev.platform_data, dev_get_drvdata(&pdev->dev)); if (IS_ERR(rdev)) { dev_err(&pdev->dev, "failed to register %s\n", diff --git a/drivers/regulator/wm8400-regulator.c b/drivers/regulator/wm8400-regulator.c index 56e23d44ba59..6ed43b0dbdfc 100644 --- a/drivers/regulator/wm8400-regulator.c +++ b/drivers/regulator/wm8400-regulator.c @@ -294,7 +294,7 @@ static int __devinit wm8400_regulator_probe(struct platform_device *pdev) struct regulator_dev *rdev; rdev = regulator_register(®ulators[pdev->id], &pdev->dev, - pdev->dev.driver_data); + pdev->dev.platform_data, pdev->dev.driver_data); if (IS_ERR(rdev)) return PTR_ERR(rdev); diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 6e957aae7629..2254ad93b784 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -138,7 +138,8 @@ struct regulator_desc { }; struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - struct device *dev, void *driver_data); + struct device *dev, struct regulator_init_data *init_data, + void *driver_data); void regulator_unregister(struct regulator_dev *rdev); int regulator_notifier_call_chain(struct regulator_dev *rdev, -- cgit v1.2.3-71-gd317 From 93c62da23a717f59933ec799688da42f71d8c6c4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 19 Jan 2009 13:37:03 +0000 Subject: regulator: Allow init data to be supplied for bq24022 Previously it was not possible to do so, making it impossible for machines to configure the driver. 
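A hypothetical board-file sketch of what this enables: machine code hands both the GPIO wiring and the regulator constraints to the driver through bq24022_mach_info (the GPIO numbers and constraint values are invented).

    #include <linux/regulator/bq24022.h>
    #include <linux/regulator/machine.h>

    static struct regulator_init_data example_bq24022_init = {
        .constraints = {
            .valid_ops_mask = REGULATOR_CHANGE_CURRENT |
                              REGULATOR_CHANGE_STATUS,
        },
    };

    static struct bq24022_mach_info example_bq24022_info = {
        .gpio_nce   = 30,   /* invented GPIO numbers */
        .gpio_iset2 = 31,
        .init_data  = &example_bq24022_init,
    };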
Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/bq24022.c | 3 ++- include/linux/regulator/bq24022.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/regulator/bq24022.c b/drivers/regulator/bq24022.c index 6804333492eb..7ecb820ceebc 100644 --- a/drivers/regulator/bq24022.c +++ b/drivers/regulator/bq24022.c @@ -105,7 +105,8 @@ static int __init bq24022_probe(struct platform_device *pdev) ret = gpio_direction_output(pdata->gpio_iset2, 0); ret = gpio_direction_output(pdata->gpio_nce, 1); - bq24022 = regulator_register(&bq24022_desc, &pdev->dev, NULL, pdata); + bq24022 = regulator_register(&bq24022_desc, &pdev->dev, + pdata->init_data, pdata); if (IS_ERR(bq24022)) { dev_dbg(&pdev->dev, "couldn't register regulator\n"); ret = PTR_ERR(bq24022); diff --git a/include/linux/regulator/bq24022.h b/include/linux/regulator/bq24022.h index e84b0a9feda5..a6d014005d49 100644 --- a/include/linux/regulator/bq24022.h +++ b/include/linux/regulator/bq24022.h @@ -10,6 +10,8 @@ * */ +struct regulator_init_data; + /** * bq24022_mach_info - platform data for bq24022 * @gpio_nce: GPIO line connected to the nCE pin, used to enable / disable charging @@ -18,4 +20,5 @@ struct bq24022_mach_info { int gpio_nce; int gpio_iset2; + struct regulator_init_data *init_data; }; -- cgit v1.2.3-71-gd317 From bcf3402c50a48d51462f37f72129d9c4369702b4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 19 Jan 2009 13:37:04 +0000 Subject: regulator: Allow init_data to be passed to fixed voltage regulators Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/fixed.c | 3 ++- include/linux/regulator/fixed.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c index d31db3e14913..23d554628a76 100644 --- a/drivers/regulator/fixed.c +++ b/drivers/regulator/fixed.c @@ -73,7 +73,8 @@ static int regulator_fixed_voltage_probe(struct platform_device *pdev) drvdata->microvolts = config->microvolts; - drvdata->dev = regulator_register(&drvdata->desc, drvdata); + drvdata->dev = regulator_register(&drvdata->desc, &pdev->dev, + config->init_data, drvdata); if (IS_ERR(drvdata->dev)) { ret = PTR_ERR(drvdata->dev); goto err_name; diff --git a/include/linux/regulator/fixed.h b/include/linux/regulator/fixed.h index 1387a5d2190e..91b4da31f1b5 100644 --- a/include/linux/regulator/fixed.h +++ b/include/linux/regulator/fixed.h @@ -14,9 +14,12 @@ #ifndef __REGULATOR_FIXED_H #define __REGULATOR_FIXED_H +struct regulator_init_data; + struct fixed_voltage_config { const char *supply_name; int microvolts; + struct regulator_init_data *init_data; }; #endif -- cgit v1.2.3-71-gd317 From 1fa9ad52b07811ebf258f3f6907de8dbf020ec2d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 21 Jan 2009 14:08:40 +0000 Subject: regulator: Hoist struct regulator_dev out of core to fix notifiers Commit 872ed3fe176833f7d43748eb88010da4bbd2f983 caused regulator drivers to take the struct regulator_dev lock themselves which requires that the struct be visible to them. Band aid this by making the struct visible. 
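Since the structure is being exposed purely for notification injection, it is worth showing the consuming end as well; a hedged consumer-side sketch using only the public API (the messages and the point of registration are illustrative).

    #include <linux/notifier.h>
    #include <linux/regulator/consumer.h>

    static int example_reg_event(struct notifier_block *nb,
                                 unsigned long event, void *data)
    {
        /* Events may arrive OR'ed together. */
        if (event & REGULATOR_EVENT_VOLTAGE_CHANGE)
            printk(KERN_INFO "example: supply voltage changed\n");
        if (event & REGULATOR_EVENT_UNDER_VOLTAGE)
            printk(KERN_WARNING "example: supply under-voltage\n");
        return NOTIFY_OK;
    }

    static struct notifier_block example_reg_nb = {
        .notifier_call = example_reg_event,
    };

    /* After a successful regulator_get():
     *     regulator_register_notifier(reg, &example_reg_nb);
     */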
Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 27 --------------------------- include/linux/regulator/driver.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index f17362ac9c61..0ed13c2a8c3c 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -29,33 +29,6 @@ static DEFINE_MUTEX(regulator_list_mutex); static LIST_HEAD(regulator_list); static LIST_HEAD(regulator_map_list); -/* - * struct regulator_dev - * - * Voltage / Current regulator class device. One for each regulator. - */ -struct regulator_dev { - struct regulator_desc *desc; - int use_count; - - /* lists we belong to */ - struct list_head list; /* list of all regulators */ - struct list_head slist; /* list of supplied regulators */ - - /* lists we own */ - struct list_head consumer_list; /* consumers we supply */ - struct list_head supply_list; /* regulators we supply */ - - struct blocking_notifier_head notifier; - struct mutex mutex; /* consumer lock */ - struct module *owner; - struct device dev; - struct regulation_constraints *constraints; - struct regulator_dev *supply; /* for tree */ - - void *reg_data; /* regulator_dev data */ -}; - /* * struct regulator_map * diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 2254ad93b784..c263e36e564e 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -137,6 +137,38 @@ struct regulator_desc { struct module *owner; }; +/* + * struct regulator_dev + * + * Voltage / Current regulator class device. One for each + * regulator. + * + * This should *not* be used directly by anything except the regulator + * core and notification injection (which should take the mutex and do + * no other direct access). + */ +struct regulator_dev { + struct regulator_desc *desc; + int use_count; + + /* lists we belong to */ + struct list_head list; /* list of all regulators */ + struct list_head slist; /* list of supplied regulators */ + + /* lists we own */ + struct list_head consumer_list; /* consumers we supply */ + struct list_head supply_list; /* regulators we supply */ + + struct blocking_notifier_head notifier; + struct mutex mutex; /* consumer lock */ + struct module *owner; + struct device dev; + struct regulation_constraints *constraints; + struct regulator_dev *supply; /* for tree */ + + void *reg_data; /* regulator_dev data */ +}; + struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, struct device *dev, struct regulator_init_data *init_data, void *driver_data); -- cgit v1.2.3-71-gd317 From 90ca563b1030bece8a4f15a910e39a46f059ff48 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 20 Jan 2009 16:29:05 -0800 Subject: regulator: fix header file missing kernel-doc Fix regulator/driver.h missing kernel-doc: Warning(linux-next-20090120//include/linux/regulator/driver.h:108): No description found for parameter 'get_status' Signed-off-by: Randy Dunlap cc: Liam Girdwood cc: Mark Brown Signed-off-by: Liam Girdwood --- include/linux/regulator/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index c263e36e564e..eb8773b05ac3 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -51,6 +51,7 @@ enum regulator_status { * * @set_mode: Set the operating mode for the regulator. 
* @get_mode: Get the current operating mode for the regulator. + * @get_status: Report the regulator status. * @get_optimum_mode: Get the most efficient operating mode for the regulator * when running with the specified parameters. * -- cgit v1.2.3-71-gd317 From 1dd68f01886a2d5cabbbe90b86e82f70917de89c Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Mon, 2 Feb 2009 21:43:31 +0000 Subject: regulator: email - update email address and regulator webpage. Remove deceased email address and update to new address. Also update website details in MAINTAINERS with correct page. Signed-off-by: Liam Girdwood --- MAINTAINERS | 2 +- include/linux/regulator/consumer.h | 2 +- include/linux/regulator/driver.h | 2 +- include/linux/regulator/machine.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index c5f4e9d27b64..15e1b73bb8d5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4850,7 +4850,7 @@ M: lrg@slimlogic.co.uk P: Mark Brown M: broonie@opensource.wolfsonmicro.com W: http://opensource.wolfsonmicro.com/node/15 -W: http://www.slimlogic.co.uk/?page_id=5 +W: http://www.slimlogic.co.uk/?p=48 T: git kernel.org/pub/scm/linux/kernel/git/lrg/voltage-2.6.git S: Supported diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 533f4e26db96..df6c4bcf38f8 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -3,7 +3,7 @@ * * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC. * - * Author: Liam Girdwood + * Author: Liam Girdwood * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index eb8773b05ac3..0cf37bc85c41 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -3,7 +3,7 @@ * * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC. * - * Author: Liam Girdwood + * Author: Liam Girdwood * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 3794773b23d2..5aa00ee36a3d 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -3,7 +3,7 @@ * * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC. * - * Author: Liam Girdwood + * Author: Liam Girdwood * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as -- cgit v1.2.3-71-gd317 From a308466c24b4f42bab6945026e938874d22cde50 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 26 Feb 2009 19:24:19 +0000 Subject: regulator: Allow regulators to set the initial operating mode This is useful when wishing to run in a fixed operating mode that isn't the default. 
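A hedged board-constraints sketch of the new knob (values invented; the underlying driver must implement set_mode for this to take effect):

    #include <linux/regulator/machine.h>

    static struct regulator_init_data example_ldo_init = {
        .constraints = {
            .valid_modes_mask = REGULATOR_MODE_NORMAL |
                                REGULATOR_MODE_STANDBY,
            .valid_ops_mask   = REGULATOR_CHANGE_MODE,
            .initial_mode     = REGULATOR_MODE_STANDBY,
        },
    };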
Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 17 +++++++++++++++++ include/linux/regulator/machine.h | 4 ++++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index d55a25a6fab2..75abcd85e51b 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -724,6 +724,23 @@ static int set_machine_constraints(struct regulator_dev *rdev, } } + if (constraints->initial_mode) { + if (!ops->set_mode) { + printk(KERN_ERR "%s: no set_mode operation for %s\n", + __func__, name); + ret = -EINVAL; + goto out; + } + + ret = ops->set_mode(rdev, constraints->initial_mode); + if (ret < 0) { + printk(KERN_ERR + "%s: failed to set initial mode for %s: %d\n", + __func__, name, ret); + goto out; + } + } + /* if always_on is set then turn the regulator on if it's not * already on. */ if (constraints->always_on && ops->enable && diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 5aa00ee36a3d..1eb861cf4b2c 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -83,6 +83,7 @@ struct regulator_state { * @state_standby: State for regulator when system is suspended in standby * mode. * @initial_state: Suspend state to set by default. + * @initial_mode: Mode to set at startup. */ struct regulation_constraints { @@ -111,6 +112,9 @@ struct regulation_constraints { struct regulator_state state_standby; suspend_state_t initial_state; /* suspend state to set at init */ + /* mode to set on startup */ + unsigned int initial_mode; + /* constriant flags */ unsigned always_on:1; /* regulator never off when system is on */ unsigned boot_on:1; /* bootloader/firmware enabled regulator */ -- cgit v1.2.3-71-gd317 From 4367cfdc7c657ad8a797f51b9ffd3c64b31910e7 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 26 Feb 2009 11:48:36 -0800 Subject: regulator: enumerate voltages (v2) Add a basic mechanism for regulators to report the discrete voltages they support: list_voltage() enumerates them using selectors numbered from 0 to an upper bound. Use those methods to force machine-level constraints into bounds. (Example: regulator supports 1.8V, 2.4V, 2.6V, 3.3V, and board constraints for that rail are 2.0V to 3.6V ... so the range of voltages is then 2.4V to 3.3V on this board.) Export those voltages to the regulator consumer interface, so for example regulator hooked up to an MMC/SD/SDIO slot can report the actual voltage options available to cards connected there. Signed-off-by: David Brownell Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 113 +++++++++++++++++++++++++++++++++++++ include/linux/regulator/consumer.h | 2 + include/linux/regulator/driver.h | 9 +++ 3 files changed, 124 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 75abcd85e51b..da357a07c98e 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -692,6 +692,69 @@ static int set_machine_constraints(struct regulator_dev *rdev, else name = "regulator"; + /* constrain machine-level voltage specs to fit + * the actual range supported by this regulator. 
+ */ + if (ops->list_voltage && rdev->desc->n_voltages) { + int count = rdev->desc->n_voltages; + int i; + int min_uV = INT_MAX; + int max_uV = INT_MIN; + int cmin = constraints->min_uV; + int cmax = constraints->max_uV; + + /* it's safe to autoconfigure fixed-voltage supplies */ + if (count == 1 && !cmin) { + cmin = INT_MIN; + cmax = INT_MAX; + } + + /* else require explicit machine-level constraints */ + else if (cmin <= 0 || cmax <= 0 || cmax < cmin) { + pr_err("%s: %s '%s' voltage constraints\n", + __func__, "invalid", name); + ret = -EINVAL; + goto out; + } + + /* initial: [cmin..cmax] valid, [min_uV..max_uV] not */ + for (i = 0; i < count; i++) { + int value; + + value = ops->list_voltage(rdev, i); + if (value <= 0) + continue; + + /* maybe adjust [min_uV..max_uV] */ + if (value >= cmin && value < min_uV) + min_uV = value; + if (value <= cmax && value > max_uV) + max_uV = value; + } + + /* final: [min_uV..max_uV] valid iff constraints valid */ + if (max_uV < min_uV) { + pr_err("%s: %s '%s' voltage constraints\n", + __func__, "unsupportable", name); + ret = -EINVAL; + goto out; + } + + /* use regulator's subset of machine constraints */ + if (constraints->min_uV < min_uV) { + pr_debug("%s: override '%s' %s, %d -> %d\n", + __func__, name, "min_uV", + constraints->min_uV, min_uV); + constraints->min_uV = min_uV; + } + if (constraints->max_uV > max_uV) { + pr_debug("%s: override '%s' %s, %d -> %d\n", + __func__, name, "max_uV", + constraints->max_uV, max_uV); + constraints->max_uV = max_uV; + } + } + rdev->constraints = constraints; /* do we need to apply the constraint voltage */ @@ -1250,6 +1313,56 @@ int regulator_is_enabled(struct regulator *regulator) } EXPORT_SYMBOL_GPL(regulator_is_enabled); +/** + * regulator_count_voltages - count regulator_list_voltage() selectors + * @regulator: regulator source + * + * Returns number of selectors, or negative errno. Selectors are + * numbered starting at zero, and typically correspond to bitfields + * in hardware registers. + */ +int regulator_count_voltages(struct regulator *regulator) +{ + struct regulator_dev *rdev = regulator->rdev; + + return rdev->desc->n_voltages ? : -EINVAL; +} +EXPORT_SYMBOL_GPL(regulator_count_voltages); + +/** + * regulator_list_voltage - enumerate supported voltages + * @regulator: regulator source + * @selector: identify voltage to list + * Context: can sleep + * + * Returns a voltage that can be passed to @regulator_set_voltage(), + * zero if this selector code can't be used on this sytem, or a + * negative errno. 
+ */ +int regulator_list_voltage(struct regulator *regulator, unsigned selector) +{ + struct regulator_dev *rdev = regulator->rdev; + struct regulator_ops *ops = rdev->desc->ops; + int ret; + + if (!ops->list_voltage || selector >= rdev->desc->n_voltages) + return -EINVAL; + + mutex_lock(&rdev->mutex); + ret = ops->list_voltage(rdev, selector); + mutex_unlock(&rdev->mutex); + + if (ret > 0) { + if (ret < rdev->constraints->min_uV) + ret = 0; + else if (ret > rdev->constraints->max_uV) + ret = 0; + } + + return ret; +} +EXPORT_SYMBOL_GPL(regulator_list_voltage); + /** * regulator_set_voltage - set regulator output voltage * @regulator: regulator source diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index df6c4bcf38f8..277f4b964df5 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -142,6 +142,8 @@ int regulator_bulk_disable(int num_consumers, void regulator_bulk_free(int num_consumers, struct regulator_bulk_data *consumers); +int regulator_count_voltages(struct regulator *regulator); +int regulator_list_voltage(struct regulator *regulator, unsigned selector); int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV); int regulator_get_voltage(struct regulator *regulator); int regulator_set_current_limit(struct regulator *regulator, diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 0cf37bc85c41..2255468d456f 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -45,6 +45,10 @@ enum regulator_status { * @set_voltage: Set the voltage for the regulator within the range specified. * The driver should select the voltage closest to min_uV. * @get_voltage: Return the currently configured voltage for the regulator. + * @list_voltage: Return one of the supported voltages, in microvolts; zero + * if the selector indicates a voltage that is unusable on this system; + * or negative errno. Selectors range from zero to one less than + * regulator_desc.n_voltages. Voltages may be reported in any order. * * @set_current_limit: Configure a limit for a current-limited regulator. * @get_current_limit: Get the limit for a current-limited regulator. @@ -66,6 +70,9 @@ enum regulator_status { */ struct regulator_ops { + /* enumerate supported voltages */ + int (*list_voltage) (struct regulator_dev *, unsigned selector); + /* get/set regulator voltage */ int (*set_voltage) (struct regulator_dev *, int min_uV, int max_uV); int (*get_voltage) (struct regulator_dev *); @@ -124,6 +131,7 @@ enum regulator_type { * * @name: Identifying name for the regulator. * @id: Numerical identifier for the regulator. + * @n_voltages: Number of selectors available for ops.list_voltage(). * @ops: Regulator operations table. * @irq: Interrupt number for the regulator. * @type: Indicates if the regulator is a voltage or current regulator. @@ -132,6 +140,7 @@ enum regulator_type { struct regulator_desc { const char *name; int id; + unsigned n_voltages; struct regulator_ops *ops; int irq; enum regulator_type type; -- cgit v1.2.3-71-gd317 From 3b2a6061afe6fcc44437cd5ec641b0aeb2825ee3 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 26 Feb 2009 13:28:41 -0800 Subject: regulator: get_status() grows kerneldoc Add kerneldoc for the new get_status() message. 
Fix the existing kerneldoc for that struct in two ways: (a) Syntax, making sure parameter descriptions immediately follow the one-line struct description and that the first blank lines is before any more expansive description; (b) Presentation for a few points, to highlight the fact that the previous "get" methods exist only to report the current configuration, not to display actual status. Signed-off-by: David Brownell Signed-off-by: Liam Girdwood --- include/linux/regulator/driver.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 2255468d456f..4848d8dacd90 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -35,11 +35,8 @@ enum regulator_status { /** * struct regulator_ops - regulator operations. * - * This struct describes regulator operations which can be implemented by - * regulator chip drivers. - * - * @enable: Enable the regulator. - * @disable: Disable the regulator. + * @enable: Configure the regulator as enabled. + * @disable: Configure the regulator as disabled. * @is_enabled: Return 1 if the regulator is enabled, 0 otherwise. * * @set_voltage: Set the voltage for the regulator within the range specified. @@ -51,11 +48,11 @@ enum regulator_status { * regulator_desc.n_voltages. Voltages may be reported in any order. * * @set_current_limit: Configure a limit for a current-limited regulator. - * @get_current_limit: Get the limit for a current-limited regulator. + * @get_current_limit: Get the configured limit for a current-limited regulator. * - * @set_mode: Set the operating mode for the regulator. - * @get_mode: Get the current operating mode for the regulator. - * @get_status: Report the regulator status. + * @get_mode: Get the configured operating mode for the regulator. + * @get_status: Return actual (not as-configured) status of regulator, as a + * REGULATOR_STATUS value (or negative errno) * @get_optimum_mode: Get the most efficient operating mode for the regulator * when running with the specified parameters. * @@ -67,6 +64,9 @@ enum regulator_status { * suspended. * @set_suspend_mode: Set the operating mode for the regulator when the * system is suspended. + * + * This struct describes regulator operations which can be implemented by + * regulator chip drivers. */ struct regulator_ops { @@ -94,6 +94,7 @@ struct regulator_ops { /* report regulator status ... most other accessors report * control inputs, this reports results of combining inputs * from Linux (and other sources) with the actual load. + * returns REGULATOR_STATUS_* or negative errno. */ int (*get_status)(struct regulator_dev *); -- cgit v1.2.3-71-gd317 From fa16a5c13a2fc1433cfff38a083b4f8c5138d022 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Sun, 8 Feb 2009 10:37:06 -0800 Subject: regulator: twl4030 regulators Support most of the LDO regulators in the twl4030 family chips. In the case of LDOs supporting MMC/SD, the voltage controls are used; but in most other cases, the regulator framework is only used to enable/disable a supplies, conserving power when a given voltage rail is not needed. The drivers/mfd/twl4030-core.c code already sets up the various regulators according to board-specific configuration, and knows that some chips don't provide the full set of voltage rails. The omitted regulators are intended to be under hardware control, such as during the hardware-mediated system powerup, powerdown, and suspend states. 
Unless/until software hooks are known to be safe, they won't be exported here. These regulators implement the new get_status() operation, but can't realistically implement get_mode(); the status output is effectively the result of a vote, with the relevant hardware inputs not exposed. Signed-off-by: David Brownell Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/Kconfig | 7 + drivers/regulator/Makefile | 1 + drivers/regulator/twl4030-regulator.c | 511 ++++++++++++++++++++++++++++++++++ include/linux/i2c/twl4030.h | 47 ++++ 4 files changed, 566 insertions(+) create mode 100644 drivers/regulator/twl4030-regulator.c (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 85a1f407e755..e58c0ce65aa6 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -56,6 +56,13 @@ config REGULATOR_BQ24022 charging select between 100 mA and 500 mA charging current limit. +config REGULATOR_TWL4030 + bool "TI TWL4030/TWL5030/TPS695x0 PMIC" + depends on TWL4030_CORE + help + This driver supports the voltage regulators provided by + this family of companion chips. + config REGULATOR_WM8350 tristate "Wolfson Microelectroncis WM8350 AudioPlus PMIC" depends on MFD_WM8350 diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 61b30c6ddecc..bac133afc061 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o +obj-$(CONFIG_REGULATOR_TWL4030) += twl4030-regulator.o obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o obj-$(CONFIG_REGULATOR_DA903X) += da903x.o diff --git a/drivers/regulator/twl4030-regulator.c b/drivers/regulator/twl4030-regulator.c new file mode 100644 index 000000000000..23f282670db5 --- /dev/null +++ b/drivers/regulator/twl4030-regulator.c @@ -0,0 +1,511 @@ +/* + * twl4030-regulator.c -- support regulators in twl4030 family chips + * + * Copyright (C) 2008 David Brownell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + + +/* + * The TWL4030/TW5030/TPS659x0 family chips include power management, a + * USB OTG transceiver, an RTC, ADC, PWM, and lots more. Some versions + * include an audio codec, battery charger, and more voltage regulators. + * These chips are often used in OMAP-based systems. + * + * This driver implements software-based resource control for various + * voltage regulators. This is usually augmented with state machine + * based control. + */ + +struct twlreg_info { + /* start of regulator's PM_RECEIVER control register bank */ + u8 base; + + /* twl4030 resource ID, for resource control state machine */ + u8 id; + + /* voltage in mV = table[VSEL]; table_len must be a power-of-two */ + u8 table_len; + const u16 *table; + + /* chip constraints on regulator behavior */ + u16 min_mV; + u16 max_mV; + + /* used by regulator core */ + struct regulator_desc desc; +}; + + +/* LDO control registers ... offset is from the base of its register bank. + * The first three registers of all power resource banks help hardware to + * manage the various resource groups. 
+ */ +#define VREG_GRP 0 +#define VREG_TYPE 1 +#define VREG_REMAP 2 +#define VREG_DEDICATED 3 /* LDO control */ + + +static inline int +twl4030reg_read(struct twlreg_info *info, unsigned offset) +{ + u8 value; + int status; + + status = twl4030_i2c_read_u8(TWL4030_MODULE_PM_RECEIVER, + &value, info->base + offset); + return (status < 0) ? status : value; +} + +static inline int +twl4030reg_write(struct twlreg_info *info, unsigned offset, u8 value) +{ + return twl4030_i2c_write_u8(TWL4030_MODULE_PM_RECEIVER, + value, info->base + offset); +} + +/*----------------------------------------------------------------------*/ + +/* generic power resource operations, which work on all regulators */ + +static int twl4030reg_grp(struct regulator_dev *rdev) +{ + return twl4030reg_read(rdev_get_drvdata(rdev), VREG_GRP); +} + +/* + * Enable/disable regulators by joining/leaving the P1 (processor) group. + * We assume nobody else is updating the DEV_GRP registers. + */ + +#define P3_GRP BIT(7) /* "peripherals" */ +#define P2_GRP BIT(6) /* secondary processor, modem, etc */ +#define P1_GRP BIT(5) /* CPU/Linux */ + +static int twl4030reg_is_enabled(struct regulator_dev *rdev) +{ + int state = twl4030reg_grp(rdev); + + if (state < 0) + return state; + + return (state & P1_GRP) != 0; +} + +static int twl4030reg_enable(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int grp; + + grp = twl4030reg_read(info, VREG_GRP); + if (grp < 0) + return grp; + + grp |= P1_GRP; + return twl4030reg_write(info, VREG_GRP, grp); +} + +static int twl4030reg_disable(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int grp; + + grp = twl4030reg_read(info, VREG_GRP); + if (grp < 0) + return grp; + + grp &= ~P1_GRP; + return twl4030reg_write(info, VREG_GRP, grp); +} + +static int twl4030reg_get_status(struct regulator_dev *rdev) +{ + int state = twl4030reg_grp(rdev); + + if (state < 0) + return state; + state &= 0x0f; + + /* assume state != WARM_RESET; we'd not be running... */ + if (!state) + return REGULATOR_STATUS_OFF; + return (state & BIT(3)) + ? REGULATOR_STATUS_NORMAL + : REGULATOR_STATUS_STANDBY; +} + +static int twl4030reg_set_mode(struct regulator_dev *rdev, unsigned mode) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + unsigned message; + int status; + + /* We can only set the mode through state machine commands... */ + switch (mode) { + case REGULATOR_MODE_NORMAL: + message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_ACTIVE); + break; + case REGULATOR_MODE_STANDBY: + message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_SLEEP); + break; + default: + return -EINVAL; + } + + /* Ensure the resource is associated with some group */ + status = twl4030reg_grp(rdev); + if (status < 0) + return status; + if (!(status & (P3_GRP | P2_GRP | P1_GRP))) + return -EACCES; + + status = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, + message >> 8, 0x15 /* PB_WORD_MSB */ ); + if (status >= 0) + return status; + + return twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, + message, 0x16 /* PB_WORD_LSB */ ); +} + +/*----------------------------------------------------------------------*/ + +/* + * Support for adjustable-voltage LDOs uses a four bit (or less) voltage + * select field in its control register. We use tables indexed by VSEL + * to record voltages in milliVolts. (Accuracy is about three percent.) 
+ * + * Note that VSEL values for VAUX2 changed in twl5030 and newer silicon; + * currently handled by listing two slightly different VAUX2 regulators, + * only one of which will be configured. + * + * VSEL values documented as "TI cannot support these values" are flagged + * in these tables as UNSUP() values; we normally won't assign them. + */ +#ifdef CONFIG_TWL4030_ALLOW_UNSUPPORTED +#define UNSUP_MASK 0x0000 +#else +#define UNSUP_MASK 0x8000 +#endif + +#define UNSUP(x) (UNSUP_MASK | (x)) +#define IS_UNSUP(x) (UNSUP_MASK & (x)) +#define LDO_MV(x) (~UNSUP_MASK & (x)) + + +static const u16 VAUX1_VSEL_table[] = { + UNSUP(1500), UNSUP(1800), 2500, 2800, + 3000, 3000, 3000, 3000, +}; +static const u16 VAUX2_4030_VSEL_table[] = { + UNSUP(1000), UNSUP(1000), UNSUP(1200), 1300, + 1500, 1800, UNSUP(1850), 2500, + UNSUP(2600), 2800, UNSUP(2850), UNSUP(3000), + UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150), +}; +static const u16 VAUX2_VSEL_table[] = { + 1700, 1700, 1900, 1300, + 1500, 1800, 2000, 2500, + 2100, 2800, 2200, 2300, + 2400, 2400, 2400, 2400, +}; +static const u16 VAUX3_VSEL_table[] = { + 1500, 1800, 2500, 2800, + UNSUP(3000), UNSUP(3000), UNSUP(3000), UNSUP(3000), +}; +static const u16 VAUX4_VSEL_table[] = { + 700, 1000, 1200, UNSUP(1300), + 1500, 1800, UNSUP(1850), 2500, +}; +static const u16 VMMC1_VSEL_table[] = { + 1850, 2850, 3000, 3150, +}; +static const u16 VMMC2_VSEL_table[] = { + UNSUP(1000), UNSUP(1000), UNSUP(1200), UNSUP(1300), + UNSUP(1500), UNSUP(1800), 1850, UNSUP(2500), + 2600, 2800, 2850, 3000, + 3150, 3150, 3150, 3150, +}; +static const u16 VPLL1_VSEL_table[] = { + 1000, 1200, 1300, 1800, + UNSUP(2800), UNSUP(3000), UNSUP(3000), UNSUP(3000), +}; +static const u16 VPLL2_VSEL_table[] = { + 700, 1000, 1200, 1300, + UNSUP(1500), 1800, UNSUP(1850), UNSUP(2500), + UNSUP(2600), UNSUP(2800), UNSUP(2850), UNSUP(3000), + UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150), +}; +static const u16 VSIM_VSEL_table[] = { + UNSUP(1000), UNSUP(1200), UNSUP(1300), 1800, + 2800, 3000, 3000, 3000, +}; +static const u16 VDAC_VSEL_table[] = { + 1200, 1300, 1800, 1800, +}; + + +static int +twl4030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int vsel; + + for (vsel = 0; vsel < info->table_len; vsel++) { + int mV = info->table[vsel]; + int uV; + + if (IS_UNSUP(mV)) + continue; + uV = LDO_MV(mV) * 1000; + + /* use the first in-range value */ + if (min_uV <= uV && uV <= max_uV) + return twl4030reg_write(info, VREG_DEDICATED, vsel); + } + + return -EDOM; +} + +static int twl4030ldo_get_voltage(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + int vsel = twl4030reg_read(info, VREG_DEDICATED); + + if (vsel < 0) + return vsel; + + vsel &= info->table_len - 1; + return LDO_MV(info->table[vsel]) * 1000; +} + +static struct regulator_ops twl4030ldo_ops = { + .set_voltage = twl4030ldo_set_voltage, + .get_voltage = twl4030ldo_get_voltage, + + .enable = twl4030reg_enable, + .disable = twl4030reg_disable, + .is_enabled = twl4030reg_is_enabled, + + .set_mode = twl4030reg_set_mode, + + .get_status = twl4030reg_get_status, +}; + +/*----------------------------------------------------------------------*/ + +/* + * Fixed voltage LDOs don't have a VSEL field to update. 
+ */ +static int twl4030fixed_get_voltage(struct regulator_dev *rdev) +{ + struct twlreg_info *info = rdev_get_drvdata(rdev); + + return info->min_mV * 1000; +} + +static struct regulator_ops twl4030fixed_ops = { + .get_voltage = twl4030fixed_get_voltage, + + .enable = twl4030reg_enable, + .disable = twl4030reg_disable, + .is_enabled = twl4030reg_is_enabled, + + .set_mode = twl4030reg_set_mode, + + .get_status = twl4030reg_get_status, +}; + +/*----------------------------------------------------------------------*/ + +#define TWL_ADJUSTABLE_LDO(label, offset, num) { \ + .base = offset, \ + .id = num, \ + .table_len = ARRAY_SIZE(label##_VSEL_table), \ + .table = label##_VSEL_table, \ + .desc = { \ + .name = #label, \ + .id = TWL4030_REG_##label, \ + .ops = &twl4030ldo_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + }, \ + } + +#define TWL_FIXED_LDO(label, offset, mVolts, num) { \ + .base = offset, \ + .id = num, \ + .min_mV = mVolts, \ + .max_mV = mVolts, \ + .desc = { \ + .name = #label, \ + .id = TWL4030_REG_##label, \ + .ops = &twl4030fixed_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + }, \ + } + +/* + * We list regulators here if systems need some level of + * software control over them after boot. + */ +static struct twlreg_info twl4030_regs[] = { + TWL_ADJUSTABLE_LDO(VAUX1, 0x17, 1), + TWL_ADJUSTABLE_LDO(VAUX2_4030, 0x1b, 2), + TWL_ADJUSTABLE_LDO(VAUX2, 0x1b, 2), + TWL_ADJUSTABLE_LDO(VAUX3, 0x1f, 3), + TWL_ADJUSTABLE_LDO(VAUX4, 0x23, 4), + TWL_ADJUSTABLE_LDO(VMMC1, 0x27, 5), + TWL_ADJUSTABLE_LDO(VMMC2, 0x2b, 6), + /* + TWL_ADJUSTABLE_LDO(VPLL1, 0x2f, 7), + TWL_ADJUSTABLE_LDO(VPLL2, 0x33, 8), + */ + TWL_ADJUSTABLE_LDO(VSIM, 0x37, 9), + TWL_ADJUSTABLE_LDO(VDAC, 0x3b, 10), + /* + TWL_ADJUSTABLE_LDO(VINTANA1, 0x3f, 11), + TWL_ADJUSTABLE_LDO(VINTANA2, 0x43, 12), + TWL_ADJUSTABLE_LDO(VINTDIG, 0x47, 13), + TWL_SMPS(VIO, 0x4b, 14), + TWL_SMPS(VDD1, 0x55, 15), + TWL_SMPS(VDD2, 0x63, 16), + */ + TWL_FIXED_LDO(VUSB1V5, 0x71, 1500, 17), + TWL_FIXED_LDO(VUSB1V8, 0x74, 1800, 18), + TWL_FIXED_LDO(VUSB3V1, 0x77, 3100, 19), + /* VUSBCP is managed *only* by the USB subchip */ +}; + +static int twl4030reg_probe(struct platform_device *pdev) +{ + int i; + struct twlreg_info *info; + struct regulator_init_data *initdata; + struct regulation_constraints *c; + struct regulator_dev *rdev; + int min_uV, max_uV; + + for (i = 0, info = NULL; i < ARRAY_SIZE(twl4030_regs); i++) { + if (twl4030_regs[i].desc.id != pdev->id) + continue; + info = twl4030_regs + i; + min_uV = info->min_mV * 1000; + max_uV = info->max_mV * 1000; + break; + } + if (!info) + return -ENODEV; + + initdata = pdev->dev.platform_data; + if (!initdata) + return -EINVAL; + + /* Constrain board-specific capabilities according to what + * this driver and the chip itself can actually do. + */ + c = &initdata->constraints; + if (!c->min_uV || c->min_uV < min_uV) + c->min_uV = min_uV; + if (!c->max_uV || c->max_uV > max_uV) + c->max_uV = max_uV; + c->valid_modes_mask &= REGULATOR_MODE_NORMAL | REGULATOR_MODE_STANDBY; + c->valid_ops_mask &= REGULATOR_CHANGE_VOLTAGE + | REGULATOR_CHANGE_MODE + | REGULATOR_CHANGE_STATUS; + + rdev = regulator_register(&info->desc, &pdev->dev, initdata, info); + if (IS_ERR(rdev)) { + dev_err(&pdev->dev, "can't register %s, %ld\n", + info->desc.name, PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + platform_set_drvdata(pdev, rdev); + + /* NOTE: many regulators support short-circuit IRQs (presentable + * as REGULATOR_OVER_CURRENT notifications?) 
configured via: + * - SC_CONFIG + * - SC_DETECT1 (vintana2, vmmc1/2, vaux1/2/3/4) + * - SC_DETECT2 (vusb, vdac, vio, vdd1/2, vpll2) + * - IT_CONFIG + */ + + return 0; +} + +static int __devexit twl4030reg_remove(struct platform_device *pdev) +{ + regulator_unregister(platform_get_drvdata(pdev)); + return 0; +} + +MODULE_ALIAS("platform:twl4030_reg"); + +static struct platform_driver twl4030reg_driver = { + .probe = twl4030reg_probe, + .remove = __devexit_p(twl4030reg_remove), + /* NOTE: short name, to work around driver model truncation of + * "twl4030_regulator.12" (and friends) to "twl4030_regulator.1". + */ + .driver.name = "twl4030_reg", + .driver.owner = THIS_MODULE, +}; + +static int __init twl4030reg_init(void) +{ + unsigned i, j; + + /* determine min/max voltage constraints, taking into account + * whether set_voltage() will use the "unsupported" settings + */ + for (i = 0; i < ARRAY_SIZE(twl4030_regs); i++) { + struct twlreg_info *info = twl4030_regs + i; + const u16 *table; + + /* fixed-voltage regulators */ + if (!info->table_len) + continue; + + /* LDO regulators: */ + for (j = 0, table = info->table; + j < info->table_len; + j++, table++) { + u16 mV = *table; + + if (IS_UNSUP(mV)) + continue; + mV = LDO_MV(mV); + + if (info->min_mV == 0 || info->min_mV > mV) + info->min_mV = mV; + if (info->max_mV < mV) + info->max_mV = mV; + } + } + + return platform_driver_register(&twl4030reg_driver); +} +subsys_initcall(twl4030reg_init); + +static void __exit twl4030reg_exit(void) +{ + platform_driver_unregister(&twl4030reg_driver); +} +module_exit(twl4030reg_exit) + +MODULE_DESCRIPTION("TWL4030 regulator driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h index 8137f660a5cc..0dc80ef24975 100644 --- a/include/linux/i2c/twl4030.h +++ b/include/linux/i2c/twl4030.h @@ -218,6 +218,53 @@ int twl4030_i2c_read(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes); /*----------------------------------------------------------------------*/ +/* Power bus message definitions */ + +#define DEV_GRP_NULL 0x0 +#define DEV_GRP_P1 0x1 +#define DEV_GRP_P2 0x2 +#define DEV_GRP_P3 0x4 + +#define RES_GRP_RES 0x0 +#define RES_GRP_PP 0x1 +#define RES_GRP_RC 0x2 +#define RES_GRP_PP_RC 0x3 +#define RES_GRP_PR 0x4 +#define RES_GRP_PP_PR 0x5 +#define RES_GRP_RC_PR 0x6 +#define RES_GRP_ALL 0x7 + +#define RES_TYPE2_R0 0x0 + +#define RES_TYPE_ALL 0x7 + +#define RES_STATE_WRST 0xF +#define RES_STATE_ACTIVE 0xE +#define RES_STATE_SLEEP 0x8 +#define RES_STATE_OFF 0x0 + +/* + * Power Bus Message Format ... these can be sent individually by Linux, + * but are usually part of downloaded scripts that are run when various + * power events are triggered. 
+ * + * Broadcast Message (16 Bits): + * DEV_GRP[15:13] MT[12] RES_GRP[11:9] RES_TYPE2[8:7] RES_TYPE[6:4] + * RES_STATE[3:0] + * + * Singular Message (16 Bits): + * DEV_GRP[15:13] MT[12] RES_ID[11:4] RES_STATE[3:0] + */ + +#define MSG_BROADCAST(devgrp, grp, type, type2, state) \ + ( (devgrp) << 13 | 1 << 12 | (grp) << 9 | (type2) << 7 \ + | (type) << 4 | (state)) + +#define MSG_SINGULAR(devgrp, id, state) \ + ((devgrp) << 13 | 0 << 12 | (id) << 4 | (state)) + +/*----------------------------------------------------------------------*/ + struct twl4030_bci_platform_data { int *battery_tmp_tbl; unsigned int tblsize; -- cgit v1.2.3-71-gd317 From 5c13941acc513669c7d07b28789c3f9ba66ddddf Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 11 Mar 2009 03:30:43 -0800 Subject: MMC: regulator utilities Glue between MMC and regulator stacks ... verified with some OMAP3 boards using adjustable and configured-as-fixed regulators on several MMC controllers. These calls are intended to be used by MMC host adapters using at least one regulator per host. Examples include slots with regulators supporting multiple voltages and ones using multiple voltage rails (e.g. DAT4..DAT7 using a separate supply, or a split rail chip like certain SDIO WLAN or eMMC solutions). Signed-off-by: David Brownell Acked-by: Pierre Ossman Signed-off-by: Liam Girdwood --- drivers/mmc/core/core.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mmc/host.h | 5 +++ 2 files changed, 105 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index df6ce4a06cf3..1445ea8f10a6 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -523,6 +524,105 @@ u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max) } EXPORT_SYMBOL(mmc_vddrange_to_ocrmask); +#ifdef CONFIG_REGULATOR + +/** + * mmc_regulator_get_ocrmask - return mask of supported voltages + * @supply: regulator to use + * + * This returns either a negative errno, or a mask of voltages that + * can be provided to MMC/SD/SDIO devices using the specified voltage + * regulator. This would normally be called before registering the + * MMC host adapter. + */ +int mmc_regulator_get_ocrmask(struct regulator *supply) +{ + int result = 0; + int count; + int i; + + count = regulator_count_voltages(supply); + if (count < 0) + return count; + + for (i = 0; i < count; i++) { + int vdd_uV; + int vdd_mV; + + vdd_uV = regulator_list_voltage(supply, i); + if (vdd_uV <= 0) + continue; + + vdd_mV = vdd_uV / 1000; + result |= mmc_vddrange_to_ocrmask(vdd_mV, vdd_mV); + } + + return result; +} +EXPORT_SYMBOL(mmc_regulator_get_ocrmask); + +/** + * mmc_regulator_set_ocr - set regulator to match host->ios voltage + * @vdd_bit: zero for power off, else a bit number (host->ios.vdd) + * @supply: regulator to use + * + * Returns zero on success, else negative errno. + * + * MMC host drivers may use this to enable or disable a regulator using + * a particular supply voltage. This would normally be called from the + * set_ios() method. + */ +int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit) +{ + int result = 0; + int min_uV, max_uV; + int enabled; + + enabled = regulator_is_enabled(supply); + if (enabled < 0) + return enabled; + + if (vdd_bit) { + int tmp; + int voltage; + + /* REVISIT mmc_vddrange_to_ocrmask() may have set some + * bits this regulator doesn't quite support ... 
don't + * be too picky, most cards and regulators are OK with + * a 0.1V range goof (it's a small error percentage). + */ + tmp = vdd_bit - ilog2(MMC_VDD_165_195); + if (tmp == 0) { + min_uV = 1650 * 1000; + max_uV = 1950 * 1000; + } else { + min_uV = 1900 * 1000 + tmp * 100 * 1000; + max_uV = min_uV + 100 * 1000; + } + + /* avoid needless changes to this voltage; the regulator + * might not allow this operation + */ + voltage = regulator_get_voltage(supply); + if (voltage < 0) + result = voltage; + else if (voltage < min_uV || voltage > max_uV) + result = regulator_set_voltage(supply, min_uV, max_uV); + else + result = 0; + + if (result == 0 && !enabled) + result = regulator_enable(supply); + } else if (enabled) { + result = regulator_disable(supply); + } + + return result; +} +EXPORT_SYMBOL(mmc_regulator_set_ocr); + +#endif + /* * Mask off any voltages we don't support and select * the lowest voltage diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 4e457256bd33..3e7615e9087e 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -192,5 +192,10 @@ static inline void mmc_signal_sdio_irq(struct mmc_host *host) wake_up_process(host->sdio_irq_thread); } +struct regulator; + +int mmc_regulator_get_ocrmask(struct regulator *supply); +int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit); + #endif -- cgit v1.2.3-71-gd317 From cacf90f24e80cec9334f98e0377149f943fe9f16 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 2 Mar 2009 16:32:46 +0000 Subject: regulator: Allow boot_on regulators to be disabled by clients Rather than incrementing the reference count for boot_on regulators (which prevents them being disabled later on) simply force the regulator to be enabled when applying the constraints. Previously boot_on was essentially equivalent to always_on. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 13 ++++--------- include/linux/regulator/machine.h | 4 +++- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 2ff76349f392..08441e24946e 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -776,10 +776,6 @@ static int set_machine_constraints(struct regulator_dev *rdev, } } - /* are we enabled at boot time by firmware / bootloader */ - if (rdev->constraints->boot_on) - rdev->use_count = 1; - /* do we need to setup our suspend state */ if (constraints->initial_state) { ret = suspend_prepare(rdev, constraints->initial_state); @@ -808,11 +804,10 @@ static int set_machine_constraints(struct regulator_dev *rdev, } } - /* if always_on is set then turn the regulator on if it's not - * already on. */ - if (constraints->always_on && ops->enable && - ((ops->is_enabled && !ops->is_enabled(rdev)) || - (!ops->is_enabled && !constraints->boot_on))) { + /* If the constraints say the regulator should be on at this point + * and we have control then make sure it is enabled. + */ + if ((constraints->always_on || constraints->boot_on) && ops->enable) { ret = ops->enable(rdev); if (ret < 0) { printk(KERN_ERR "%s: failed to enable %s\n", diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 1eb861cf4b2c..5de7aa3b02a6 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -73,7 +73,9 @@ struct regulator_state { * * @always_on: Set if the regulator should never be disabled. 
* @boot_on: Set if the regulator is enabled when the system is initially - * started. + * started. If the regulator is not enabled by the hardware or + * bootloader then it will be enabled when the constraints are + * applied. * @apply_uV: Apply the voltage constraint when initialising. * * @input_uV: Input voltage for regulator when supplied by another regulator. -- cgit v1.2.3-71-gd317 From ca7255614e0861e36480103f4a402a115803d7b5 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 16 Mar 2009 19:36:34 +0000 Subject: regulator: Support disabling of unused regulators by machines At present it is not possible for machine constraints to disable regulators which have been left on when the system starts, for example as a result of fixed default configurations in hardware. This means that power may be wasted by these regulators if they are not in use. Provide intial support for this with a late_initcall which will disable any unused regulators if the machine has enabled this feature by calling regulator_has_full_constraints(). If this has not been called then print a warning to encourage users to fully specify their constraints so that we can change this to be the default behaviour in future. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 92 +++++++++++++++++++++++++++++++++++++++ include/linux/regulator/machine.h | 2 + 2 files changed, 94 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 8588a2490e0a..01f7702a805d 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -28,6 +28,7 @@ static DEFINE_MUTEX(regulator_list_mutex); static LIST_HEAD(regulator_list); static LIST_HEAD(regulator_map_list); +static int has_full_constraints; /* * struct regulator_map @@ -2142,6 +2143,23 @@ out: } EXPORT_SYMBOL_GPL(regulator_suspend_prepare); +/** + * regulator_has_full_constraints - the system has fully specified constraints + * + * Calling this function will cause the regulator API to disable all + * regulators which have a zero use count and don't have an always_on + * constraint in a late_initcall. + * + * The intention is that this will become the default behaviour in a + * future kernel release so users are encouraged to use this facility + * now. + */ +void regulator_has_full_constraints(void) +{ + has_full_constraints = 1; +} +EXPORT_SYMBOL_GPL(regulator_has_full_constraints); + /** * rdev_get_drvdata - get rdev regulator driver data * @rdev: regulator @@ -2209,3 +2227,77 @@ static int __init regulator_init(void) /* init early to allow our consumers to complete system booting */ core_initcall(regulator_init); + +static int __init regulator_init_complete(void) +{ + struct regulator_dev *rdev; + struct regulator_ops *ops; + struct regulation_constraints *c; + int enabled, ret; + const char *name; + + mutex_lock(®ulator_list_mutex); + + /* If we have a full configuration then disable any regulators + * which are not in use or always_on. This will become the + * default behaviour in the future. + */ + list_for_each_entry(rdev, ®ulator_list, list) { + ops = rdev->desc->ops; + c = rdev->constraints; + + if (c->name) + name = c->name; + else if (rdev->desc->name) + name = rdev->desc->name; + else + name = "regulator"; + + if (!ops->disable || c->always_on) + continue; + + mutex_lock(&rdev->mutex); + + if (rdev->use_count) + goto unlock; + + /* If we can't read the status assume it's on. 
*/ + if (ops->is_enabled) + enabled = ops->is_enabled(rdev); + else + enabled = 1; + + if (!enabled) + goto unlock; + + if (has_full_constraints) { + /* We log since this may kill the system if it + * goes wrong. */ + printk(KERN_INFO "%s: disabling %s\n", + __func__, name); + ret = ops->disable(rdev); + if (ret != 0) { + printk(KERN_ERR + "%s: couldn't disable %s: %d\n", + __func__, name, ret); + } + } else { + /* The intention is that in future we will + * assume that full constraints are provided + * so warn even if we aren't going to do + * anything here. + */ + printk(KERN_WARNING + "%s: incomplete constraints, leaving %s on\n", + __func__, name); + } + +unlock: + mutex_unlock(&rdev->mutex); + } + + mutex_unlock(®ulator_list_mutex); + + return 0; +} +late_initcall(regulator_init_complete); diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 5de7aa3b02a6..bac64fa390f2 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -166,4 +166,6 @@ struct regulator_init_data { int regulator_suspend_prepare(suspend_state_t state); +void regulator_has_full_constraints(void); + #endif -- cgit v1.2.3-71-gd317 From a08915ba594da66145f33a972db578a58b9135f1 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:13 +0200 Subject: ide-cd: use scatterlists for PIO transfers (fs requests) * Export ide_pio_bytes(). * Add ->last_xfer_len field to struct ide_cmd. * Add ide_cd_error_cmd() helper to ide-cd. * Convert ide-cd to use scatterlists also for PIO transfers (fs requests only for now) and get rid of partial completions (except when the error happens -- which is still subject to change later because looking at ATAPI spec it seems that the device is free to error the whole transfer with setting the Error bit only on the last transfer chunk). * Update ide_cd_{prepare_rw,restore_request,do_request}() accordingly. * Inline ide_cd_restore_request() into cdrom_start_rw(). Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 146 +++++++++++++-------------------------------- drivers/ide/ide-taskfile.c | 5 +- include/linux/ide.h | 4 ++ 3 files changed, 50 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 30113e69c8bb..5f15859c2c73 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -539,64 +539,12 @@ static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, { ide_debug_log(IDE_DBG_RQ, "rq->cmd_flags: 0x%x", rq->cmd_flags); - if (rq_data_dir(rq) == READ) { - unsigned short sectors_per_frame = - queue_hardsect_size(drive->queue) >> SECTOR_BITS; - int nskip = rq->sector & (sectors_per_frame - 1); - - /* - * If the requested sector doesn't start on a frame boundary, - * we must adjust the start of the transfer so that it does, - * and remember to skip the first few sectors. - * - * If the rq->current_nr_sectors field is larger than the size - * of the buffer, it will mean that we're to skip a number of - * sectors equal to the amount by which rq->current_nr_sectors - * is larger than the buffer size. - */ - if (nskip > 0) { - /* sanity check... 
*/ - if (rq->current_nr_sectors != - bio_cur_sectors(rq->bio)) { - printk(KERN_ERR PFX "%s: %s: buffer botch (%u)\n", - drive->name, __func__, - rq->current_nr_sectors); - return ide_stopped; - } - rq->current_nr_sectors += nskip; - } - } - /* set up the command */ rq->timeout = ATAPI_WAIT_PC; return ide_started; } -/* - * Fix up a possibly partially-processed request so that we can start it over - * entirely, or even put it back on the request queue. - */ -static void ide_cd_restore_request(ide_drive_t *drive, struct request *rq) -{ - - ide_debug_log(IDE_DBG_FUNC, "enter"); - - if (rq->buffer != bio_data(rq->bio)) { - sector_t n = - (rq->buffer - (char *)bio_data(rq->bio)) / SECTOR_SIZE; - - rq->buffer = bio_data(rq->bio); - rq->nr_sectors += n; - rq->sector -= n; - } - rq->current_nr_sectors = bio_cur_sectors(rq->bio); - rq->hard_cur_sectors = rq->current_nr_sectors; - rq->hard_nr_sectors = rq->nr_sectors; - rq->hard_sector = rq->sector; - rq->q->prep_rq_fn(rq->q, rq); -} - static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq) { ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]); @@ -690,6 +638,17 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, return (flags & REQ_FAILED) ? -EIO : 0; } +static void ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) +{ + unsigned int nr_bytes = cmd->nbytes - cmd->nleft; + + if (cmd->tf_flags & IDE_TFLAG_WRITE) + nr_bytes -= cmd->last_xfer_len; + + if (nr_bytes > 0) + ide_complete_rq(drive, 0, nr_bytes); +} + /* * Called from blk_end_request_callback() after the data of the request is * completed and before the request itself is completed. By returning value '1', @@ -703,6 +662,7 @@ static int cdrom_newpc_intr_dummy_cb(struct request *rq) static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; xfer_func_t *xferfunc; ide_expiry_t *expiry = NULL; @@ -769,11 +729,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) * Otherwise, complete the command normally. */ uptodate = 1; - if (rq->current_nr_sectors > 0) { + if (cmd->nleft > 0) { printk(KERN_ERR PFX "%s: %s: data underrun " - "(%d blocks)\n", - drive->name, __func__, - rq->current_nr_sectors); + "(%u bytes)\n", drive->name, __func__, + cmd->nleft); if (!write) rq->cmd_flags |= REQ_FAILED; uptodate = 0; @@ -795,24 +754,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (blk_fs_request(rq)) { if (write == 0) { - int nskip; - if (ide_cd_check_transfer_size(drive, len)) goto out_end; - - /* - * First, figure out if we need to bit-bucket - * any of the leading sectors. - */ - nskip = min_t(int, rq->current_nr_sectors - - bio_cur_sectors(rq->bio), - thislen >> 9); - if (nskip > 0) { - ide_pad_transfer(drive, write, nskip << 9); - rq->current_nr_sectors -= nskip; - thislen -= (nskip << 9); - } } + cmd->last_xfer_len = 0; } if (ireason == 0) { @@ -835,15 +780,15 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) /* bio backed? 
*/ if (rq->bio) { if (blk_fs_request(rq)) { - ptr = rq->buffer; - blen = rq->current_nr_sectors << 9; + blen = min_t(int, thislen, cmd->nleft); } else { ptr = bio_data(rq->bio); blen = bio_iovec(rq->bio)->bv_len; } } - if (!ptr) { + if ((blk_fs_request(rq) && cmd->nleft == 0) || + (blk_fs_request(rq) == 0 && ptr == NULL)) { if (blk_fs_request(rq) && !write) /* * If the buffers are full, pipe the rest into @@ -863,26 +808,16 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (blen > thislen) blen = thislen; - xferfunc(drive, NULL, ptr, blen); + if (blk_fs_request(rq)) { + ide_pio_bytes(drive, cmd, write, blen); + cmd->last_xfer_len += blen; + } else + xferfunc(drive, NULL, ptr, blen); thislen -= blen; len -= blen; - if (blk_fs_request(rq)) { - rq->buffer += blen; - rq->nr_sectors -= (blen >> 9); - rq->current_nr_sectors -= (blen >> 9); - rq->sector += (blen >> 9); - - if (rq->current_nr_sectors == 0 && rq->nr_sectors) { - nsectors = rq->hard_cur_sectors; - - if (nsectors == 0) - nsectors = 1; - - ide_complete_rq(drive, 0, nsectors << 9); - } - } else { + if (blk_fs_request(rq) == 0) { rq->data_len -= blen; /* @@ -933,8 +868,10 @@ out_end: ide_cd_complete_failed_rq(drive, rq); if (blk_fs_request(rq)) { - if (rq->current_nr_sectors == 0) + if (cmd->nleft == 0) uptodate = 1; + if (uptodate == 0) + ide_cd_error_cmd(drive, cmd); } else { if (uptodate <= 0 && rq->errors == 0) rq->errors = -EIO; @@ -944,7 +881,7 @@ out_end: if (blk_pc_request(rq)) nsectors = (rq->data_len + 511) >> 9; else - nsectors = rq->hard_cur_sectors; + nsectors = rq->hard_nr_sectors; if (nsectors == 0) nsectors = 1; @@ -960,9 +897,10 @@ out_end: static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) { struct cdrom_info *cd = drive->driver_data; + struct request_queue *q = drive->queue; int write = rq_data_dir(rq) == WRITE; unsigned short sectors_per_frame = - queue_hardsect_size(drive->queue) >> SECTOR_BITS; + queue_hardsect_size(q) >> SECTOR_BITS; ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, write: 0x%x, " "secs_per_frame: %u", @@ -977,17 +915,16 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) * We may be retrying this request after an error. Fix up any * weirdness which might be present in the request packet. 
*/ - ide_cd_restore_request(drive, rq); + q->prep_rq_fn(q, rq); } - /* use DMA, if possible / writes *must* be hardware frame aligned */ + /* fs requests *must* be hardware frame aligned */ if ((rq->nr_sectors & (sectors_per_frame - 1)) || - (rq->sector & (sectors_per_frame - 1))) { - if (write) - return ide_stopped; - drive->dma = 0; - } else - drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); + (rq->sector & (sectors_per_frame - 1))) + return ide_stopped; + + /* use DMA, if possible */ + drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); if (write) cd->devinfo.media_written = 1; @@ -1050,8 +987,6 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, if (blk_fs_request(rq)) { if (cdrom_start_rw(drive, rq) == ide_stopped || ide_cd_prepare_rw_request(drive, rq) == ide_stopped) { - if (rq->current_nr_sectors == 0) - uptodate = 1; goto out_end; } } else if (blk_sense_request(rq) || blk_pc_request(rq) || @@ -1078,6 +1013,11 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cmd.rq = rq; + if (blk_fs_request(rq)) { + ide_init_sg_cmd(&cmd, rq->nr_sectors << 9); + ide_map_sg(drive, &cmd); + } + return ide_issue_pc(drive, &cmd); out_end: nsectors = rq->hard_nr_sectors; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 0e333ecf2ad6..a3b7a50562b2 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -188,8 +188,8 @@ static u8 wait_drive_not_busy(ide_drive_t *drive) return stat; } -static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, - unsigned int write, unsigned int len) +void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, + unsigned int write, unsigned int len) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; @@ -243,6 +243,7 @@ static void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, len -= nr_bytes; } } +EXPORT_SYMBOL_GPL(ide_pio_bytes); static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, unsigned int write) diff --git a/include/linux/ide.h b/include/linux/ide.h index d5d832271f44..c2841c0c36c8 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -352,6 +352,8 @@ struct ide_cmd { unsigned int nbytes; unsigned int nleft; + unsigned int last_xfer_len; + struct scatterlist *cursg; unsigned int cursg_ofs; @@ -1226,6 +1228,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_cmd *); ide_startstop_t do_rw_taskfile(ide_drive_t *, struct ide_cmd *); +void ide_pio_bytes(ide_drive_t *, struct ide_cmd *, unsigned int, unsigned int); + void ide_finish_cmd(ide_drive_t *, struct ide_cmd *, u8); int ide_raw_taskfile(ide_drive_t *, struct ide_cmd *, u8 *, u16); -- cgit v1.2.3-71-gd317 From 06a449e30135aabb6686c95bf0c42b46d169a3b3 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:13 +0200 Subject: ide-cd: fix non-SECTOR_SIZE-multiples PIO transfers for fs requests We now support arbitrary number of bytes per-IRQ also for fs requests so remove ide_cd_check_transfer_size() and IDE_AFLAG_LIMIT_NFRAMES. 
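
The guard being removed only existed because the old fs-request path advanced rq->current_nr_sectors in whole sectors, so a drive transferring a non-SECTOR_SIZE multiple per interrupt would desynchronize the bookkeeping. With the scatterlist path the per-IRQ step reduces to the following (a condensed sketch of the cdrom_newpc_intr() hunk above, not a literal quote):

	/* accept however many bytes the drive reports for this interrupt */
	blen = min_t(int, thislen, cmd->nleft);
	ide_pio_bytes(drive, cmd, write, blen);	/* walks the sg list byte-wise */
	cmd->last_xfer_len += blen;

Nothing in that path assumes SECTOR_SIZE multiples, so the SAMSUNG SCR-2430/2432 quirk entries and IDE_AFLAG_LIMIT_NFRAMES lose their purpose.
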
Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 36 +----------------------------------- include/linux/ide.h | 5 ----- 2 files changed, 1 insertion(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 5f15859c2c73..c0cefe5becf3 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -509,31 +509,6 @@ static int ide_cd_check_ireason(ide_drive_t *drive, struct request *rq, return -1; } -/* - * Assume that the drive will always provide data in multiples of at least - * SECTOR_SIZE, as it gets hairy to keep track of the transfers otherwise. - */ -static int ide_cd_check_transfer_size(ide_drive_t *drive, int len) -{ - ide_debug_log(IDE_DBG_FUNC, "len: %d", len); - - if ((len % SECTOR_SIZE) == 0) - return 0; - - printk(KERN_ERR PFX "%s: %s: Bad transfer size %d\n", drive->name, - __func__, len); - - if (drive->atapi_flags & IDE_AFLAG_LIMIT_NFRAMES) - printk(KERN_ERR PFX "This drive is not supported by this " - "version of the driver\n"); - else { - printk(KERN_ERR PFX "Trying to limit transfer sizes\n"); - drive->atapi_flags |= IDE_AFLAG_LIMIT_NFRAMES; - } - - return 1; -} - static ide_startstop_t ide_cd_prepare_rw_request(ide_drive_t *drive, struct request *rq) { @@ -752,13 +727,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (rc) goto out_end; - if (blk_fs_request(rq)) { - if (write == 0) { - if (ide_cd_check_transfer_size(drive, len)) - goto out_end; - } - cmd->last_xfer_len = 0; - } + cmd->last_xfer_len = 0; if (ireason == 0) { write = 1; @@ -1619,9 +1588,6 @@ static const struct ide_proc_devset *ide_cd_proc_devsets(ide_drive_t *drive) #endif static const struct cd_list_entry ide_cd_quirks_list[] = { - /* Limit transfer size per interrupt. */ - { "SAMSUNG CD-ROM SCR-2430", NULL, IDE_AFLAG_LIMIT_NFRAMES }, - { "SAMSUNG CD-ROM SCR-2432", NULL, IDE_AFLAG_LIMIT_NFRAMES }, /* SCR-3231 doesn't support the SET_CD_SPEED command. */ { "SAMSUNG CD-ROM SCR-3231", NULL, IDE_AFLAG_NO_SPEED_SELECT }, /* Old NEC260 (not R) was released before ATAPI 1.2 spec. */ diff --git a/include/linux/ide.h b/include/linux/ide.h index c2841c0c36c8..cb501bf78f7d 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -458,11 +458,6 @@ enum { IDE_AFLAG_TOCADDR_AS_BCD = (1 << 3), /* TOC track numbers are in BCD. */ IDE_AFLAG_TOCTRACKS_AS_BCD = (1 << 4), - /* - * Drive does not provide data in multiples of SECTOR_SIZE - * when more than one interrupt is needed. - */ - IDE_AFLAG_LIMIT_NFRAMES = (1 << 5), /* Saved TOC information is current. */ IDE_AFLAG_TOC_VALID = (1 << 6), /* We think that the drive door is locked. */ -- cgit v1.2.3-71-gd317 From 35c9b4daf4c94b30e5cede597d98016ebf31b5ad Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:19 +0200 Subject: ide: add ->dma_clear method and remove ->dma_timeout one All custom ->dma_timeout implementations call the generic one thus it is possible to have only an optional method for resetting DMA engine instead: * Add ->dma_clear method and convert hpt366, pdc202xx_old and sl82c105 host drivers to use it. * Always use ide_dma_timeout() in ide_dma_timeout_retry() and remove ->dma_timeout method. * Make ide_dma_timeout() static. There should be no functional changes caused by this patch. 
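
For a host driver the conversion is mechanical: the chip-specific reset that used to live in the custom ->dma_timeout handler (followed by a call into the generic ide_dma_timeout()) moves into the optional ->dma_clear hook, and the core now performs the generic timeout handling itself. A minimal sketch with a made-up driver name ("foo"), mirroring the sl82c105 and pdc202xx_old conversions below:

	/* old scheme: the custom handler had to chain into the generic one */
	static void foo_dma_timeout(ide_drive_t *drive)
	{
		foo_reset_host(drive);		/* chip-specific reset (hypothetical helper) */
		ide_dma_timeout(drive);		/* generic part, now called by the core */
	}

	/* new scheme: keep only the chip-specific reset, as an optional method */
	static void foo_dma_clear(ide_drive_t *drive)
	{
		foo_reset_host(drive);
	}

	static const struct ide_dma_ops foo_dma_ops = {
		/* ... other methods unchanged ... */
		.dma_clear	= foo_dma_clear,
	};

ide_dma_timeout_retry() checks for the hook and then always runs the generic handling ("if (dma_ops->dma_clear) dma_ops->dma_clear(drive); ide_dma_timeout(drive);"), so drivers with no special reset requirement simply leave ->dma_clear NULL.
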
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/alim15x3.c | 1 - drivers/ide/au1xxx-ide.c | 1 - drivers/ide/cmd64x.c | 3 --- drivers/ide/cs5536.c | 1 - drivers/ide/hpt366.c | 10 +--------- drivers/ide/icside.c | 1 - drivers/ide/ide-dma-sff.c | 3 +-- drivers/ide/ide-dma.c | 10 ++++++---- drivers/ide/it821x.c | 3 +-- drivers/ide/ns87415.c | 1 - drivers/ide/pdc202xx_old.c | 10 ++-------- drivers/ide/pmac.c | 1 - drivers/ide/sc1200.c | 1 - drivers/ide/scc_pata.c | 1 - drivers/ide/sgiioc4.c | 1 - drivers/ide/siimage.c | 1 - drivers/ide/sl82c105.c | 7 +++---- drivers/ide/tc86c001.c | 1 - drivers/ide/trm290.c | 1 - drivers/ide/tx4939ide.c | 1 - include/linux/ide.h | 4 ++-- 21 files changed, 16 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index d516168464fc..d3faf0b97f42 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -509,7 +509,6 @@ static const struct ide_dma_ops ali_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index d3a9d6c15328..0c08c5e01f2a 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -353,7 +353,6 @@ static const struct ide_dma_ops au1xxx_dma_ops = { .dma_end = auide_dma_end, .dma_test_irq = auide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, - .dma_timeout = ide_dma_timeout, }; static int auide_ddma_init(ide_hwif_t *hwif, const struct ide_port_info *d) diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index bf0e3f470824..f0a49d2ff711 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -384,7 +384,6 @@ static const struct ide_dma_ops cmd64x_dma_ops = { .dma_test_irq = cmd64x_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -396,7 +395,6 @@ static const struct ide_dma_ops cmd646_rev1_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -408,7 +406,6 @@ static const struct ide_dma_ops cmd648_dma_ops = { .dma_test_irq = cmd648_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c index d5dcf4899607..353a35bbba63 100644 --- a/drivers/ide/cs5536.c +++ b/drivers/ide/cs5536.c @@ -236,7 +236,6 @@ static const struct ide_dma_ops cs5536_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, }; static const struct ide_port_info cs5536_info = { diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index dbaf184ed9c5..a0eb87f59134 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -835,12 +835,6 @@ static int hpt370_dma_end(ide_drive_t *drive) return ide_dma_end(drive); } -static void hpt370_dma_timeout(ide_drive_t *drive) -{ - hpt370_irq_timeout(drive); - ide_dma_timeout(drive); -} - /* returns 1 if DMA IRQ issued, 0 otherwise */ static int hpt374_dma_test_irq(ide_drive_t *drive) { @@ -1423,7 +1417,6 @@ static 
const struct ide_dma_ops hpt37x_dma_ops = { .dma_test_irq = hpt374_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -1435,7 +1428,7 @@ static const struct ide_dma_ops hpt370_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = hpt370_dma_timeout, + .dma_clear = hpt370_irq_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -1447,7 +1440,6 @@ static const struct ide_dma_ops hpt36x_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = hpt366_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 51ce404fe532..f069f122ee6e 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -377,7 +377,6 @@ static const struct ide_dma_ops icside_v6_dma_ops = { .dma_start = icside_dma_start, .dma_end = icside_dma_end, .dma_test_irq = icside_dma_test_irq, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, }; #else diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 75a9ea2e4c82..7836d7e03fff 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -338,9 +338,8 @@ const struct ide_dma_ops sff_dma_ops = { .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, - .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = ide_dma_sff_read_status, }; EXPORT_SYMBOL_GPL(sff_dma_ops); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 3dbf80c15491..dc5d9bc4ced0 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -460,7 +460,7 @@ void ide_dma_lost_irq(ide_drive_t *drive) } EXPORT_SYMBOL_GPL(ide_dma_lost_irq); -void ide_dma_timeout(ide_drive_t *drive) +static void ide_dma_timeout(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; @@ -473,7 +473,6 @@ void ide_dma_timeout(ide_drive_t *drive) hwif->dma_ops->dma_end(drive); } -EXPORT_SYMBOL_GPL(ide_dma_timeout); /* * un-busy the port etc, and clear any pending DMA status. 
we want to @@ -483,6 +482,7 @@ EXPORT_SYMBOL_GPL(ide_dma_timeout); ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { ide_hwif_t *hwif = drive->hwif; + const struct ide_dma_ops *dma_ops = hwif->dma_ops; struct request *rq; ide_startstop_t ret = ide_stopped; @@ -492,12 +492,14 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) if (error < 0) { printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); - (void)hwif->dma_ops->dma_end(drive); + (void)dma_ops->dma_end(drive); ret = ide_error(drive, "dma timeout error", hwif->tp_ops->read_status(hwif)); } else { printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); - hwif->dma_ops->dma_timeout(drive); + if (dma_ops->dma_clear) + dma_ops->dma_clear(drive); + ide_dma_timeout(drive); } /* diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 0d4ac65cf949..51aa745246dc 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -511,9 +511,8 @@ static struct ide_dma_ops it821x_pass_through_dma_ops = { .dma_start = it821x_dma_start, .dma_end = it821x_dma_end, .dma_test_irq = ide_dma_test_irq, - .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, + .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 7b65fe5bf449..9039a373020f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -306,7 +306,6 @@ static const struct ide_dma_ops ns87415_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = superio_dma_sff_read_status, }; diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index f7536d1943f7..248a54bd2386 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -258,12 +258,6 @@ static void pdc202xx_dma_lost_irq(ide_drive_t *drive) ide_dma_lost_irq(drive); } -static void pdc202xx_dma_timeout(ide_drive_t *drive) -{ - pdc202xx_reset(drive); - ide_dma_timeout(drive); -} - static int init_chipset_pdc202xx(struct pci_dev *dev) { unsigned long dmabase = pci_resource_start(dev, 4); @@ -336,7 +330,7 @@ static const struct ide_dma_ops pdc20246_dma_ops = { .dma_test_irq = pdc202xx_dma_test_irq, .dma_lost_irq = pdc202xx_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = pdc202xx_dma_timeout, + .dma_clear = pdc202xx_reset, .dma_sff_read_status = ide_dma_sff_read_status, }; @@ -348,7 +342,7 @@ static const struct ide_dma_ops pdc2026x_dma_ops = { .dma_test_irq = pdc202xx_dma_test_irq, .dma_lost_irq = pdc202xx_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = pdc202xx_dma_timeout, + .dma_clear = pdc202xx_reset, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 2bfcfedaa076..d15cc46a66e3 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1650,7 +1650,6 @@ static const struct ide_dma_ops pmac_dma_ops = { .dma_start = pmac_ide_dma_start, .dma_end = pmac_ide_dma_end, .dma_test_irq = pmac_ide_dma_test_irq, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = pmac_ide_dma_lost_irq, }; diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index 1c3a82914999..371549d18a01 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -291,7 +291,6 @@ static const struct ide_dma_ops sc1200_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = 
ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 0cc137cfe76d..64534d150b0c 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -872,7 +872,6 @@ static const struct ide_dma_ops scc_dma_ops = { .dma_end = scc_dma_end, .dma_test_irq = scc_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, - .dma_timeout = ide_dma_timeout, .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = scc_dma_sff_read_status, }; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index b12de8346c73..44df0c750bab 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -533,7 +533,6 @@ static const struct ide_dma_ops sgiioc4_dma_ops = { .dma_end = sgiioc4_dma_end, .dma_test_irq = sgiioc4_dma_test_irq, .dma_lost_irq = sgiioc4_dma_lost_irq, - .dma_timeout = ide_dma_timeout, }; static const struct ide_port_info sgiioc4_port_info __devinitconst = { diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index 075cb1243b2a..e4973cd1fba9 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -715,7 +715,6 @@ static const struct ide_dma_ops sil_dma_ops = { .dma_end = ide_dma_end, .dma_test_irq = siimage_dma_test_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_lost_irq = ide_dma_lost_irq, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index d25137b04e7a..d6f8977191c8 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -189,14 +189,13 @@ static void sl82c105_dma_start(ide_drive_t *drive) ide_dma_start(drive); } -static void sl82c105_dma_timeout(ide_drive_t *drive) +static void sl82c105_dma_clear(ide_drive_t *drive) { struct pci_dev *dev = to_pci_dev(drive->hwif->dev); - DBG(("sl82c105_dma_timeout(drive:%s)\n", drive->name)); + DBG(("sl82c105_dma_clear(drive:%s)\n", drive->name)); sl82c105_reset_host(dev); - ide_dma_timeout(drive); } static int sl82c105_dma_end(ide_drive_t *drive) @@ -298,7 +297,7 @@ static const struct ide_dma_ops sl82c105_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = sl82c105_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = sl82c105_dma_timeout, + .dma_clear = sl82c105_dma_clear, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c index 427d4b3c2c63..b4cf42dc8a6f 100644 --- a/drivers/ide/tc86c001.c +++ b/drivers/ide/tc86c001.c @@ -187,7 +187,6 @@ static const struct ide_dma_ops tc86c001_dma_ops = { .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index ed1496845a93..d6a950828e9f 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -314,7 +314,6 @@ static struct ide_dma_ops trm290_dma_ops = { .dma_end = trm290_dma_end, .dma_test_irq = trm290_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, - .dma_timeout = ide_dma_timeout, }; static const struct ide_port_info trm290_chipset __devinitdata = { diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index e0e0a803dde3..53f99853b065 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -632,7 +632,6 @@ static const struct ide_dma_ops tx4939ide_dma_ops = { .dma_test_irq = 
tx4939ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, .dma_timer_expiry = ide_dma_sff_timer_expiry, - .dma_timeout = ide_dma_timeout, .dma_sff_read_status = tx4939ide_dma_sff_read_status, }; diff --git a/include/linux/ide.h b/include/linux/ide.h index cb501bf78f7d..d3035f2f1250 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -716,8 +716,9 @@ struct ide_dma_ops { int (*dma_end)(struct ide_drive_s *); int (*dma_test_irq)(struct ide_drive_s *); void (*dma_lost_irq)(struct ide_drive_s *); + /* below ones are optional */ int (*dma_timer_expiry)(struct ide_drive_s *); - void (*dma_timeout)(struct ide_drive_s *); + void (*dma_clear)(struct ide_drive_s *); /* * The following method is optional and only required to be * implemented for the SFF-8038i compatible controllers. @@ -1461,7 +1462,6 @@ static inline int config_drive_for_dma(ide_drive_t *drive) { return 0; } #endif /* CONFIG_BLK_DEV_IDEDMA_SFF */ void ide_dma_lost_irq(ide_drive_t *); -void ide_dma_timeout(ide_drive_t *); ide_startstop_t ide_dma_timeout_retry(ide_drive_t *, int); #else -- cgit v1.2.3-71-gd317 From 4453011f959a5f5c6c7a33aea54fe17f5e43a867 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:20 +0200 Subject: ide: destroy DMA mappings after ending DMA (v2) Move ide_destroy_dmatable() call out from ->dma_end method to {ide_pc,cdrom_newpc,ide_dma}_intr(), ide_dma_timeout_retry() and sgiioc4_resetproc(). This causes minor/safe behavior changes w.r.t.: * cmd64x.c::cmd64{8,x}_dma_end() * cs5536.c::cs5536_dma_end() * icside.c::icside_dma_end() * it821x.c::it821x_dma_end() * scc_pata.c::__scc_dma_end() * sl82c105.c::sl82c105_dma_end() * tx4939ide.c::tx4939ide_dma_end() v2: * Fix build for CONFIG_BLK_DEV_IDEDMA=n (reported by Randy Dunlap). Cc: Randy Dunlap Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/au1xxx-ide.c | 2 -- drivers/ide/cmd64x.c | 2 -- drivers/ide/icside.c | 3 --- drivers/ide/ide-atapi.c | 7 +++++-- drivers/ide/ide-cd.c | 1 + drivers/ide/ide-dma-sff.c | 2 -- drivers/ide/ide-dma.c | 3 +++ drivers/ide/ns87415.c | 2 -- drivers/ide/pmac.c | 2 -- drivers/ide/sc1200.c | 1 - drivers/ide/scc_pata.c | 2 -- drivers/ide/sgiioc4.c | 2 +- drivers/ide/trm290.c | 3 +-- drivers/ide/tx4939ide.c | 4 +--- include/linux/ide.h | 1 + 15 files changed, 13 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 0c08c5e01f2a..ba2a211758a9 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -280,8 +280,6 @@ static int auide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd) static int auide_dma_end(ide_drive_t *drive) { - ide_destroy_dmatable(drive); - return 0; } diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index f0a49d2ff711..f2edf280ef8b 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -327,8 +327,6 @@ static int cmd646_1_dma_end(ide_drive_t *drive) outb(dma_cmd & ~1, hwif->dma_base + ATA_DMA_CMD); /* clear the INTR & ERROR bits */ outb(dma_stat | 6, hwif->dma_base + ATA_DMA_STATUS); - /* and free any DMA resources */ - ide_destroy_dmatable(drive); /* verify good DMA status */ return (dma_stat & 7) != 4; } diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index f069f122ee6e..9bf57d7c8e57 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -291,9 +291,6 @@ static int icside_dma_end(ide_drive_t *drive) disable_dma(ec->dma); - /* Teardown mappings after DMA has completed. 
*/ - ide_destroy_dmatable(drive); - return get_dma_residue(ec->dma) != 0; } diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index f591166d2c93..1481f71f8173 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -342,8 +342,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) stat = tp_ops->read_status(hwif); if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - if (hwif->dma_ops->dma_end(drive) || - (drive->media == ide_tape && (stat & ATA_ERR))) { + int rc = hwif->dma_ops->dma_end(drive); + + ide_destroy_dmatable(drive); + + if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) { if (drive->media == ide_floppy) printk(KERN_ERR "%s: DMA %s error\n", drive->name, rq_data_dir(pc->rq) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 5319e7a73708..4a0d66ee9547 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -639,6 +639,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) if (dma) { drive->dma = 0; dma_error = hwif->dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); if (dma_error) { printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name, write ? "write" : "read"); diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 7836d7e03fff..f8adbb5eb339 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -310,8 +310,6 @@ int ide_dma_end(ide_drive_t *drive) /* clear INTR & ERROR bits */ ide_dma_sff_write_status(hwif, dma_stat | ATA_DMA_ERR | ATA_DMA_INTR); - /* purge DMA mappings */ - ide_destroy_dmatable(drive); wmb(); /* verify good DMA status */ diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 4e2005071113..b430898bbcd6 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -92,6 +92,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) u8 stat = 0, dma_stat = 0; dma_stat = hwif->dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); stat = hwif->tp_ops->read_status(hwif); if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) { @@ -479,6 +480,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) if (error < 0) { printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); (void)dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); ret = ide_error(drive, "dma timeout error", hwif->tp_ops->read_status(hwif)); } else { @@ -490,6 +492,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) ide_dump_status(drive, "DMA timeout", hwif->tp_ops->read_status(hwif)); (void)dma_ops->dma_end(drive); + ide_destroy_dmatable(drive); } } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 9039a373020f..9ad71a74f93f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -210,8 +210,6 @@ static int ns87415_dma_end(ide_drive_t *drive) /* from ERRATA: clear the INTR & ERROR bits */ dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD); outb(dma_cmd | 6, hwif->dma_base + ATA_DMA_CMD); - /* and free any DMA resources */ - ide_destroy_dmatable(drive); /* verify good DMA status */ return (dma_stat & 7) != 4; } diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index d15cc46a66e3..5643a8b957bf 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1562,8 +1562,6 @@ pmac_ide_dma_end (ide_drive_t *drive) dstat = readl(&dma->status); writel(((RUN|WAKE|DEAD) << 16), &dma->control); - ide_destroy_dmatable(drive); - /* verify good dma status. we don't check for ACTIVE beeing 0. We should... 
* in theory, but with ATAPI decices doing buffer underruns, that would * cause us to disable DMA, which isn't what we want diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index 371549d18a01..d9c47034bedd 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -184,7 +184,6 @@ static int sc1200_dma_end(ide_drive_t *drive) outb(inb(dma_base)&~1, dma_base); /* !! DO THIS HERE !! stop DMA */ drive->waiting_for_dma = 0; - ide_destroy_dmatable(drive); /* purge DMA mappings */ return (dma_stat & 7) != 4; /* verify good DMA status */ } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 64534d150b0c..693536ebe331 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -365,8 +365,6 @@ static int __scc_dma_end(ide_drive_t *drive) dma_stat = scc_dma_sff_read_status(hwif); /* clear the INTR & ERROR bits */ scc_ide_outb(dma_stat | 6, hwif->dma_base + 4); - /* purge DMA mappings */ - ide_destroy_dmatable(drive); /* verify good DMA status */ wmb(); return (dma_stat & 7) != 4 ? (0x10 | dma_stat) : 0; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 44df0c750bab..457a762a1f29 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -259,7 +259,6 @@ static int sgiioc4_dma_end(ide_drive_t *drive) } drive->waiting_for_dma = 0; - ide_destroy_dmatable(drive); return dma_stat; } @@ -284,6 +283,7 @@ static void sgiioc4_resetproc(ide_drive_t * drive) { sgiioc4_dma_end(drive); + ide_destroy_dmatable(drive); sgiioc4_clearirq(drive); } diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index d6a950828e9f..8dd3d8226870 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -216,8 +216,7 @@ static int trm290_dma_end(ide_drive_t *drive) u16 status; drive->waiting_for_dma = 0; - /* purge DMA mappings */ - ide_destroy_dmatable(drive); + status = inw(drive->hwif->dma_base + 2); return status != 0x00ff; diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 53f99853b065..f62ced855cf3 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -335,11 +335,9 @@ static int tx4939ide_dma_end(ide_drive_t *drive) /* read and clear the INTR & ERROR bits */ dma_stat = tx4939ide_clear_dma_status(base); - /* purge DMA mappings */ - ide_destroy_dmatable(drive); - /* verify good DMA status */ wmb(); + /* verify good DMA status */ if ((dma_stat & (ATA_DMA_INTR | ATA_DMA_ERR | ATA_DMA_ACTIVE)) == 0 && (ctl & (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST)) == (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST)) diff --git a/include/linux/ide.h b/include/linux/ide.h index d3035f2f1250..b6c4942fde11 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1479,6 +1479,7 @@ static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int erro static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } static inline int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) { return 0; } +static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ #ifdef CONFIG_BLK_DEV_IDEACPI -- cgit v1.2.3-71-gd317 From 5ae5412d9a23b05ab08461b202bad21ad8f6b66d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:20 +0200 Subject: ide: add ide_dma_prepare() helper * Add ide_dma_prepare() helper. * Convert ide_issue_pc() and do_rw_taskfile() to use it. * Make ide_build_sglist() static. There should be no functional changes caused by this patch. 
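A condensed sketch of the new helper and its effect on callers, assembled from the hunks below (kernel-internal code quoted from this patch, shown only to illustrate the flow; nothing outside the patch is assumed):

    /* New helper: returns 0 if DMA was set up, 1 if the caller must fall back to PIO. */
    int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd)
    {
            if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 ||
                ide_build_sglist(drive, cmd) == 0 ||
                drive->hwif->dma_ops->dma_setup(drive, cmd))
                    return 1;
            return 0;
    }

    /* Open-coded sglist/setup sequences in ide_issue_pc() and
     * do_rw_taskfile() shrink to a single call: */
    if (drive->dma)
            drive->dma = !ide_dma_prepare(drive, cmd);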
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 18 ++++-------------- drivers/ide/ide-dma.c | 11 ++++++++++- drivers/ide/ide-taskfile.c | 4 +--- include/linux/ide.h | 7 ++++--- 4 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 1481f71f8173..89d2339bdef3 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -638,7 +638,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) { struct ide_atapi_pc *pc; ide_hwif_t *hwif = drive->hwif; - const struct ide_dma_ops *dma_ops = hwif->dma_ops; ide_expiry_t *expiry = NULL; struct request *rq = hwif->rq; unsigned int timeout; @@ -652,12 +651,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) expiry = ide_cd_expiry; timeout = ATAPI_WAIT_PC; - if (drive->dma) { - if (ide_build_sglist(drive, cmd)) - drive->dma = !dma_ops->dma_setup(drive, cmd); - else - drive->dma = 0; - } + if (drive->dma) + drive->dma = !ide_dma_prepare(drive, cmd); } else { pc = drive->pc; @@ -675,13 +670,8 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) ide_dma_off(drive); } - if ((pc->flags & PC_FLAG_DMA_OK) && - (drive->dev_flags & IDE_DFLAG_USING_DMA)) { - if (ide_build_sglist(drive, cmd)) - drive->dma = !dma_ops->dma_setup(drive, cmd); - else - drive->dma = 0; - } + if (pc->flags & PC_FLAG_DMA_OK) + drive->dma = !ide_dma_prepare(drive, cmd); if (!drive->dma) pc->flags &= ~PC_FLAG_DMA_OK; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index b430898bbcd6..cf5897f5533f 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -128,7 +128,7 @@ int ide_dma_good_drive(ide_drive_t *drive) * operate in a portable fashion. 
*/ -int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) +static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; @@ -563,3 +563,12 @@ int ide_allocate_dma_engine(ide_hwif_t *hwif) return 0; } EXPORT_SYMBOL_GPL(ide_allocate_dma_engine); + +int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) +{ + if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || + ide_build_sglist(drive, cmd) == 0 || + drive->hwif->dma_ops->dma_setup(drive, cmd)) + return 1; + return 0; +} diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index a3b7a50562b2..dba68db629bf 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -100,9 +100,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) ide_execute_command(drive, cmd, handler, WAIT_WORSTCASE); return ide_started; case ATA_PROT_DMA: - if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || - ide_build_sglist(drive, cmd) == 0 || - dma_ops->dma_setup(drive, cmd)) + if (ide_dma_prepare(drive, cmd)) return ide_stopped; hwif->expiry = dma_ops->dma_timer_expiry; ide_execute_command(drive, cmd, ide_dma_intr, 2 * WAIT_CMD); diff --git a/include/linux/ide.h b/include/linux/ide.h index b6c4942fde11..78892e2a432c 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1443,7 +1443,8 @@ ide_startstop_t ide_dma_intr(ide_drive_t *); int ide_allocate_dma_engine(ide_hwif_t *); void ide_release_dma_engine(ide_hwif_t *); -int ide_build_sglist(ide_drive_t *, struct ide_cmd *); +int ide_dma_prepare(ide_drive_t *, struct ide_cmd *); + void ide_destroy_dmatable(ide_drive_t *); #ifdef CONFIG_BLK_DEV_IDEDMA_SFF @@ -1477,8 +1478,8 @@ static inline void ide_check_dma_crc(ide_drive_t *drive) { ; } static inline ide_startstop_t ide_dma_intr(ide_drive_t *drive) { return ide_stopped; } static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { return ide_stopped; } static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } -static inline int ide_build_sglist(ide_drive_t *drive, - struct ide_cmd *cmd) { return 0; } +static inline int ide_dma_prepare(ide_drive_t *drive, + struct ide_cmd *cmd) { return 1; } static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ -- cgit v1.2.3-71-gd317 From 8a4a5738ba499083cf4c5668895efe220b1946d3 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:21 +0200 Subject: ide: add ->dma_check method * Add (an optional) ->dma_check method for checking if DMA can be used for a given command and fail DMA setup in ide_dma_prepare() if necessary. * Convert alim15x3 and trm290 host drivers to use ->dma_check. * Rename ali15x3_dma_setup() to ali_dma_check() while at it. There should be no functional changes caused by this patch. 
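For orientation, the resulting hook usage, assembled from the hunks below (ali_dma_check() is quoted from this patch; a non-zero return from ->dma_check makes ide_dma_prepare() fail DMA setup so the command is issued via PIO instead):

    /* ide_dma_prepare() now consults the optional ->dma_check hook first: */
    if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 ||
        (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) ||
        ide_build_sglist(drive, cmd) == 0 ||
        dma_ops->dma_setup(drive, cmd))
            return 1;

    /* Host-driver implementation, here alim15x3: */
    static int ali_dma_check(ide_drive_t *drive, struct ide_cmd *cmd)
    {
            if (m5229_revision < 0xC2 && drive->media != ide_disk) {
                    if (cmd->tf_flags & IDE_TFLAG_WRITE)
                            return 1;       /* try PIO instead of DMA */
            }
            return 0;
    }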
Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/alim15x3.c | 9 +++++---- drivers/ide/ide-dma.c | 5 ++++- drivers/ide/trm290.c | 17 ++++++++++------- include/linux/ide.h | 1 + 4 files changed, 20 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index d3faf0b97f42..537da1cde16d 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -189,20 +189,20 @@ static void ali_set_dma_mode(ide_drive_t *drive, const u8 speed) } /** - * ali15x3_dma_setup - begin a DMA phase + * ali_dma_check - DMA check * @drive: target device * @cmd: command * * Returns 1 if the DMA cannot be performed, zero on success. */ -static int ali15x3_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) +static int ali_dma_check(ide_drive_t *drive, struct ide_cmd *cmd) { if (m5229_revision < 0xC2 && drive->media != ide_disk) { if (cmd->tf_flags & IDE_TFLAG_WRITE) return 1; /* try PIO instead of DMA */ } - return ide_dma_setup(drive, cmd); + return 0; } /** @@ -503,11 +503,12 @@ static const struct ide_port_ops ali_port_ops = { static const struct ide_dma_ops ali_dma_ops = { .dma_host_set = ide_dma_host_set, - .dma_setup = ali15x3_dma_setup, + .dma_setup = ide_dma_setup, .dma_start = ide_dma_start, .dma_end = ide_dma_end, .dma_test_irq = ide_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_check = ali_dma_check, .dma_timer_expiry = ide_dma_sff_timer_expiry, .dma_sff_read_status = ide_dma_sff_read_status, }; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index cf5897f5533f..c0505e2dfc2e 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -566,9 +566,12 @@ EXPORT_SYMBOL_GPL(ide_allocate_dma_engine); int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) { + const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops; + if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || + (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) || ide_build_sglist(drive, cmd) == 0 || - drive->hwif->dma_ops->dma_setup(drive, cmd)) + dma_ops->dma_setup(drive, cmd)) return 1; return 0; } diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index b91bb709af40..1076efd050dc 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -176,19 +176,21 @@ static void trm290_selectproc (ide_drive_t *drive) trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); } -static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) +static int trm290_dma_check(ide_drive_t *drive, struct ide_cmd *cmd) { - ide_hwif_t *hwif = drive->hwif; - unsigned int count, rw; - if (cmd->tf_flags & IDE_TFLAG_WRITE) { #ifdef TRM290_NO_DMA_WRITES /* always use PIO for writes */ return 1; #endif - rw = 1; - } else - rw = 2; + } + return 0; +} + +static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) +{ + ide_hwif_t *hwif = drive->hwif; + unsigned int count, rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 
1 : 2; count = ide_build_dmatable(drive, cmd); if (count == 0) { @@ -312,6 +314,7 @@ static struct ide_dma_ops trm290_dma_ops = { .dma_end = trm290_dma_end, .dma_test_irq = trm290_dma_test_irq, .dma_lost_irq = ide_dma_lost_irq, + .dma_check = trm290_dma_check, }; static const struct ide_port_info trm290_chipset __devinitdata = { diff --git a/include/linux/ide.h b/include/linux/ide.h index 78892e2a432c..b350667b83ad 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -717,6 +717,7 @@ struct ide_dma_ops { int (*dma_test_irq)(struct ide_drive_s *); void (*dma_lost_irq)(struct ide_drive_s *); /* below ones are optional */ + int (*dma_check)(struct ide_drive_s *, struct ide_cmd *); int (*dma_timer_expiry)(struct ide_drive_s *); void (*dma_clear)(struct ide_drive_s *); /* -- cgit v1.2.3-71-gd317 From f094d4d83bccee9277ddb6aadccf35747889426b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:24 +0200 Subject: ide: sanitize ide_build_sglist() and ide_destroy_dmatable() * Move ide_map_sg() calls out from ide_build_sglist() to ide_dma_prepare(). * Pass command to ide_destroy_dmatable(). * Rename ide_build_sglist() to ide_dma_map_sg() and ide_destroy_dmatable() to ide_dma_unmap_sg(). There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 3 ++- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-dma.c | 50 ++++++++++++++++++++++++------------------------- drivers/ide/sgiioc4.c | 7 ++++--- include/linux/ide.h | 6 +++--- 5 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index db6e617790bd..3f3fc7c7b2fe 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -326,6 +326,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) { struct ide_atapi_pc *pc = drive->pc; ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; const struct ide_tp_ops *tp_ops = hwif->tp_ops; xfer_func_t *xferfunc; @@ -346,7 +347,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) drive->waiting_for_dma = 0; rc = hwif->dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) { if (drive->media == ide_floppy) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index f5c7bb739f45..35729a47f797 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -640,7 +640,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) drive->dma = 0; drive->waiting_for_dma = 0; dma_error = hwif->dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); if (dma_error) { printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name, write ? 
"write" : "read"); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 4d3102887d9c..a5612eadc306 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -89,17 +89,16 @@ static const struct drive_list_entry drive_blacklist[] = { ide_startstop_t ide_dma_intr(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + struct ide_cmd *cmd = &hwif->cmd; u8 stat = 0, dma_stat = 0; drive->waiting_for_dma = 0; dma_stat = hwif->dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); stat = hwif->tp_ops->read_status(hwif); if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) { if (!dma_stat) { - struct ide_cmd *cmd = &hwif->cmd; - if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else @@ -119,8 +118,8 @@ int ide_dma_good_drive(ide_drive_t *drive) } /** - * ide_build_sglist - map IDE scatter gather for DMA I/O - * @drive: the drive to build the DMA table for + * ide_dma_map_sg - map IDE scatter gather for DMA I/O + * @drive: the drive to map the DMA table for * @cmd: command * * Perform the DMA mapping magic necessary to access the source or @@ -129,23 +128,19 @@ int ide_dma_good_drive(ide_drive_t *drive) * operate in a portable fashion. */ -static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) +static int ide_dma_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; int i; - ide_map_sg(drive, cmd); - if (cmd->tf_flags & IDE_TFLAG_WRITE) cmd->sg_dma_direction = DMA_TO_DEVICE; else cmd->sg_dma_direction = DMA_FROM_DEVICE; i = dma_map_sg(hwif->dev, sg, cmd->sg_nents, cmd->sg_dma_direction); - if (i == 0) - ide_map_sg(drive, cmd); - else { + if (i) { cmd->orig_sg_nents = cmd->sg_nents; cmd->sg_nents = i; } @@ -154,7 +149,7 @@ static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) } /** - * ide_destroy_dmatable - clean up DMA mapping + * ide_dma_unmap_sg - clean up DMA mapping * @drive: The drive to unmap * * Teardown mappings after DMA has completed. This must be called @@ -164,15 +159,14 @@ static int ide_build_sglist(ide_drive_t *drive, struct ide_cmd *cmd) * time. 
*/ -void ide_destroy_dmatable(ide_drive_t *drive) +void ide_dma_unmap_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct ide_cmd *cmd = &hwif->cmd; dma_unmap_sg(hwif->dev, hwif->sg_table, cmd->orig_sg_nents, cmd->sg_dma_direction); } -EXPORT_SYMBOL_GPL(ide_destroy_dmatable); +EXPORT_SYMBOL_GPL(ide_dma_unmap_sg); /** * ide_dma_off_quietly - Generic DMA kill @@ -471,6 +465,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { ide_hwif_t *hwif = drive->hwif; const struct ide_dma_ops *dma_ops = hwif->dma_ops; + struct ide_cmd *cmd = &hwif->cmd; struct request *rq; ide_startstop_t ret = ide_stopped; @@ -482,7 +477,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) printk(KERN_WARNING "%s: DMA timeout error\n", drive->name); drive->waiting_for_dma = 0; (void)dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); ret = ide_error(drive, "dma timeout error", hwif->tp_ops->read_status(hwif)); } else { @@ -495,7 +490,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) hwif->tp_ops->read_status(hwif)); drive->waiting_for_dma = 0; (void)dma_ops->dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); } } @@ -572,14 +567,19 @@ int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops; if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 || - (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)) || - ide_build_sglist(drive, cmd) == 0) - return 1; - if (dma_ops->dma_setup(drive, cmd)) { - ide_destroy_dmatable(drive); - ide_map_sg(drive, cmd); - return 1; - } + (dma_ops->dma_check && dma_ops->dma_check(drive, cmd))) + goto out; + ide_map_sg(drive, cmd); + if (ide_dma_map_sg(drive, cmd) == 0) + goto out_map; + if (dma_ops->dma_setup(drive, cmd)) + goto out_dma_unmap; drive->waiting_for_dma = 1; return 0; +out_dma_unmap: + ide_dma_unmap_sg(drive, cmd); +out_map: + ide_map_sg(drive, cmd); +out: + return 1; } diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index cb2657c4c976..6ef5a567d377 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -277,11 +277,12 @@ static void sgiioc4_dma_host_set(ide_drive_t *drive, int on) sgiioc4_clearirq(drive); } -static void -sgiioc4_resetproc(ide_drive_t * drive) +static void sgiioc4_resetproc(ide_drive_t *drive) { + struct ide_cmd *cmd = &drive->hwif->cmd; + sgiioc4_dma_end(drive); - ide_destroy_dmatable(drive); + ide_dma_unmap_sg(drive, cmd); sgiioc4_clearirq(drive); } diff --git a/include/linux/ide.h b/include/linux/ide.h index b350667b83ad..03c520917b7a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1445,8 +1445,7 @@ int ide_allocate_dma_engine(ide_hwif_t *); void ide_release_dma_engine(ide_hwif_t *); int ide_dma_prepare(ide_drive_t *, struct ide_cmd *); - -void ide_destroy_dmatable(ide_drive_t *); +void ide_dma_unmap_sg(ide_drive_t *, struct ide_cmd *); #ifdef CONFIG_BLK_DEV_IDEDMA_SFF int config_drive_for_dma(ide_drive_t *); @@ -1481,7 +1480,8 @@ static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int erro static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; } static inline int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd) { return 1; } -static inline void ide_destroy_dmatable(ide_drive_t *drive) { ; } +static inline void ide_dma_unmap_sg(ide_drive_t *drive, + struct ide_cmd *cmd) { ; } #endif /* CONFIG_BLK_DEV_IDEDMA */ #ifdef CONFIG_BLK_DEV_IDEACPI -- cgit v1.2.3-71-gd317 
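With the renames above, mapping and unmapping are symmetric around a command: ide_dma_prepare() maps the scatterlist, and every completion or error path unmaps it. A condensed view of the flow as it stands after this patch (taken from the ide-dma.c hunks above; explanatory comments added, no new code assumed):

    int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd)
    {
            const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops;

            if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 ||
                (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)))
                    goto out;
            ide_map_sg(drive, cmd);
            if (ide_dma_map_sg(drive, cmd) == 0)
                    goto out_map;
            if (dma_ops->dma_setup(drive, cmd))
                    goto out_dma_unmap;
            drive->waiting_for_dma = 1;
            return 0;
    out_dma_unmap:
            ide_dma_unmap_sg(drive, cmd);
    out_map:
            ide_map_sg(drive, cmd);         /* rebuild the sg table for PIO fallback */
    out:
            return 1;                       /* caller falls back to PIO */
    }

    /* Completion side, e.g. ide_dma_intr() or ide_dma_timeout_retry(): */
    dma_stat = hwif->dma_ops->dma_end(drive);
    ide_dma_unmap_sg(drive, cmd);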
From 41fa9f863baacd32dd049daf8050d55a0c9e6f1a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:25 +0200 Subject: ide: decrease size of ->pc_buf field in struct ide_atapi_pc struct ide_atapi_pc is often allocated on the stack and size of ->pc_buf size is 256 bytes. However since only ide_floppy_create_read_capacity_cmd() and idetape_create_inquiry_cmd() require such size allocate buffers for these pc-s explicitely and decrease ->pc_buf size to 64 bytes. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 5 ++++- drivers/ide/ide-floppy_ioctl.c | 5 ++++- drivers/ide/ide-tape.c | 4 ++++ include/linux/ide.h | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 7ae662334835..0faae3098295 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -385,7 +385,7 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) struct gendisk *disk = floppy->disk; struct ide_atapi_pc pc; u8 *cap_desc; - u8 header_len, desc_cnt; + u8 pc_buf[256], header_len, desc_cnt; int i, rc = 1, blocks, length; drive->bios_cyl = 0; @@ -395,6 +395,9 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) drive->capacity64 = 0; ide_floppy_create_read_capacity_cmd(&pc); + pc.buf = &pc_buf[0]; + pc.buf_size = sizeof(pc_buf); + if (ide_queue_pc_tail(drive, disk, &pc)) { printk(KERN_ERR PFX "Can't get floppy parameters\n"); return 1; diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c index 8f8be8546038..cd8a42027ede 100644 --- a/drivers/ide/ide-floppy_ioctl.c +++ b/drivers/ide/ide-floppy_ioctl.c @@ -36,9 +36,9 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, int __user *arg) { struct ide_disk_obj *floppy = drive->driver_data; - u8 header_len, desc_cnt; int i, blocks, length, u_array_size, u_index; int __user *argp; + u8 pc_buf[256], header_len, desc_cnt; if (get_user(u_array_size, arg)) return -EFAULT; @@ -47,6 +47,9 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, return -EINVAL; ide_floppy_create_read_capacity_cmd(pc); + pc->buf = &pc_buf[0]; + pc->buf_size = sizeof(pc_buf); + if (ide_queue_pc_tail(drive, floppy->disk, pc)) { printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n"); return -EIO; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 64dfa7458f8d..cafc67d9e2e8 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -2014,9 +2014,13 @@ static void idetape_get_inquiry_results(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; struct ide_atapi_pc pc; + u8 pc_buf[256]; char fw_rev[4], vendor_id[8], product_id[16]; idetape_create_inquiry_cmd(&pc); + pc.buf = &pc_buf[0]; + pc.buf_size = sizeof(pc_buf); + if (ide_queue_pc_tail(drive, tape->disk, &pc)) { printk(KERN_ERR "ide-tape: %s: can't get INQUIRY results\n", tape->name); diff --git a/include/linux/ide.h b/include/linux/ide.h index 03c520917b7a..0f48fbd46028 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -377,7 +377,7 @@ enum { * With each packet command, we allocate a buffer of IDE_PC_BUFFER_SIZE bytes. * This is used for several packet commands (not for READ/WRITE commands). 
*/ -#define IDE_PC_BUFFER_SIZE 256 +#define IDE_PC_BUFFER_SIZE 64 #define ATAPI_WAIT_PC (60 * HZ) struct ide_atapi_pc { -- cgit v1.2.3-71-gd317 From 349d12a1fe57d49287a539909cf14f362634342d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2009 20:15:26 +0200 Subject: ide-floppy: use ide_pio_bytes() * Fix ide_init_sg_cmd() setup for non-fs requests. * Convert ide_pc_intr() to use ide_pio_bytes() for floppy media. * Remove no longer needed ide_io_buffers() and sg/sg_cnt fields from struct ide_atapi_pc. * Remove partial completions; kill idefloppy_update_buffers(), as a result. * Add some more debugging statements. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 68 ++++++++++++------------------------------------ drivers/ide/ide-floppy.c | 24 ++++------------- include/linux/ide.h | 5 ---- 3 files changed, 21 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 40f413b20bd8..100e6f94b4f0 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -71,49 +71,6 @@ int ide_check_atapi_device(ide_drive_t *drive, const char *s) } EXPORT_SYMBOL_GPL(ide_check_atapi_device); -/* PIO data transfer routine using the scatter gather table. */ -int ide_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, - unsigned int bcount, int write) -{ - ide_hwif_t *hwif = drive->hwif; - const struct ide_tp_ops *tp_ops = hwif->tp_ops; - xfer_func_t *xf = write ? tp_ops->output_data : tp_ops->input_data; - struct scatterlist *sg = pc->sg; - char *buf; - int count, done = 0; - - while (bcount) { - count = min(sg->length - pc->b_count, bcount); - - if (PageHighMem(sg_page(sg))) { - unsigned long flags; - - local_irq_save(flags); - buf = kmap_atomic(sg_page(sg), KM_IRQ0) + sg->offset; - xf(drive, NULL, buf + pc->b_count, count); - kunmap_atomic(buf - sg->offset, KM_IRQ0); - local_irq_restore(flags); - } else { - buf = sg_virt(sg); - xf(drive, NULL, buf + pc->b_count, count); - } - - bcount -= count; - pc->b_count += count; - done += count; - - if (pc->b_count == sg->length) { - if (!--pc->sg_cnt) - break; - pc->sg = sg = sg_next(sg); - pc->b_count = 0; - } - } - - return done; -} -EXPORT_SYMBOL_GPL(ide_io_buffers); - void ide_init_pc(struct ide_atapi_pc *pc) { memset(pc, 0, sizeof(*pc)); @@ -353,6 +310,9 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) pc->xferred = pc->req_xfer; if (drive->pc_update_buffers) drive->pc_update_buffers(drive, pc); + + if (drive->media == ide_floppy) + ide_complete_rq(drive, 0, blk_rq_bytes(rq)); } debug_log("%s: DMA finished\n", drive->name); } @@ -408,12 +368,19 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->errors = 0; ide_complete_rq(drive, 0, blk_rq_bytes(rq)); } else { + unsigned int done; + if (blk_fs_request(rq) == 0 && uptodate <= 0) { if (rq->errors == 0) rq->errors = -EIO; } - ide_complete_rq(drive, uptodate ? 0 : -EIO, - ide_rq_bytes(rq)); + + if (drive->media == ide_tape) + done = ide_rq_bytes(rq); /* FIXME */ + else + done = blk_rq_bytes(rq); + + ide_complete_rq(drive, uptodate ? 0 : -EIO, done); } return ide_stopped; @@ -446,14 +413,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) xferfunc = write ? 
tp_ops->output_data : tp_ops->input_data; - if ((drive->media == ide_floppy && !pc->buf) || - (drive->media == ide_tape && pc->bh)) { + if (drive->media == ide_floppy && pc->buf == NULL) { + done = min_t(unsigned int, bcount, cmd->nleft); + ide_pio_bytes(drive, cmd, write, done); + } else if (drive->media == ide_tape && pc->bh) { done = drive->pc_io_buffers(drive, pc, bcount, write); - - /* FIXME: don't do partial completions */ - if (drive->media == ide_floppy) - ide_complete_rq(drive, 0, - done ? done : ide_rq_bytes(rq)); } else { done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred); xferfunc(drive, NULL, pc->cur_pos, done); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 0faae3098295..2b4868d95f8b 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -61,16 +61,6 @@ */ #define IDEFLOPPY_PC_DELAY (HZ/20) /* default delay for ZIP 100 (50ms) */ -static void idefloppy_update_buffers(ide_drive_t *drive, - struct ide_atapi_pc *pc) -{ - struct request *rq = pc->rq; - struct bio *bio = rq->bio; - - while ((bio = rq->bio) != NULL) - ide_complete_rq(drive, 0, ide_rq_bytes(rq)); -} - static int ide_floppy_callback(ide_drive_t *drive, int dsc) { struct ide_disk_obj *floppy = drive->driver_data; @@ -213,7 +203,6 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive, memcpy(rq->cmd, pc->c, 12); pc->rq = rq; - pc->b_count = 0; if (rq->cmd_flags & REQ_RW) pc->flags |= PC_FLAG_WRITING; pc->buf = NULL; @@ -227,7 +216,6 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy, ide_init_pc(pc); memcpy(pc->c, rq->cmd, sizeof(pc->c)); pc->rq = rq; - pc->b_count = 0; if (rq->data_len && rq_data_dir(rq) == WRITE) pc->flags |= PC_FLAG_WRITING; pc->buf = rq->data; @@ -244,10 +232,11 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, struct request *rq, sector_t block) { struct ide_disk_obj *floppy = drive->driver_data; - ide_hwif_t *hwif = drive->hwif; struct ide_cmd cmd; struct ide_atapi_pc *pc; + ide_debug_log(IDE_DBG_FUNC, "enter, cmd: 0x%x\n", rq->cmd[0]); + if (drive->debug_mask & IDE_DBG_RQ) blk_dump_rq_flags(rq, (rq->rq_disk ? rq->rq_disk->disk_name @@ -294,13 +283,10 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, cmd.rq = rq; if (blk_fs_request(rq) || pc->req_xfer) { - ide_init_sg_cmd(&cmd, rq->nr_sectors << 9); + ide_init_sg_cmd(&cmd, pc->req_xfer); ide_map_sg(drive, &cmd); } - pc->sg = hwif->sg_table; - pc->sg_cnt = cmd.sg_nents; - pc->rq = rq; return ide_floppy_issue_pc(drive, &cmd, pc); @@ -388,6 +374,8 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) u8 pc_buf[256], header_len, desc_cnt; int i, rc = 1, blocks, length; + ide_debug_log(IDE_DBG_FUNC, "enter"); + drive->bios_cyl = 0; drive->bios_head = drive->bios_sect = 0; floppy->blocks = 0; @@ -488,8 +476,6 @@ static void ide_floppy_setup(ide_drive_t *drive) u16 *id = drive->id; drive->pc_callback = ide_floppy_callback; - drive->pc_update_buffers = idefloppy_update_buffers; - drive->pc_io_buffers = ide_io_buffers; /* * We used to check revisions here. At this point however I'm giving up. 
diff --git a/include/linux/ide.h b/include/linux/ide.h index 0f48fbd46028..836c4c6cb7e3 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -415,9 +415,6 @@ struct ide_atapi_pc { struct idetape_bh *bh; char *b_data; - struct scatterlist *sg; - unsigned int sg_cnt; - unsigned long timeout; }; @@ -1177,8 +1174,6 @@ void ide_tf_read(ide_drive_t *, struct ide_cmd *); void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); -int ide_io_buffers(ide_drive_t *, struct ide_atapi_pc *, unsigned int, int); - extern void SELECT_DRIVE(ide_drive_t *); void SELECT_MASK(ide_drive_t *, int); -- cgit v1.2.3-71-gd317 From ecf3a31d2a08a419bdf919456f1724f5b72bde2c Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:30 +0200 Subject: ide: turn set_irq() method into write_devctl() method Turn set_irq() method with its software reset hack into write_devctl() method (for just writing a value into the device control register) at last... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 2 +- drivers/ide/au1xxx-ide.c | 3 +-- drivers/ide/falconide.c | 3 +-- drivers/ide/ide-eh.c | 7 +++---- drivers/ide/ide-h8300.c | 3 +-- drivers/ide/ide-io-std.c | 16 +++------------- drivers/ide/ide-io.c | 4 +++- drivers/ide/ide-iops.c | 4 ++-- drivers/ide/ide-pm.c | 2 +- drivers/ide/ide-probe.c | 6 +++--- drivers/ide/ide-taskfile.c | 2 +- drivers/ide/ns87415.c | 3 +-- drivers/ide/pmac.c | 14 ++------------ drivers/ide/q40ide.c | 3 +-- drivers/ide/scc_pata.c | 14 ++------------ drivers/ide/sgiioc4.c | 3 +-- drivers/ide/tx4938ide.c | 3 +-- drivers/ide/tx4939ide.c | 6 ++---- include/linux/ide.h | 6 ++---- 19 files changed, 32 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 8fc6ae958b0b..e6e96743aa7b 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -295,7 +295,7 @@ static const struct ide_tp_ops at91_ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = at91_ide_tf_load, .tf_read = at91_ide_tf_read, diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 1bfb43d0d3a8..2ca10d533dad 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -467,8 +467,7 @@ static const struct ide_tp_ops au1xxx_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index b368a5effc3a..5063be85dc33 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -89,8 +89,7 @@ static const struct ide_tp_ops falconide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 11664976eea3..de4b7f1c9c9f 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -401,15 +401,14 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) * immediate interrupt due to the edge transition it produces. 
* This single interrupt gives us a "fast poll" for drives that * recover from reset very quickly, saving us the first 50ms wait time. - * - * TODO: add ->softreset method and stop abusing ->set_irq */ /* set SRST and nIEN */ - tp_ops->set_irq(hwif, 4); + tp_ops->write_devctl(hwif, ATA_SRST | ATA_NIEN | ATA_DEVCTL_OBS); /* more than enough time */ udelay(10); /* clear SRST, leave nIEN (unless device is on the quirk list) */ - tp_ops->set_irq(hwif, drive->quirk_list == 2); + tp_ops->write_devctl(hwif, (drive->quirk_list == 2 ? 0 : ATA_NIEN) | + ATA_DEVCTL_OBS); /* more than enough time */ udelay(10); hwif->poll_timeout = jiffies + WAIT_WORSTCASE; diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 7492f28d1290..a57ccad61acf 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -159,8 +159,7 @@ static const struct ide_tp_ops h8300_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = h8300_tf_load, .tf_read = h8300_tf_read, diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 3a867e49a0af..bbeedce6b17d 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -64,23 +64,14 @@ u8 ide_read_altstatus(ide_hwif_t *hwif) } EXPORT_SYMBOL_GPL(ide_read_altstatus); -void ide_set_irq(ide_hwif_t *hwif, int on) +void ide_write_devctl(ide_hwif_t *hwif, u8 ctl) { - u8 ctl = ATA_DEVCTL_OBS; - - if (on == 4) { /* hack for SRST */ - ctl |= 4; - on &= ~4; - } - - ctl |= on ? 0 : 2; - if (hwif->host_flags & IDE_HFLAG_MMIO) writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr); else outb(ctl, hwif->io_ports.ctl_addr); } -EXPORT_SYMBOL_GPL(ide_set_irq); +EXPORT_SYMBOL_GPL(ide_write_devctl); void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { @@ -312,8 +303,7 @@ const struct ide_tp_ops default_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 3c52317d8524..5b57905a7d71 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -494,7 +494,9 @@ repeat: * quirk_list may not like intr setups/cleanups */ if (prev_port && prev_port->cur_dev->quirk_list == 0) - prev_port->tp_ops->set_irq(prev_port, 0); + prev_port->tp_ops->write_devctl(prev_port, + ATA_NIEN | + ATA_DEVCTL_OBS); hwif->host->cur_port = hwif; } diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 0caca342802d..ae227dd8466f 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -360,7 +360,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) SELECT_DRIVE(drive); SELECT_MASK(drive, 1); udelay(1); - tp_ops->set_irq(hwif, 0); + tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); memset(&cmd, 0, sizeof(cmd)); cmd.tf_flags = IDE_TFLAG_OUT_FEATURE | IDE_TFLAG_OUT_NSECT; @@ -372,7 +372,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES); if (drive->quirk_list == 2) - tp_ops->set_irq(hwif, 1); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); error = __ide_wait_stat(drive, drive->ready_stat, ATA_BUSY | ATA_DRQ | ATA_ERR, diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index ebf2d21ebdcb..20553d4c42a2 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -233,7 +233,7 @@ void ide_check_pm_state(ide_drive_t 
*drive, struct request *rq) if (rc) printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name); SELECT_DRIVE(drive); - hwif->tp_ops->set_irq(hwif, 1); + hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); rc = ide_wait_not_busy(hwif, 100000); if (rc) printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 7c1f1bf81836..d240f76b0da6 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -260,7 +260,7 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) * during the identify phase that the IRQ handler isn't expecting. */ if (io_ports->ctl_addr) - tp_ops->set_irq(hwif, 0); + tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); /* take a deep breath */ msleep(50); @@ -628,7 +628,7 @@ static int ide_port_wait_ready(ide_hwif_t *hwif) if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0 || (drive->dev_flags & IDE_DFLAG_PRESENT)) { SELECT_DRIVE(drive); - hwif->tp_ops->set_irq(hwif, 1); + hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); mdelay(2); rc = ide_wait_not_busy(hwif, 35000); if (rc) @@ -845,7 +845,7 @@ static int init_irq (ide_hwif_t *hwif) irq_handler = ide_intr; if (io_ports->ctl_addr) - hwif->tp_ops->set_irq(hwif, 1); + hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); if (request_irq(hwif->irq, irq_handler, sa, hwif->name, hwif)) goto out_up; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index dba68db629bf..47f13cd11031 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -80,7 +80,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { ide_tf_dump(drive->name, tf); - tp_ops->set_irq(hwif, 1); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); SELECT_MASK(drive, 0); tp_ops->tf_load(drive, cmd); } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 13a9e00efa13..00ab0be7335a 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -109,8 +109,7 @@ static const struct ide_tp_ops superio_tp_ops = { .exec_command = ide_exec_command, .read_status = superio_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = superio_tf_read, diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 879c3d8d9f36..7aa45ea37eeb 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -476,17 +476,8 @@ static void pmac_exec_command(ide_hwif_t *hwif, u8 cmd) + IDE_TIMING_CONFIG)); } -static void pmac_set_irq(ide_hwif_t *hwif, int on) +static void pmac_write_devctl(ide_hwif_t *hwif, u8 ctl) { - u8 ctl = ATA_DEVCTL_OBS; - - if (on == 4) { /* hack for SRST */ - ctl |= 4; - on &= ~4; - } - - ctl |= on ? 
0 : 2; - writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr); (void)readl((void __iomem *)(hwif->io_ports.data_addr + IDE_TIMING_CONFIG)); @@ -954,8 +945,7 @@ static const struct ide_tp_ops pmac_tp_ops = { .exec_command = pmac_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = pmac_set_irq, + .write_devctl = pmac_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index 2a43a2f49633..7fddfd34fcce 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -99,8 +99,7 @@ static const struct ide_tp_ops q40ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 6e47eac1cd7f..6ba4983d831c 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -148,17 +148,8 @@ static u8 scc_dma_sff_read_status(ide_hwif_t *hwif) return (u8)in_be32((void *)(hwif->dma_base + 4)); } -static void scc_set_irq(ide_hwif_t *hwif, int on) +static void scc_write_devctl(ide_hwif_t *hwif, u8 ctl) { - u8 ctl = ATA_DEVCTL_OBS; - - if (on == 4) { /* hack for SRST */ - ctl |= 4; - on &= ~4; - } - - ctl |= on ? 0 : 2; - out_be32((void *)hwif->io_ports.ctl_addr, ctl); eieio(); in_be32((void *)(hwif->dma_base + 0x01c)); @@ -843,8 +834,7 @@ static const struct ide_tp_ops scc_tp_ops = { .exec_command = scc_exec_command, .read_status = scc_read_status, .read_altstatus = scc_read_altstatus, - - .set_irq = scc_set_irq, + .write_devctl = scc_write_devctl, .tf_load = scc_tf_load, .tf_read = scc_tf_read, diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 6ef5a567d377..58980fcafc3b 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -503,8 +503,7 @@ static const struct ide_tp_ops sgiioc4_tp_ops = { .exec_command = ide_exec_command, .read_status = sgiioc4_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index 1c4a78ac1a20..ec3aa32fbbe0 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -204,8 +204,7 @@ static const struct ide_tp_ops tx4938ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = tx4938ide_tf_load, .tf_read = tx4938ide_tf_read, diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 77aee5b2ce95..43bc0372413a 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -571,8 +571,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = tx4939ide_tf_load, .tf_read = tx4939ide_tf_read, @@ -595,8 +594,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .exec_command = ide_exec_command, .read_status = ide_read_status, .read_altstatus = ide_read_altstatus, - - .set_irq = ide_set_irq, + .write_devctl = ide_write_devctl, .tf_load = tx4939ide_tf_load, .tf_read = ide_tf_read, diff --git a/include/linux/ide.h b/include/linux/ide.h index 836c4c6cb7e3..ccb70abe991b 100644 --- a/include/linux/ide.h +++ 
b/include/linux/ide.h @@ -655,8 +655,7 @@ struct ide_tp_ops { void (*exec_command)(struct hwif_s *, u8); u8 (*read_status)(struct hwif_s *); u8 (*read_altstatus)(struct hwif_s *); - - void (*set_irq)(struct hwif_s *, int); + void (*write_devctl)(struct hwif_s *, u8); void (*tf_load)(ide_drive_t *, struct ide_cmd *); void (*tf_read)(ide_drive_t *, struct ide_cmd *); @@ -1165,8 +1164,7 @@ void ide_tf_dump(const char *, struct ide_taskfile *); void ide_exec_command(ide_hwif_t *, u8); u8 ide_read_status(ide_hwif_t *); u8 ide_read_altstatus(ide_hwif_t *); - -void ide_set_irq(ide_hwif_t *, int); +void ide_write_devctl(ide_hwif_t *, u8); void ide_tf_load(ide_drive_t *, struct ide_cmd *); void ide_tf_read(ide_drive_t *, struct ide_cmd *); -- cgit v1.2.3-71-gd317 From 6762511934e6e7287ce3c8baac0d52ef64e3787b Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:30 +0200 Subject: ide: rename IDE_TFLAG_IN_[HOB_]FEATURE The feature register has never been readable -- when its location is read, one gets the error register value; hence rename IDE_TFLAG_IN_[HOB_]FEATURE into IDE_TFLAG_IN_[HOB_]ERROR and introduce the 'hob_error' field into the 'struct ide_taskfile' (despite the error register not really depending on the HOB bit). Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 16 ++++++++-------- drivers/ide/ide-h8300.c | 16 ++++++++-------- drivers/ide/ide-io-std.c | 16 ++++++++-------- drivers/ide/ide-iops.c | 2 +- drivers/ide/ns87415.c | 16 ++++++++-------- drivers/ide/scc_pata.c | 16 ++++++++-------- drivers/ide/tx4938ide.c | 17 ++++++++--------- drivers/ide/tx4939ide.c | 17 ++++++++--------- include/linux/ide.h | 12 ++++++++---- 9 files changed, 65 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index e6e96743aa7b..9dce793d93b4 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -244,8 +244,8 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ ide_mm_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = ide_mm_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = ide_mm_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = ide_mm_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -260,16 +260,16 @@ static void at91_ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { ide_mm_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = ide_mm_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = ide_mm_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = ide_mm_inb(io_ports->nsect_addr); + tf->hob_nsect = ide_mm_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = ide_mm_inb(io_ports->lbal_addr); + tf->hob_lbal = ide_mm_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = ide_mm_inb(io_ports->lbam_addr); + tf->hob_lbam = ide_mm_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = ide_mm_inb(io_ports->lbah_addr); + tf->hob_lbah = ide_mm_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index a57ccad61acf..1d45cd5b6a1c 100644 --- 
a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -100,8 +100,8 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -116,16 +116,16 @@ static void h8300_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = inb(io_ports->nsect_addr); + tf->hob_nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = inb(io_ports->lbal_addr); + tf->hob_lbal = inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = inb(io_ports->lbam_addr); + tf->hob_lbam = inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = inb(io_ports->lbah_addr); + tf->hob_lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index bbeedce6b17d..31f5c5f4c093 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -159,8 +159,8 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tf_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = tf_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = tf_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tf_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -175,16 +175,16 @@ void ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tf_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = tf_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = tf_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = tf_inb(io_ports->nsect_addr); + tf->hob_nsect = tf_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = tf_inb(io_ports->lbal_addr); + tf->hob_lbal = tf_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = tf_inb(io_ports->lbam_addr); + tf->hob_lbam = tf_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = tf_inb(io_ports->lbah_addr); + tf->hob_lbah = tf_inb(io_ports->lbah_addr); } } EXPORT_SYMBOL_GPL(ide_tf_read); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index ae227dd8466f..6f363a26700d 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -55,7 +55,7 @@ u8 ide_read_error(ide_drive_t *drive) struct ide_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_IN_FEATURE; + cmd.tf_flags = IDE_TFLAG_IN_ERROR; drive->hwif->tp_ops->tf_read(drive, &cmd); diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 00ab0be7335a..0a6cf74c3265 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c 
@@ -76,8 +76,8 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -92,16 +92,16 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = inb(io_ports->nsect_addr); + tf->hob_nsect = inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = inb(io_ports->lbal_addr); + tf->hob_lbal = inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = inb(io_ports->lbam_addr); + tf->hob_lbam = inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = inb(io_ports->lbah_addr); + tf->hob_lbah = inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 6ba4983d831c..ea0a9752c6f9 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -702,8 +702,8 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ scc_ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = scc_ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = scc_ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = scc_ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -718,16 +718,16 @@ static void scc_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { scc_ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = scc_ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = scc_ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr); + tf->hob_nsect = scc_ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = scc_ide_inb(io_ports->lbal_addr); + tf->hob_lbal = scc_ide_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = scc_ide_inb(io_ports->lbam_addr); + tf->hob_lbam = scc_ide_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = scc_ide_inb(io_ports->lbah_addr); + tf->hob_lbah = scc_ide_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index ec3aa32fbbe0..606c37f5267d 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -144,8 +144,8 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tx4938ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = tx4938ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = 
tx4938ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tx4938ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -160,17 +160,16 @@ static void tx4938ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tx4938ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = - tx4938ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = tx4938ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = tx4938ide_inb(io_ports->nsect_addr); + tf->hob_nsect = tx4938ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = tx4938ide_inb(io_ports->lbal_addr); + tf->hob_lbal = tx4938ide_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = tx4938ide_inb(io_ports->lbam_addr); + tf->hob_lbam = tx4938ide_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = tx4938ide_inb(io_ports->lbah_addr); + tf->hob_lbah = tx4938ide_inb(io_ports->lbah_addr); } } diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 43bc0372413a..f1e9da71110c 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -511,8 +511,8 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) /* be sure we're looking at the low order bits */ tx4939ide_outb(ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_FEATURE) - tf->feature = tx4939ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_ERROR) + tf->error = tx4939ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tx4939ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_LBAL) @@ -527,17 +527,16 @@ static void tx4939ide_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) if (cmd->tf_flags & IDE_TFLAG_LBA48) { tx4939ide_outb(ATA_HOB | ATA_DEVCTL_OBS, io_ports->ctl_addr); - if (cmd->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) - tf->hob_feature = - tx4939ide_inb(io_ports->feature_addr); + if (cmd->tf_flags & IDE_TFLAG_IN_HOB_ERROR) + tf->hob_error = tx4939ide_inb(io_ports->feature_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_NSECT) - tf->hob_nsect = tx4939ide_inb(io_ports->nsect_addr); + tf->hob_nsect = tx4939ide_inb(io_ports->nsect_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAL) - tf->hob_lbal = tx4939ide_inb(io_ports->lbal_addr); + tf->hob_lbal = tx4939ide_inb(io_ports->lbal_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAM) - tf->hob_lbam = tx4939ide_inb(io_ports->lbam_addr); + tf->hob_lbam = tx4939ide_inb(io_ports->lbam_addr); if (cmd->tf_flags & IDE_TFLAG_IN_HOB_LBAH) - tf->hob_lbah = tx4939ide_inb(io_ports->lbah_addr); + tf->hob_lbah = tx4939ide_inb(io_ports->lbah_addr); } } diff --git a/include/linux/ide.h b/include/linux/ide.h index ccb70abe991b..e919c865f0c7 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -265,7 +265,7 @@ enum { IDE_TFLAG_WRITE = (1 << 12), IDE_TFLAG_CUSTOM_HANDLER = (1 << 13), IDE_TFLAG_DMA_PIO_FALLBACK = (1 << 14), - IDE_TFLAG_IN_HOB_FEATURE = (1 << 15), + IDE_TFLAG_IN_HOB_ERROR = (1 << 15), IDE_TFLAG_IN_HOB_NSECT = (1 << 16), IDE_TFLAG_IN_HOB_LBAL = (1 << 17), IDE_TFLAG_IN_HOB_LBAM = (1 << 18), @@ -273,10 +273,10 @@ enum { IDE_TFLAG_IN_HOB_LBA = IDE_TFLAG_IN_HOB_LBAL | IDE_TFLAG_IN_HOB_LBAM | IDE_TFLAG_IN_HOB_LBAH, - IDE_TFLAG_IN_HOB = IDE_TFLAG_IN_HOB_FEATURE | + IDE_TFLAG_IN_HOB = IDE_TFLAG_IN_HOB_ERROR | 
IDE_TFLAG_IN_HOB_NSECT | IDE_TFLAG_IN_HOB_LBA, - IDE_TFLAG_IN_FEATURE = (1 << 20), + IDE_TFLAG_IN_ERROR = (1 << 20), IDE_TFLAG_IN_NSECT = (1 << 21), IDE_TFLAG_IN_LBAL = (1 << 22), IDE_TFLAG_IN_LBAM = (1 << 23), @@ -310,8 +310,12 @@ enum { struct ide_taskfile { u8 hob_data; /* 0: high data byte (for TASKFILE IOCTL) */ + /* 1-5: additional data to support LBA48 */ + union { + u8 hob_error; /* read: error */ + u8 hob_feature; /* write: feature */ + }; - u8 hob_feature; /* 1-5: additional data to support LBA48 */ u8 hob_nsect; u8 hob_lbal; u8 hob_lbam; -- cgit v1.2.3-71-gd317 From abb596b25edac1ec1acc4ef53df190771661c3d2 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:32 +0200 Subject: ide: turn selectproc() method into dev_select() method (take 5) Turn selectproc() method into dev_select() method by teaching it to write to the device register and moving it from 'struct ide_port_ops' to 'struct ide_tp_ops'. Signed-off-by: Sergei Shtylyov Cc: benh@kernel.crashing.org Cc: petkovbb@gmail.com [bart: add ->dev_select to at91_ide.c and tx4939.c (__BIG_ENDIAN case)] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/at91_ide.c | 1 + drivers/ide/au1xxx-ide.c | 1 + drivers/ide/falconide.c | 1 + drivers/ide/ht6560b.c | 20 +++++++++++++++-- drivers/ide/ide-h8300.c | 1 + drivers/ide/ide-io-std.c | 13 +++++++++++ drivers/ide/ide-iops.c | 12 +--------- drivers/ide/ns87415.c | 25 ++++++++++++++++----- drivers/ide/pmac.c | 58 ++++++++++++++++++++++++++++++++---------------- drivers/ide/q40ide.c | 1 + drivers/ide/qd65xx.c | 21 +++++++++++++++--- drivers/ide/scc_pata.c | 1 + drivers/ide/sgiioc4.c | 1 + drivers/ide/trm290.c | 20 +++++++++++++---- drivers/ide/tx4938ide.c | 1 + drivers/ide/tx4939ide.c | 4 +++- include/linux/ide.h | 6 ++--- 17 files changed, 139 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 04b39ff02d76..8eda552326e9 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -283,6 +283,7 @@ static const struct ide_tp_ops at91_ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = at91_ide_tf_load, .tf_read = at91_ide_tf_read, diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 2ca10d533dad..46013644c965 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -469,6 +469,7 @@ static const struct ide_tp_ops au1xxx_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c index 5063be85dc33..afa2af9a362b 100644 --- a/drivers/ide/falconide.c +++ b/drivers/ide/falconide.c @@ -91,6 +91,7 @@ static const struct ide_tp_ops falconide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c index c7e5c2246b79..2fb0f2965009 100644 --- a/drivers/ide/ht6560b.c +++ b/drivers/ide/ht6560b.c @@ -103,7 +103,7 @@ /* * This routine is invoked from ide.c to prepare for access to a given drive. 
*/ -static void ht6560b_selectproc (ide_drive_t *drive) +static void ht6560b_dev_select(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; unsigned long flags; @@ -143,6 +143,8 @@ static void ht6560b_selectproc (ide_drive_t *drive) #endif } local_irq_restore(flags); + + outb(drive->select | ATA_DEVICE_OBS, hwif->io_ports.device_addr); } /* @@ -305,15 +307,29 @@ static int probe_ht6560b; module_param_named(probe, probe_ht6560b, bool, 0); MODULE_PARM_DESC(probe, "probe for HT6560B chipset"); +static const struct ide_tp_ops ht6560b_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = ht6560b_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, +}; + static const struct ide_port_ops ht6560b_port_ops = { .init_dev = ht6560b_init_dev, .set_pio_mode = ht6560b_set_pio_mode, - .selectproc = ht6560b_selectproc, }; static const struct ide_port_info ht6560b_port_info __initdata = { .name = DRV_NAME, .chipset = ide_ht6560b, + .tp_ops = &ht6560b_tp_ops, .port_ops = &ht6560b_port_ops, .host_flags = IDE_HFLAG_SERIALIZE | /* is this needed? */ IDE_HFLAG_NO_DMA | diff --git a/drivers/ide/ide-h8300.c b/drivers/ide/ide-h8300.c index 8541a9abd7ac..dac9a6d44963 100644 --- a/drivers/ide/ide-h8300.c +++ b/drivers/ide/ide-h8300.c @@ -151,6 +151,7 @@ static const struct ide_tp_ops h8300_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = h8300_tf_load, .tf_read = h8300_tf_read, diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 7f77bb7db488..9cac281d82c4 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -73,6 +73,18 @@ void ide_write_devctl(ide_hwif_t *hwif, u8 ctl) } EXPORT_SYMBOL_GPL(ide_write_devctl); +void ide_dev_select(ide_drive_t *drive) +{ + ide_hwif_t *hwif = drive->hwif; + u8 select = drive->select | ATA_DEVICE_OBS; + + if (hwif->host_flags & IDE_HFLAG_MMIO) + writeb(select, (void __iomem *)hwif->io_ports.device_addr); + else + outb(select, hwif->io_ports.device_addr); +} +EXPORT_SYMBOL_GPL(ide_dev_select); + void ide_tf_load(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; @@ -280,6 +292,7 @@ const struct ide_tp_ops default_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 6f363a26700d..dfb0ec317fa3 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -29,17 +29,7 @@ void SELECT_DRIVE(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; - const struct ide_port_ops *port_ops = hwif->port_ops; - struct ide_cmd cmd; - - if (port_ops && port_ops->selectproc) - port_ops->selectproc(drive); - - memset(&cmd, 0, sizeof(cmd)); - cmd.tf_flags = IDE_TFLAG_OUT_DEVICE; - - drive->hwif->tp_ops->tf_load(drive, &cmd); + drive->hwif->tp_ops->dev_select(drive); } void SELECT_MASK(ide_drive_t *drive, int mask) diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index 9f6dff83b141..af1b421eb450 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -98,12 +98,15 @@ static void superio_tf_read(ide_drive_t *drive, struct ide_cmd *cmd) } } +static void ns87415_dev_select(ide_drive_t *drive); + static const struct ide_tp_ops superio_tp_ops = { .exec_command = ide_exec_command, 
.read_status = superio_read_status, .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ns87415_dev_select, .tf_load = ide_tf_load, .tf_read = superio_tf_read, @@ -182,10 +185,12 @@ static void ns87415_prepare_drive (ide_drive_t *drive, unsigned int use_dma) local_irq_restore(flags); } -static void ns87415_selectproc (ide_drive_t *drive) +static void ns87415_dev_select(ide_drive_t *drive) { ns87415_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); + + outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr); } static void ns87415_dma_start(ide_drive_t *drive) @@ -229,7 +234,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) * Also, leave IRQ masked during drive probing, to prevent infinite * interrupts from a potentially floating INTA.. * - * IRQs get unmasked in selectproc when drive is first used. + * IRQs get unmasked in dev_select() when drive is first used. */ (void) pci_read_config_dword(dev, 0x40, &ctrl); (void) pci_read_config_byte(dev, 0x09, &progif); @@ -281,8 +286,18 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) outb(0x60, hwif->dma_base + ATA_DMA_STATUS); } -static const struct ide_port_ops ns87415_port_ops = { - .selectproc = ns87415_selectproc, +static const struct ide_tp_ops ns87415_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = ns87415_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, }; static const struct ide_dma_ops ns87415_dma_ops = { @@ -299,7 +314,7 @@ static const struct ide_dma_ops ns87415_dma_ops = { static const struct ide_port_info ns87415_chipset __devinitdata = { .name = DRV_NAME, .init_hwif = init_hwif_ns87415, - .port_ops = &ns87415_port_ops, + .tp_ops = &ns87415_tp_ops, .dma_ops = &ns87415_dma_ops, .host_flags = IDE_HFLAG_TRUST_BIOS_FOR_DMA | IDE_HFLAG_NO_ATAPI_DMA, diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 7aa45ea37eeb..24ce1f805cd7 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -404,8 +404,6 @@ kauai_lookup_timing(struct kauai_timing* table, int cycle_time) #define IDE_WAKEUP_DELAY (1*HZ) static int pmac_ide_init_dma(ide_hwif_t *, const struct ide_port_info *); -static void pmac_ide_selectproc(ide_drive_t *drive); -static void pmac_ide_kauai_selectproc(ide_drive_t *drive); #define PMAC_IDE_REG(x) \ ((void __iomem *)((drive)->hwif->io_ports.data_addr + (x))) @@ -415,8 +413,7 @@ static void pmac_ide_kauai_selectproc(ide_drive_t *drive); * timing register when selecting that unit. This version is for * ASICs with a single timing register */ -static void -pmac_ide_selectproc(ide_drive_t *drive) +static void pmac_ide_apply_timings(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = @@ -434,8 +431,7 @@ pmac_ide_selectproc(ide_drive_t *drive) * timing register when selecting that unit. 
This version is for * ASICs with a dual timing register (Kauai) */ -static void -pmac_ide_kauai_selectproc(ide_drive_t *drive) +static void pmac_ide_kauai_apply_timings(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = @@ -464,9 +460,25 @@ pmac_ide_do_update_timings(ide_drive_t *drive) if (pmif->kind == controller_sh_ata6 || pmif->kind == controller_un_ata6 || pmif->kind == controller_k2_ata6) - pmac_ide_kauai_selectproc(drive); + pmac_ide_kauai_apply_timings(drive); else - pmac_ide_selectproc(drive); + pmac_ide_apply_timings(drive); +} + +static void pmac_dev_select(ide_drive_t *drive) +{ + pmac_ide_apply_timings(drive); + + writeb(drive->select | ATA_DEVICE_OBS, + (void __iomem *)drive->hwif->io_ports.device_addr); +} + +static void pmac_kauai_dev_select(ide_drive_t *drive) +{ + pmac_ide_kauai_apply_timings(drive); + + writeb(drive->select | ATA_DEVICE_OBS, + (void __iomem *)drive->hwif->io_ports.device_addr); } static void pmac_exec_command(ide_hwif_t *hwif, u8 cmd) @@ -947,6 +959,7 @@ static const struct ide_tp_ops pmac_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = pmac_write_devctl, + .dev_select = pmac_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, @@ -954,19 +967,24 @@ static const struct ide_tp_ops pmac_tp_ops = { .output_data = ide_output_data, }; -static const struct ide_port_ops pmac_ide_ata6_port_ops = { - .init_dev = pmac_ide_init_dev, - .set_pio_mode = pmac_ide_set_pio_mode, - .set_dma_mode = pmac_ide_set_dma_mode, - .selectproc = pmac_ide_kauai_selectproc, - .cable_detect = pmac_ide_cable_detect, +static const struct ide_tp_ops pmac_ata6_tp_ops = { + .exec_command = pmac_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = pmac_write_devctl, + + .dev_select = pmac_kauai_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, }; static const struct ide_port_ops pmac_ide_ata4_port_ops = { .init_dev = pmac_ide_init_dev, .set_pio_mode = pmac_ide_set_pio_mode, .set_dma_mode = pmac_ide_set_dma_mode, - .selectproc = pmac_ide_selectproc, .cable_detect = pmac_ide_cable_detect, }; @@ -974,7 +992,6 @@ static const struct ide_port_ops pmac_ide_port_ops = { .init_dev = pmac_ide_init_dev, .set_pio_mode = pmac_ide_set_pio_mode, .set_dma_mode = pmac_ide_set_dma_mode, - .selectproc = pmac_ide_selectproc, }; static const struct ide_dma_ops pmac_dma_ops; @@ -1011,15 +1028,18 @@ static int __devinit pmac_ide_setup_device(pmac_ide_hwif_t *pmif, hw_regs_t *hw) pmif->broken_dma = pmif->broken_dma_warn = 0; if (of_device_is_compatible(np, "shasta-ata")) { pmif->kind = controller_sh_ata6; - d.port_ops = &pmac_ide_ata6_port_ops; + d.tp_ops = &pmac_ata6_tp_ops; + d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA6; } else if (of_device_is_compatible(np, "kauai-ata")) { pmif->kind = controller_un_ata6; - d.port_ops = &pmac_ide_ata6_port_ops; + d.tp_ops = &pmac_ata6_tp_ops; + d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA5; } else if (of_device_is_compatible(np, "K2-UATA")) { pmif->kind = controller_k2_ata6; - d.port_ops = &pmac_ide_ata6_port_ops; + d.tp_ops = &pmac_ata6_tp_ops; + d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA5; } else if (of_device_is_compatible(np, "keylargo-ata")) { if (strcmp(np->name, "ata-4") == 0) { diff --git a/drivers/ide/q40ide.c b/drivers/ide/q40ide.c index 7fddfd34fcce..d007e7f66598 100644 --- a/drivers/ide/q40ide.c +++ b/drivers/ide/q40ide.c @@ -101,6 
+101,7 @@ static const struct ide_tp_ops q40ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c index 08c4fa35e9b1..c9a134986891 100644 --- a/drivers/ide/qd65xx.c +++ b/drivers/ide/qd65xx.c @@ -90,13 +90,15 @@ static int timings[4]={-1,-1,-1,-1}; /* stores current timing for each timer */ * This routine is invoked to prepare for access to a given drive. */ -static void qd65xx_select(ide_drive_t *drive) +static void qd65xx_dev_select(ide_drive_t *drive) { u8 index = (( (QD_TIMREG(drive)) & 0x80 ) >> 7) | (QD_TIMREG(drive) & 0x02); if (timings[index] != QD_TIMING(drive)) outb(timings[index] = QD_TIMING(drive), QD_TIMREG(drive)); + + outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr); } /* @@ -309,20 +311,33 @@ static void __init qd6580_init_dev(ide_drive_t *drive) drive->drive_data = (drive->dn & 1) ? t2 : t1; } +static const struct ide_tp_ops qd65xx_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = qd65xx_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, +}; + static const struct ide_port_ops qd6500_port_ops = { .init_dev = qd6500_init_dev, .set_pio_mode = qd6500_set_pio_mode, - .selectproc = qd65xx_select, }; static const struct ide_port_ops qd6580_port_ops = { .init_dev = qd6580_init_dev, .set_pio_mode = qd6580_set_pio_mode, - .selectproc = qd65xx_select, }; static const struct ide_port_info qd65xx_port_info __initdata = { .name = DRV_NAME, + .tp_ops = &qd65xx_tp_ops, .chipset = ide_qd65xx, .host_flags = IDE_HFLAG_IO_32BIT | IDE_HFLAG_NO_DMA, diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 97f8e0ef21b1..6d8dbd9c10bc 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -825,6 +825,7 @@ static const struct ide_tp_ops scc_tp_ops = { .read_altstatus = scc_read_altstatus, .write_devctl = scc_write_devctl, + .dev_select = ide_dev_select, .tf_load = scc_tf_load, .tf_read = scc_tf_read, diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index 58980fcafc3b..e5d2a48a84de 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -505,6 +505,7 @@ static const struct ide_tp_ops sgiioc4_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = ide_tf_load, .tf_read = ide_tf_read, diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c index c0528f27fcae..4b42ca091534 100644 --- a/drivers/ide/trm290.c +++ b/drivers/ide/trm290.c @@ -171,9 +171,11 @@ static void trm290_prepare_drive (ide_drive_t *drive, unsigned int use_dma) local_irq_restore(flags); } -static void trm290_selectproc (ide_drive_t *drive) +static void trm290_dev_select(ide_drive_t *drive) { trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA)); + + outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr); } static int trm290_dma_check(ide_drive_t *drive, struct ide_cmd *cmd) @@ -298,8 +300,18 @@ static void __devinit init_hwif_trm290(ide_hwif_t *hwif) #endif } -static const struct ide_port_ops trm290_port_ops = { - .selectproc = trm290_selectproc, +static const struct ide_tp_ops trm290_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = 
ide_read_altstatus, + .write_devctl = ide_write_devctl, + + .dev_select = trm290_dev_select, + .tf_load = ide_tf_load, + .tf_read = ide_tf_read, + + .input_data = ide_input_data, + .output_data = ide_output_data, }; static struct ide_dma_ops trm290_dma_ops = { @@ -315,7 +327,7 @@ static struct ide_dma_ops trm290_dma_ops = { static const struct ide_port_info trm290_chipset __devinitdata = { .name = DRV_NAME, .init_hwif = init_hwif_trm290, - .port_ops = &trm290_port_ops, + .tp_ops = &trm290_tp_ops, .dma_ops = &trm290_dma_ops, .host_flags = IDE_HFLAG_TRM290 | IDE_HFLAG_NO_ATAPI_DMA | diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index be391b615963..4cb79c4c2604 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -189,6 +189,7 @@ static const struct ide_tp_ops tx4938ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = tx4938ide_tf_load, .tf_read = tx4938ide_tf_read, diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 5a614d1c94f1..0040a9a3e26e 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -429,7 +429,7 @@ static void tx4939ide_tf_load_fixup(ide_drive_t *drive) * Fix ATA100 CORE System Control Register. (The write to the * Device/Head register may write wrong data to the System * Control Register) - * While Sys_Ctl is written here, selectproc is not needed. + * While Sys_Ctl is written here, dev_select() is not needed. */ tx4939ide_writew(sysctl, base, TX4939IDE_Sys_Ctl); } @@ -556,6 +556,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = tx4939ide_tf_load, .tf_read = tx4939ide_tf_read, @@ -579,6 +580,7 @@ static const struct ide_tp_ops tx4939ide_tp_ops = { .read_altstatus = ide_read_altstatus, .write_devctl = ide_write_devctl, + .dev_select = ide_dev_select, .tf_load = tx4939ide_tf_load, .tf_read = ide_tf_read, diff --git a/include/linux/ide.h b/include/linux/ide.h index e919c865f0c7..c69181c61fd8 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -603,7 +603,7 @@ struct ide_drive_s { unsigned int bios_cyl; /* BIOS/fdisk/LILO number of cyls */ unsigned int cyl; /* "real" number of cyls */ - unsigned int drive_data; /* used by set_pio_mode/selectproc */ + unsigned int drive_data; /* used by set_pio_mode/dev_select() */ unsigned int failures; /* current failure count */ unsigned int max_failures; /* maximum allowed failure count */ u64 probed_capacity;/* initial reported media capacity (ide-cd only currently) */ @@ -661,6 +661,7 @@ struct ide_tp_ops { u8 (*read_altstatus)(struct hwif_s *); void (*write_devctl)(struct hwif_s *, u8); + void (*dev_select)(ide_drive_t *); void (*tf_load)(ide_drive_t *, struct ide_cmd *); void (*tf_read)(ide_drive_t *, struct ide_cmd *); @@ -678,7 +679,6 @@ extern const struct ide_tp_ops default_tp_ops; * @init_dev: host specific initialization of a device * @set_pio_mode: routine to program host for PIO mode * @set_dma_mode: routine to program host for DMA mode - * @selectproc: tweaks hardware to select drive * @reset_poll: chipset polling based on hba specifics * @pre_reset: chipset specific changes to default for device-hba resets * @resetproc: routine to reset controller after a disk reset @@ -695,7 +695,6 @@ struct ide_port_ops { void (*init_dev)(ide_drive_t *); void (*set_pio_mode)(ide_drive_t *, const u8); void (*set_dma_mode)(ide_drive_t *, const u8); - void 
(*selectproc)(ide_drive_t *); int (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); void (*resetproc)(ide_drive_t *); @@ -1170,6 +1169,7 @@ u8 ide_read_status(ide_hwif_t *); u8 ide_read_altstatus(ide_hwif_t *); void ide_write_devctl(ide_hwif_t *, u8); +void ide_dev_select(ide_drive_t *); void ide_tf_load(ide_drive_t *, struct ide_cmd *); void ide_tf_read(ide_drive_t *, struct ide_cmd *); -- cgit v1.2.3-71-gd317 From fdd88f0af616db59a6a36bdf0185181d2b779f53 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 31 Mar 2009 20:15:33 +0200 Subject: ide: inline SELECT_DRIVE() Since SELECT_DRIVE() has boiled down to a mere dev_select() method call, it now makes sense to just inline it... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-eh.c | 7 ++++--- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-iops.c | 7 +------ drivers/ide/ide-pm.c | 5 +++-- drivers/ide/ide-probe.c | 15 ++++++++------- drivers/ide/ns87415.c | 2 +- include/linux/ide.h | 1 - 7 files changed, 18 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index de4b7f1c9c9f..5d5fb961b5ce 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -165,11 +165,12 @@ static ide_startstop_t do_reset1(ide_drive_t *, int); static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; + const struct ide_tp_ops *tp_ops = hwif->tp_ops; u8 stat; - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); udelay(10); - stat = hwif->tp_ops->read_status(hwif); + stat = tp_ops->read_status(hwif); if (OK_STAT(stat, 0, ATA_BUSY)) printk(KERN_INFO "%s: ATAPI reset complete\n", drive->name); @@ -348,7 +349,7 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) /* For an ATAPI device, first try an ATAPI SRST. */ if (drive->media != ide_disk && !do_not_try_atapi) { pre_reset(drive); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); udelay(20); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); ndelay(400); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 5589dce88674..1deb6d29b186 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -348,7 +348,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (blk_pm_request(rq)) ide_check_pm_state(drive, rq); - SELECT_DRIVE(drive); + drive->hwif->tp_ops->dev_select(drive); if (ide_wait_stat(&startstop, drive, drive->ready_stat, ATA_BUSY | ATA_DRQ, WAIT_READY)) { printk(KERN_ERR "%s: drive not ready for command\n", drive->name); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index dfb0ec317fa3..27bb70ddd459 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -27,11 +27,6 @@ #include #include -void SELECT_DRIVE(ide_drive_t *drive) -{ - drive->hwif->tp_ops->dev_select(drive); -} - void SELECT_MASK(ide_drive_t *drive, int mask) { const struct ide_port_ops *port_ops = drive->hwif->port_ops; @@ -347,7 +342,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) disable_irq_nosync(hwif->irq); udelay(1); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); SELECT_MASK(drive, 1); udelay(1); tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 20553d4c42a2..bb7858ebb7d1 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -223,6 +223,7 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) * point. 
*/ ide_hwif_t *hwif = drive->hwif; + const struct ide_tp_ops *tp_ops = hwif->tp_ops; struct request_queue *q = drive->queue; unsigned long flags; int rc; @@ -232,8 +233,8 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq) rc = ide_wait_not_busy(hwif, 35000); if (rc) printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name); - SELECT_DRIVE(drive); - hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); + tp_ops->dev_select(drive); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); rc = ide_wait_not_busy(hwif, 100000); if (rc) printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index d240f76b0da6..d8c1c3e735bb 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -390,13 +390,13 @@ static int do_probe (ide_drive_t *drive, u8 cmd) * (e.g. crw9624 as drive0 with disk as slave) */ msleep(50); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); msleep(50); if (ide_read_device(drive) != drive->select && present == 0) { if (drive->dn & 1) { /* exit with drive0 selected */ - SELECT_DRIVE(hwif->devices[0]); + tp_ops->dev_select(hwif->devices[0]); /* allow ATA_BUSY to assert & clear */ msleep(50); } @@ -422,7 +422,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) printk(KERN_ERR "%s: no response (status = 0x%02x), " "resetting drive\n", drive->name, stat); msleep(50); - SELECT_DRIVE(drive); + tp_ops->dev_select(drive); msleep(50); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); (void)ide_busy_sleep(hwif, WAIT_WORSTCASE, 0); @@ -441,7 +441,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) } if (drive->dn & 1) { /* exit with drive0 selected */ - SELECT_DRIVE(hwif->devices[0]); + tp_ops->dev_select(hwif->devices[0]); msleep(50); /* ensure drive irq is clear */ (void)tp_ops->read_status(hwif); @@ -605,6 +605,7 @@ out: static int ide_port_wait_ready(ide_hwif_t *hwif) { + const struct ide_tp_ops *tp_ops = hwif->tp_ops; ide_drive_t *drive; int i, rc; @@ -627,8 +628,8 @@ static int ide_port_wait_ready(ide_hwif_t *hwif) /* Ignore disks that we will not probe for later. */ if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0 || (drive->dev_flags & IDE_DFLAG_PRESENT)) { - SELECT_DRIVE(drive); - hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); + tp_ops->dev_select(drive); + tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS); mdelay(2); rc = ide_wait_not_busy(hwif, 35000); if (rc) @@ -640,7 +641,7 @@ static int ide_port_wait_ready(ide_hwif_t *hwif) out: /* Exit function with master reselected (let's be sane) */ if (i) - SELECT_DRIVE(hwif->devices[0]); + tp_ops->dev_select(hwif->devices[0]); return rc; } diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c index af1b421eb450..71a39fb3856f 100644 --- a/drivers/ide/ns87415.c +++ b/drivers/ide/ns87415.c @@ -262,7 +262,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) #ifdef __sparc_v9__ /* * XXX: Reset the device, if we don't it will not respond to - * SELECT_DRIVE() properly during first ide_probe_port(). + * dev_select() properly during first ide_probe_port(). 
*/ timeout = 10000; outb(12, hwif->io_ports.ctl_addr); diff --git a/include/linux/ide.h b/include/linux/ide.h index c69181c61fd8..a5d26f66ef78 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1176,7 +1176,6 @@ void ide_tf_read(ide_drive_t *, struct ide_cmd *); void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int); -extern void SELECT_DRIVE(ide_drive_t *); void SELECT_MASK(ide_drive_t *, int); u8 ide_read_error(ide_drive_t *); -- cgit v1.2.3-71-gd317 From 3e93cd671813e204c258f1e6c797959920cf7772 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:00:13 -0400 Subject: Take fs_struct handling to new file (fs/fs_struct.c) Pure code move; two new helper functions for nfsd and daemonize (unshare_fs_struct() and daemonize_fs_struct() resp.; for now - the same code as used to be in callers). unshare_fs_struct() exported (for nfsd, as copy_fs_struct()/exit_fs() used to be), copy_fs_struct() and exit_fs() don't need exports anymore. Signed-off-by: Al Viro --- fs/Makefile | 2 +- fs/fs_struct.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++ fs/internal.h | 6 ++ fs/namei.c | 7 --- fs/namespace.c | 68 ---------------------- fs/nfsd/nfssvc.c | 7 +-- include/linux/fs_struct.h | 2 + kernel/exit.c | 31 +--------- kernel/fork.c | 29 +--------- 9 files changed, 155 insertions(+), 138 deletions(-) create mode 100644 fs/fs_struct.c (limited to 'include/linux') diff --git a/fs/Makefile b/fs/Makefile index 6e82a307bcd4..b5cd8e18dd9f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o + stack.o fs_struct.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/fs_struct.c b/fs/fs_struct.c new file mode 100644 index 000000000000..36e0a123bbf3 --- /dev/null +++ b/fs/fs_struct.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include + +/* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_root(struct fs_struct *fs, struct path *path) +{ + struct path old_root; + + write_lock(&fs->lock); + old_root = fs->root; + fs->root = *path; + path_get(path); + write_unlock(&fs->lock); + if (old_root.dentry) + path_put(&old_root); +} + +/* + * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. + * It can block. 
+ */ +void set_fs_pwd(struct fs_struct *fs, struct path *path) +{ + struct path old_pwd; + + write_lock(&fs->lock); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get(path); + write_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +void chroot_fs_refs(struct path *old_root, struct path *new_root) +{ + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { + write_lock(&fs->lock); + if (fs->root.dentry == old_root->dentry + && fs->root.mnt == old_root->mnt) { + path_get(new_root); + fs->root = *new_root; + count++; + } + if (fs->pwd.dentry == old_root->dentry + && fs->pwd.mnt == old_root->mnt) { + path_get(new_root); + fs->pwd = *new_root; + count++; + } + write_unlock(&fs->lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + while (count--) + path_put(old_root); +} + +void put_fs_struct(struct fs_struct *fs) +{ + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); + } +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct * fs = tsk->fs; + + if (fs) { + task_lock(tsk); + tsk->fs = NULL; + task_unlock(tsk); + put_fs_struct(fs); + } +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + atomic_set(&fs->count, 1); + rwlock_init(&fs->lock); + fs->umask = old->umask; + read_lock(&old->lock); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + read_unlock(&old->lock); + } + return fs; +} + +int unshare_fs_struct(void) +{ + struct fs_struct *fsp = copy_fs_struct(current->fs); + if (!fsp) + return -ENOMEM; + exit_fs(current); + current->fs = fsp; + return 0; +} +EXPORT_SYMBOL_GPL(unshare_fs_struct); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .count = ATOMIC_INIT(1), + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .umask = 0022, +}; + +void daemonize_fs_struct(void) +{ + struct fs_struct *fs; + + exit_fs(current); /* current->fs->count--; */ + fs = &init_fs; + current->fs = fs; + atomic_inc(&fs->count); +} diff --git a/fs/internal.h b/fs/internal.h index 53af885f1732..477a105f8df3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct linux_binprm; +struct path; /* * block_dev.c @@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void __init mnt_init(void); + +/* + * fs_struct.c + */ +extern void chroot_fs_refs(struct path *, struct path *); diff --git a/fs/namei.c b/fs/namei.c index d040ce11785d..4c65a6460138 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2897,10 +2897,3 @@ EXPORT_SYMBOL(vfs_symlink); EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(dentry_unhash); EXPORT_SYMBOL(generic_readlink); - -/* to be mentioned only in INIT_TASK */ -struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), - .lock = __RW_LOCK_UNLOCKED(init_fs.lock), - .umask = 0022, -}; diff --git a/fs/namespace.c b/fs/namespace.c index f7ec283ccfbb..1e56303c718e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2092,74 +2092,6 @@ out1: return retval; } -/* - * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. 
- */ -void set_fs_root(struct fs_struct *fs, struct path *path) -{ - struct path old_root; - - write_lock(&fs->lock); - old_root = fs->root; - fs->root = *path; - path_get(path); - write_unlock(&fs->lock); - if (old_root.dentry) - path_put(&old_root); -} - -/* - * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) -{ - struct path old_pwd; - - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); - - if (old_pwd.dentry) - path_put(&old_pwd); -} - -static void chroot_fs_refs(struct path *old_root, struct path *new_root) -{ - struct task_struct *g, *p; - struct fs_struct *fs; - int count = 0; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - fs = p->fs; - if (fs) { - write_lock(&fs->lock); - if (fs->root.dentry == old_root->dentry - && fs->root.mnt == old_root->mnt) { - path_get(new_root); - fs->root = *new_root; - count++; - } - if (fs->pwd.dentry == old_root->dentry - && fs->pwd.mnt == old_root->mnt) { - path_get(new_root); - fs->pwd = *new_root; - count++; - } - write_unlock(&fs->lock); - } - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - while (count--) - path_put(old_root); -} - /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 07e4f5d7baa8..144d69918614 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -404,7 +404,6 @@ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; - struct fs_struct *fsp; int err, preverr = 0; /* Lock module and set up kernel thread */ @@ -413,13 +412,11 @@ nfsd(void *vrqstp) /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ - fsp = copy_fs_struct(current->fs); - if (!fsp) { + if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; } - exit_fs(current); - current->fs = fsp; + current->fs->umask = 0; /* diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 18b467dbe278..298cef1c0793 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -20,5 +20,7 @@ extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); extern void put_fs_struct(struct fs_struct *); +extern void daemonize_fs_struct(void); +extern int unshare_fs_struct(void); #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c6..ad8375758a79 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -429,7 +429,6 @@ EXPORT_SYMBOL(disallow_signal); void daemonize(const char *name, ...) { va_list args; - struct fs_struct *fs; sigset_t blocked; va_start(args, name); @@ -462,11 +461,7 @@ void daemonize(const char *name, ...) 
/* Become as one with the init task */ - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - + daemonize_fs_struct(); exit_files(current); current->files = init_task.files; atomic_inc(&current->files->count); @@ -565,30 +560,6 @@ void exit_files(struct task_struct *tsk) } } -void put_fs_struct(struct fs_struct *fs) -{ - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } -} - -void exit_fs(struct task_struct *tsk) -{ - struct fs_struct * fs = tsk->fs; - - if (fs) { - task_lock(tsk); - tsk->fs = NULL; - task_unlock(tsk); - put_fs_struct(fs); - } -} - -EXPORT_SYMBOL_GPL(exit_fs); - #ifdef CONFIG_MM_OWNER /* * Task p is exiting and it owned mm, lets find a new owner for it diff --git a/kernel/fork.c b/kernel/fork.c index 47c15840a381..05c02dc586b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -681,38 +681,13 @@ fail_nomem: return retval; } -static struct fs_struct *__copy_fs_struct(struct fs_struct *old) -{ - struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); - /* We don't need to lock fs - think why ;-) */ - if (fs) { - atomic_set(&fs->count, 1); - rwlock_init(&fs->lock); - fs->umask = old->umask; - read_lock(&old->lock); - fs->root = old->root; - path_get(&old->root); - fs->pwd = old->pwd; - path_get(&old->pwd); - read_unlock(&old->lock); - } - return fs; -} - -struct fs_struct *copy_fs_struct(struct fs_struct *old) -{ - return __copy_fs_struct(old); -} - -EXPORT_SYMBOL_GPL(copy_fs_struct); - static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { if (clone_flags & CLONE_FS) { atomic_inc(&current->fs->count); return 0; } - tsk->fs = __copy_fs_struct(current->fs); + tsk->fs = copy_fs_struct(current->fs); if (!tsk->fs) return -ENOMEM; return 0; @@ -1545,7 +1520,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) if ((unshare_flags & CLONE_FS) && (fs && atomic_read(&fs->count) > 1)) { - *new_fsp = __copy_fs_struct(current->fs); + *new_fsp = copy_fs_struct(current->fs); if (!*new_fsp) return -ENOMEM; } -- cgit v1.2.3-71-gd317 From 498052bba55ecaff58db6a1436b0e25bfd75a7ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2009 07:20:30 -0400 Subject: New locking/refcounting for fs_struct * all changes of current->fs are done under task_lock and write_lock of old fs->lock * refcount is not atomic anymore (same protection) * its decrements are done when removing reference from current; at the same time we decide whether to free it. * put_fs_struct() is gone * new field - ->in_exec. Set by check_unsafe_exec() if we are trying to do execve() and only subthreads share fs_struct. Cleared when finishing exec (success and failure alike). Makes CLONE_FS fail with -EAGAIN if set. * check_unsafe_exec() may fail with -EAGAIN if another execve() from subthread is in progress.
Signed-off-by: Al Viro --- fs/compat.c | 16 +++++++++-- fs/exec.c | 31 +++++++++++++++++---- fs/fs_struct.c | 69 +++++++++++++++++++++++++++++++++-------------- fs/internal.h | 2 +- fs/proc/task_nommu.c | 2 +- include/linux/fs_struct.h | 8 +++--- kernel/fork.c | 37 ++++++++++++++++++------- 7 files changed, 121 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 55efdfebdf5a..baabf203b847 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1441,12 +1442,15 @@ int compat_do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1488,6 +1492,9 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(&current->fs->lock); + current->fs->in_exec = 0; + write_unlock(&current->fs->lock); current->in_execve = 0; mutex_unlock(&current->cred_exec_mutex); acct_update_integrals(current); @@ -1506,6 +1513,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(&current->fs->lock); + current->fs->in_exec = 0; + write_unlock(&current->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(&current->cred_exec_mutex); diff --git a/fs/exec.c b/fs/exec.c index c5128fbc9165..07a059664b73 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1056,16 +1056,18 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold current->cred_exec_mutex to protect against * PTRACE_ATTACH */ -void check_unsafe_exec(struct linux_binprm *bprm) +int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned long flags; unsigned n_fs, n_sighand; + int res = 0; bprm->unsafe = tracehook_unsafe_exec(p); n_fs = 1; n_sighand = 1; + write_lock(&p->fs->lock); lock_task_sighand(p, &flags); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) @@ -1073,11 +1075,19 @@ void check_unsafe_exec(struct linux_binprm *bprm) n_sighand++; } - if (atomic_read(&p->fs->count) > n_fs || - atomic_read(&p->sighand->count) > n_sighand) + if (p->fs->users > n_fs || + atomic_read(&p->sighand->count) > n_sighand) { bprm->unsafe |= LSM_UNSAFE_SHARE; + } else { + if (p->fs->in_exec) + res = -EAGAIN; + p->fs->in_exec = 1; + } unlock_task_sighand(p, &flags); + write_unlock(&p->fs->lock); + + return res; } /* @@ -1296,12 +1306,15 @@ int do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1344,6 +1357,9 @@ int do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(&current->fs->lock); + current->fs->in_exec = 0; + write_unlock(&current->fs->lock); current->in_execve = 0; mutex_unlock(&current->cred_exec_mutex); acct_update_integrals(current); @@ -1362,6 +1378,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(&current->fs->lock); + current->fs->in_exec = 0; + write_unlock(&current->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(&current->cred_exec_mutex); diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 36e0a123bbf3..41cff72b377b 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -72,25 +72,27 @@ void chroot_fs_refs(struct path *old_root, struct path
*new_root) path_put(old_root); } -void put_fs_struct(struct fs_struct *fs) +void free_fs_struct(struct fs_struct *fs) { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); } void exit_fs(struct task_struct *tsk) { - struct fs_struct * fs = tsk->fs; + struct fs_struct *fs = tsk->fs; if (fs) { + int kill; task_lock(tsk); + write_lock(&fs->lock); tsk->fs = NULL; + kill = !--fs->users; + write_unlock(&fs->lock); task_unlock(tsk); - put_fs_struct(fs); + if (kill) + free_fs_struct(fs); } } @@ -99,7 +101,8 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { - atomic_set(&fs->count, 1); + fs->users = 1; + fs->in_exec = 0; rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); @@ -114,28 +117,54 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) int unshare_fs_struct(void) { - struct fs_struct *fsp = copy_fs_struct(current->fs); - if (!fsp) + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; + + if (!new_fs) return -ENOMEM; - exit_fs(current); - current->fs = fsp; + + task_lock(current); + write_lock(&fs->lock); + kill = !--fs->users; + current->fs = new_fs; + write_unlock(&fs->lock); + task_unlock(current); + + if (kill) + free_fs_struct(fs); + return 0; } EXPORT_SYMBOL_GPL(unshare_fs_struct); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), + .users = 1, .lock = __RW_LOCK_UNLOCKED(init_fs.lock), .umask = 0022, }; void daemonize_fs_struct(void) { - struct fs_struct *fs; + struct fs_struct *fs = current->fs; + + if (fs) { + int kill; + + task_lock(current); - exit_fs(current); /* current->fs->count--; */ - fs = &init_fs; - current->fs = fs; - atomic_inc(&fs->count); + write_lock(&init_fs.lock); + init_fs.users++; + write_unlock(&init_fs.lock); + + write_lock(&fs->lock); + current->fs = &init_fs; + kill = !--fs->users; + write_unlock(&fs->lock); + + task_unlock(current); + if (kill) + free_fs_struct(fs); + } } diff --git a/fs/internal.h b/fs/internal.h index 477a105f8df3..b4dac4fb6b61 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -44,7 +44,7 @@ extern void __init chrdev_init(void); /* * exec.c */ -extern void check_unsafe_exec(struct linux_binprm *); +extern int check_unsafe_exec(struct linux_binprm *); /* * namespace.c diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc8..6ca01052c5bc 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -49,7 +49,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(mm); - if (current->fs && atomic_read(&current->fs->count) > 1) + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else bytes += kobjsize(current->fs); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 298cef1c0793..78a05bfcd8eb 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -4,12 +4,10 @@ #include struct fs_struct { - atomic_t count; /* This usage count is used by check_unsafe_exec() for - * security checking purposes - therefore it may not be
- */ + int users; rwlock_t lock; int umask; + int in_exec; struct path root, pwd; }; @@ -19,7 +17,7 @@ extern void exit_fs(struct task_struct *); extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); -extern void put_fs_struct(struct fs_struct *); +extern void free_fs_struct(struct fs_struct *); extern void daemonize_fs_struct(void); extern int unshare_fs_struct(void); diff --git a/kernel/fork.c b/kernel/fork.c index 05c02dc586b1..51f138a131de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -683,11 +683,19 @@ fail_nomem: static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { + struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { - atomic_inc(¤t->fs->count); + /* tsk->fs is already what we want */ + write_lock(&fs->lock); + if (fs->in_exec) { + write_unlock(&fs->lock); + return -EAGAIN; + } + fs->users++; + write_unlock(&fs->lock); return 0; } - tsk->fs = copy_fs_struct(current->fs); + tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; @@ -1518,12 +1526,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; - if ((unshare_flags & CLONE_FS) && - (fs && atomic_read(&fs->count) > 1)) { - *new_fsp = copy_fs_struct(current->fs); - if (!*new_fsp) - return -ENOMEM; - } + if (!(unshare_flags & CLONE_FS) || !fs) + return 0; + + /* don't need lock here; in the worst case we'll do useless copy */ + if (fs->users == 1) + return 0; + + *new_fsp = copy_fs_struct(fs); + if (!*new_fsp) + return -ENOMEM; return 0; } @@ -1639,8 +1651,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (new_fs) { fs = current->fs; + write_lock(&fs->lock); current->fs = new_fs; - new_fs = fs; + if (--fs->users) + new_fs = NULL; + else + new_fs = fs; + write_unlock(&fs->lock); } if (new_mm) { @@ -1679,7 +1696,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_fs: if (new_fs) - put_fs_struct(new_fs); + free_fs_struct(new_fs); bad_unshare_cleanup_thread: bad_unshare_out: -- cgit v1.2.3-71-gd317 From ce3b0f8d5c2203301fc87f3aaaed73e5819e2a48 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:08:22 -0400 Subject: New helper - current_umask() current->fs->umask is what most of fs_struct users are doing. Put that into a helper function. 
Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- fs/btrfs/acl.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/cifs/dir.c | 4 ++-- fs/cifs/inode.c | 4 ++-- fs/ext2/acl.c | 2 +- fs/ext3/acl.c | 2 +- fs/ext4/acl.c | 2 +- fs/fat/inode.c | 2 +- fs/fs_struct.c | 6 ++++++ fs/generic_acl.c | 2 +- fs/gfs2/acl.c | 2 +- fs/hfsplus/options.c | 2 +- fs/hpfs/super.c | 2 +- fs/jffs2/acl.c | 2 +- fs/jfs/acl.c | 2 +- fs/namei.c | 6 +++--- fs/nfs/nfs3proc.c | 6 +++--- fs/nfs/nfs4proc.c | 2 +- fs/ocfs2/acl.c | 2 +- fs/omfs/inode.c | 2 +- fs/reiserfs/xattr_acl.c | 2 +- fs/xfs/linux-2.6/xfs_iops.c | 4 ++-- include/linux/fs.h | 2 ++ ipc/mqueue.c | 2 +- net/unix/af_unix.c | 2 +- 26 files changed, 39 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 64f068540d0d..706eb5c7e2ee 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -635,7 +635,7 @@ long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode, if (dentry->d_inode) goto out_dput; - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (flags & SPU_CREATE_GANG) ret = spufs_create_gang(nd->path.dentry->d_inode, diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1d53b62dbba5..7fdd184a528d 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (IS_POSIXACL(dir) && acl) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bca729fc80c8..7594bec1be10 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, goto out_dput; if (!IS_POSIXACL(parent->dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(parent->mnt); if (error) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 2f35cccfcd8d..54dce78fbb73 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, return -ENOMEM; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (oplockEnabled) oplock = REQ_OPLOCK; @@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, rc = -ENOMEM; else if (pTcon->unix_ext) { struct cifs_unix_set_info_args args = { - .mode = mode & ~current->fs->umask, + .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a8797cc60805..f121a80fdd6f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) goto mkdir_out; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode, NULL /* netfid */, pInfo, &oplock, full_path, cifs_sb->local_nls, @@ -1204,7 +1204,7 @@ mkdir_get_info: if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) direntry->d_inode->i_nlink = 2; - mode &= ~current->fs->umask; + mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ if (inode->i_mode & S_ISGID) mode |= S_ISGID; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index ae8c4f850b27..d46e38cb85c5 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir) return 
PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index b60bb241880c..d81ef2fdb08e 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 694ed6fadcc8..647e0d65a284 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index de0004fe6e00..ab657db4c94e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -930,7 +930,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); - opts->fs_fmask = opts->fs_dmask = current->fs->umask; + opts->fs_fmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; opts->iocharset = fat_default_iocharset; diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 41cff72b377b..6ac219338670 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -138,6 +138,12 @@ int unshare_fs_struct(void) } EXPORT_SYMBOL_GPL(unshare_fs_struct); +int current_umask(void) +{ + return current->fs->umask; +} +EXPORT_SYMBOL(current_umask); + /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 995d63b2e747..e0b53aa7bbec 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, mode_t mode = inode->i_mode; int error; - inode->i_mode = mode & ~current->fs->umask; + inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) acl = ops->getacl(dir, ACL_TYPE_DEFAULT); if (acl) { diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 43764f4fa763..fa881bdc3d85 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (error) return error; if (!acl) { - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (mode != ip->i_inode.i_mode) error = munge_mode(ip, mode); return error; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index bab7f8d1bdfa..3fcbb0e1f6fc 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) opts->creator = HFSPLUS_DEF_CR_TYPE; opts->type = HFSPLUS_DEF_CR_TYPE; - opts->umask = current->fs->umask; + opts->umask = current_umask(); opts->uid = current_uid(); opts->gid = current_gid(); opts->part = -1; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 0d049b8919c4..c696d01bc8f7 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -477,7 +477,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) uid = current_uid(); gid = current_gid(); - umask = current->fs->umask; + umask = current_umask(); lowercase = 0; conv = CONV_BINARY; eas = 2; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index d98713777a1b..77ccf8cb0823 100644 --- 
a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) return PTR_ERR(acl); if (!acl) { - *i_mode &= ~current->fs->umask; + *i_mode &= ~current_umask(); } else { if (S_ISDIR(*i_mode)) jffs2_iset_acl(inode, &f->i_acl_default, acl); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a166c1669e82..06ca1b8d2054 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) cleanup: posix_acl_release(acl); } else - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; diff --git a/fs/namei.c b/fs/namei.c index 4c65a6460138..964c0249444b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1578,7 +1578,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path, struct dentry *dir = nd->path.dentry; if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = security_path_mknod(&nd->path, path->dentry, mode, 0); if (error) goto out_unlock; @@ -1989,7 +1989,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, goto out_unlock; } if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = may_mknod(mode); if (error) goto out_dput; @@ -2067,7 +2067,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) goto out_unlock; if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c55be7a7679e..e47d4400fb87 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.verifier[1] = current->pid; } - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); for (;;) { status = nfs3_do_create(dir, dentry, data); @@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) dprintk("NFS call mkdir %s\n", dentry->d_name.name); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) @@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, MAJOR(rdev), MINOR(rdev)); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8dde84b988d9..bbee587dd597 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1509,7 +1509,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) attr.ia_mode = nd->intent.open.create_mode; attr.ia_valid = ATTR_MODE; if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current->fs->umask; + attr.ia_mode &= ~current_umask(); } else { attr.ia_valid = 0; BUG_ON(nd->intent.open.flags & O_CREAT); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 12dfb44c22e5..fbeaec762103 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle, return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { struct posix_acl 
*clone; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 633e9dc972bb..aa6fc30772af 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -421,7 +421,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_uid = current_uid(); sbi->s_gid = current_gid(); - sbi->s_dmask = sbi->s_fmask = current->fs->umask; + sbi->s_dmask = sbi->s_fmask = current_umask(); if (!parse_options((char *) data, sbi)) goto end; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d423416d93d1..c303c426fe2b 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, } else { apply_umask: /* no ACL, apply umask */ - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } return err; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 7aa53fefc67f..2940612e3aeb 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -227,7 +227,7 @@ xfs_vn_mknod( xfs_dentry_to_name(&name, dentry); if (IS_POSIXACL(dir) && !default_acl) - mode &= ~current->fs->umask; + mode &= ~current_umask(); switch (mode & S_IFMT) { case S_IFCHR: @@ -416,7 +416,7 @@ xfs_vn_symlink( mode_t mode; mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); + (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); xfs_dentry_to_name(&name, dentry); error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd7..3d7bd5447ca3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1741,6 +1741,8 @@ extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); +extern int current_umask(void); + /* /sys/fs */ extern struct kobject *fs_kobj; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index a8ddadbc7459..916785363f0f 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -602,7 +602,7 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry, dentry->d_fsdata = attr; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); ret = mnt_want_write(mqueue_mnt); if (ret) goto out; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index baac91049b0e..9dcc6e7f96ec 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -832,7 +832,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * All right, let's create it. */ mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current->fs->umask); + (SOCK_INODE(sock)->i_mode & ~current_umask()); err = mnt_want_write(nd.path.mnt); if (err) goto out_mknod_dput; -- cgit v1.2.3-71-gd317 From 5ad4e53bd5406ee214ddc5a41f03f779b8b2d526 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:50:06 -0400 Subject: Get rid of indirect include of fs_struct.h Don't pull it in sched.h; very few files actually need it and those can include directly. 
sched.h itself only needs forward declaration of struct fs_struct; Signed-off-by: Al Viro --- arch/cris/kernel/process.c | 1 - fs/dcache.c | 1 + fs/exec.c | 1 + fs/fs_struct.c | 1 + fs/namei.c | 1 + fs/namespace.c | 1 + fs/open.c | 1 + fs/proc/base.c | 1 + fs/proc/task_nommu.c | 1 + include/linux/mnt_namespace.h | 2 ++ include/linux/nsproxy.h | 1 + include/linux/sched.h | 3 ++- init/do_mounts.c | 1 + kernel/auditsc.c | 1 + kernel/exec_domain.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sys.c | 1 + security/tomoyo/realpath.c | 1 + 19 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 60816e876455..4df0b320d524 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/dcache.c b/fs/dcache.c index 90bbd7e1b116..0dc4de21f088 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/exec.c b/fs/exec.c index 614991bf0c87..052a961e41aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 6ac219338670..eee059052db5 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -3,6 +3,7 @@ #include #include #include +#include /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. diff --git a/fs/namei.c b/fs/namei.c index 964c0249444b..b8433ebfae05 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) diff --git a/fs/namespace.c b/fs/namespace.c index 1e56303c718e..c6f54e4c4290 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "pnode.h" diff --git a/fs/open.c b/fs/open.c index 75b61677daaf..377eb25b6abf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -29,6 +29,7 @@ #include #include #include +#include int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/proc/base.c b/fs/proc/base.c index e0afd326b688..f71559784bfb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "internal.h" /* NOTE: diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 6ca01052c5bc..253afc04484c 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 830bbcd449d6..3a059298cc19 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -22,6 +22,8 @@ struct proc_mounts { int event; }; +struct fs_struct; + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void __put_mnt_ns(struct mnt_namespace *ns); diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index afad7dec1b36..7b370c7cfeff 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -8,6 +8,7 @@ struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; +struct fs_struct; /* * A structure to contain pointers to all per-process diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2de..b4e065ea0de1 100644 --- 
a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,7 +68,7 @@ struct sched_param { #include #include #include -#include +#include #include #include #include @@ -97,6 +97,7 @@ struct futex_pi_state; struct robust_list_head; struct bio; struct bts_tracer; +struct fs_struct; /* * List of flags we want to share for kernel threads, diff --git a/init/do_mounts.c b/init/do_mounts.c index 8d4ff5afc1d8..dd7ee5f203f3 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8cbddff6c283..2bfc64786765 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "audit.h" diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index cb8e9626c215..c35452cadded 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -18,6 +18,7 @@ #include #include #include +#include static void default_handler(int, struct pt_regs *); diff --git a/kernel/exit.c b/kernel/exit.c index ad8375758a79..b5d656845c90 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 51f138a131de..e82a14577a98 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882a..ce182aaed204 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index d47f16b844b2..3bbe01a7a4b5 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "common.h" #include "realpath.h" -- cgit v1.2.3-71-gd317 From 47e4491b40df73c3b117e3d80b31b5b512a4b19f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Apr 2009 07:07:16 -0400 Subject: Cleanup after commit 585d3bc06f4ca57f975a5a1f698f65a45ea66225 fsync_bdev() export and a bunch of stubs for !CONFIG_BLOCK case had been left behind Signed-off-by: Al Viro --- fs/block_dev.c | 1 + fs/buffer.c | 1 - include/linux/buffer_head.h | 12 ------------ include/linux/fs.h | 12 ++++++++++++ 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 8c3c6899ccf3..f45dbc18dd17 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev) } return sync_blockdev(bdev); } +EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev -- lock a filesystem and force it into a consistent state diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97cb..b71e52925c83 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3281,7 +3281,6 @@ EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); -EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..fc91665d39d0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -332,22 +332,10 @@ extern int __set_page_dirty_buffers(struct page *page); static inline void buffer_init(void) {} static inline int try_to_free_buffers(struct page *page) { return 1; } -static inline int sync_blockdev(struct block_device 
*bdev) { return 0; } static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } -static inline void invalidate_bdev(struct block_device *bdev) {} - -static inline struct super_block *freeze_bdev(struct block_device *sb) -{ - return NULL; -} - -static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3d7bd5447ca3..674134725597 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1886,6 +1886,18 @@ extern int fsync_super(struct super_block *); extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} +static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline void invalidate_bdev(struct block_device *bdev) {} + +static inline struct super_block *freeze_bdev(struct block_device *sb) +{ + return NULL; +} + +static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + return 0; +} #endif extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; -- cgit v1.2.3-71-gd317 From ced117c73edc917e96dea7cca98c91383f0792f7 Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Tue, 31 Mar 2009 00:41:20 +0300 Subject: Remove two unneeded exports and make two symbols static in fs/mpage.c Commit 29a814d2ee0e43c2980f33f91c1311ec06c0aa35 (vfs: add hooks for ext4's delayed allocation support) exported the following functions mpage_bio_submit() __mpage_writepage() for the benefit of ext4's delayed allocation support. Since commit a1d6cc563bfdf1bf2829d3e6ce4d8b774251796b (ext4: Rework the ext4_da_writepages() function), these functions are not used by the ext4 driver anymore. However, the now unnecessary exports still remain, and this patch removes those. Moreover, these two functions can become static again. The issue was spotted by namespacecheck. Signed-off-by: Dmitri Vorobiev Reviewed-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- fs/mpage.c | 13 +++++++++---- include/linux/mpage.h | 10 ---------- 2 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/mpage.c b/fs/mpage.c index 16c3ef37eae3..680ba60863ff 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -struct bio *mpage_bio_submit(int rw, struct bio *bio) +static struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } -EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage); * just allocate full-size (16-page) BIOs. 
*/ -int __mpage_writepage(struct page *page, struct writeback_control *wbc, +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + +static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { struct mpage_data *mpd = data; @@ -648,7 +654,6 @@ out: mpd->bio = bio; return ret; } -EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 5c42821da2d1..068a0c9946af 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -11,21 +11,11 @@ */ #ifdef CONFIG_BLOCK -struct mpage_data { - struct bio *bio; - sector_t last_block_in_bio; - get_block_t *get_block; - unsigned use_writepage; -}; - struct writeback_control; -struct bio *mpage_bio_submit(int rw, struct bio *bio); int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); -int __mpage_writepage(struct page *page, struct writeback_control *wbc, - void *data); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, -- cgit v1.2.3-71-gd317 From ae149b6bec64a09373ba20fce75f8aa6b14b78fd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:15 -0700 Subject: proc tty: add struct tty_operations::proc_fops Used for gradual switch of TTY drivers from using ->read_proc which helps with gradual switch from ->read_proc for the whole tree. As side effect, fix possible race condition when ->data initialized after PDE is hooked into proc tree. ->proc_fops takes precedence over ->read_proc. 
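For illustration only (hypothetical driver, not taken from this patch): a TTY driver that used to implement ->read_proc would now point ->proc_fops at a seq_file-backed file_operations. Since proc_create_data() receives the driver pointer at creation time, the proc entry never exists without its ->data, which is the race the changelog refers to. A minimal sketch, assuming 2.6.29-era proc and seq_file interfaces:

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/tty_driver.h>

static int foo_tty_proc_show(struct seq_file *m, void *v)
{
	struct tty_driver *driver = m->private;

	seq_printf(m, "driver: %s\n", driver->driver_name);
	return 0;
}

static int foo_tty_proc_open(struct inode *inode, struct file *file)
{
	/* ->data was set by proc_create_data() in proc_tty_register_driver() */
	return single_open(file, foo_tty_proc_show, PDE(inode)->data);
}

static const struct file_operations foo_tty_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= foo_tty_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static const struct tty_operations foo_tty_ops = {
	/* ...the driver's usual callbacks... */
	.proc_fops	= &foo_tty_proc_fops,
};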
Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_tty.c | 20 +++++++++++++------- include/linux/tty_driver.h | 1 + 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 4a9e0f65ae60..854827b1d463 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -144,16 +144,22 @@ void proc_tty_register_driver(struct tty_driver *driver) { struct proc_dir_entry *ent; - if (!driver->ops->read_proc || !driver->driver_name || - driver->proc_entry) + if (!driver->driver_name || driver->proc_entry) return; - ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); - if (!ent) + if (driver->ops->proc_fops) { + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); + if (!ent) + return; + } else if (driver->ops->read_proc) { + ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); + if (!ent) + return; + ent->read_proc = driver->ops->read_proc; + ent->data = driver; + } else return; - ent->read_proc = driver->ops->read_proc; - ent->data = driver; - driver->proc_entry = ent; } diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 08e088334dba..c9a69575ded6 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -264,6 +264,7 @@ struct tty_operations { int (*poll_get_char)(struct tty_driver *driver, int line); void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif + const struct file_operations *proc_fops; }; struct tty_driver { -- cgit v1.2.3-71-gd317 From 0f043a81ebe84be3576667f04fdda481609e3816 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:19:25 -0700 Subject: proc tty: remove struct tty_operations::read_proc struct tty_operations::proc_fops took it's place and there is one less create_proc_read_entry() user now! 
Signed-off-by: Alexey Dobriyan Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/capi/capi.c | 7 ------- fs/proc/proc_tty.c | 18 ++++-------------- include/linux/tty_driver.h | 2 -- net/bluetooth/rfcomm/tty.c | 6 ------ 4 files changed, 4 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 3e468d2cf730..2d8352419c0d 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -1331,12 +1331,6 @@ static void capinc_tty_send_xchar(struct tty_struct *tty, char ch) #endif } -static int capinc_tty_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - return 0; -} - static struct tty_driver *capinc_tty_driver; static const struct tty_operations capinc_ops = { @@ -1358,7 +1352,6 @@ static const struct tty_operations capinc_ops = { .flush_buffer = capinc_tty_flush_buffer, .set_ldisc = capinc_tty_set_ldisc, .send_xchar = capinc_tty_send_xchar, - .read_proc = capinc_tty_read_proc, }; static int capinc_tty_init(void) diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 854827b1d463..83adcc869437 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -144,22 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver) { struct proc_dir_entry *ent; - if (!driver->driver_name || driver->proc_entry) + if (!driver->driver_name || driver->proc_entry || + !driver->ops->proc_fops) return; - if (driver->ops->proc_fops) { - ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, - driver->ops->proc_fops, driver); - if (!ent) - return; - } else if (driver->ops->read_proc) { - ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); - if (!ent) - return; - ent->read_proc = driver->ops->read_proc; - ent->data = driver; - } else - return; + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); driver->proc_entry = ent; } diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index c9a69575ded6..8615d661ab60 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -252,8 +252,6 @@ struct tty_operations { void (*set_ldisc)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, int timeout); void (*send_xchar)(struct tty_struct *tty, char ch); - int (*read_proc)(char *page, char **start, off_t off, - int count, int *eof, void *data); int (*tiocmget)(struct tty_struct *tty, struct file *file); int (*tiocmset)(struct tty_struct *tty, struct file *file, unsigned int set, unsigned int clear); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index abdc703a11d2..cab71ea2796d 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -1093,11 +1093,6 @@ static void rfcomm_tty_hangup(struct tty_struct *tty) } } -static int rfcomm_tty_read_proc(char *buf, char **start, off_t offset, int len, int *eof, void *unused) -{ - return 0; -} - static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; @@ -1156,7 +1151,6 @@ static const struct tty_operations rfcomm_ops = { .send_xchar = rfcomm_tty_send_xchar, .hangup = rfcomm_tty_hangup, .wait_until_sent = rfcomm_tty_wait_until_sent, - .read_proc = rfcomm_tty_read_proc, .tiocmget = rfcomm_tty_tiocmget, .tiocmset = rfcomm_tty_tiocmset, }; -- cgit v1.2.3-71-gd317 From 9de1581e75ba9d7979766d4ab6d56f0f2d87f7c6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Mar 2009 15:19:29 
-0700 Subject: get_mm_hiwater_xxx: trivial, s/define/inline/ Andrew pointed out get_mm_hiwater_xxx() evaluate "mm" argument thrice/twice, make them inline. Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2de..481fad3a9b42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -391,8 +391,15 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) -#define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm)) -#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm) +static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) +{ + return max(mm->hiwater_rss, get_mm_rss(mm)); +} + +static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) +{ + return max(mm->hiwater_vm, mm->total_vm); +} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); -- cgit v1.2.3-71-gd317 From ee99c71c59f897436ec65debb99372b3146f9985 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:19:31 -0700 Subject: mm: introduce for_each_populated_zone() macro Impact: cleanup In almost cases, for_each_zone() is used with populated_zone(). It's because almost function doesn't need memoryless node information. Therefore, for_each_populated_zone() can help to make code simplify. This patch has no functional change. [akpm@linux-foundation.org: small cleanup] Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Reviewed-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 ++++++++ kernel/power/snapshot.c | 9 +++------ kernel/power/swsusp.c | 17 ++++++++--------- mm/page_alloc.c | 26 +++++--------------------- mm/vmscan.c | 4 +--- mm/vmstat.c | 11 ++--------- 6 files changed, 27 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1aca6cebbb78..26ef24076b76 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -806,6 +806,14 @@ extern struct zone *next_zone(struct zone *zone); zone; \ zone = next_zone(zone)) +#define for_each_populated_zone(zone) \ + for (zone = (first_online_pgdat())->node_zones; \ + zone; \ + zone = next_zone(zone)) \ + if (!populated_zone(zone)) \ + ; /* do nothing */ \ + else + static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f5fc2d7680f2..33e2e4a819f9 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) INIT_LIST_HEAD(list); - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long zone_start, zone_end; struct mem_extent *ext, *cur, *aux; - if (!populated_zone(zone)) - continue; - zone_start = zone->zone_start_pfn; zone_end = zone->zone_start_pfn + zone->spanned_pages; @@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void) struct zone *zone; unsigned int cnt = 0; - for_each_zone(zone) - if (populated_zone(zone) && is_highmem(zone)) + for_each_populated_zone(zone) + if (is_highmem(zone)) cnt += zone_page_state(zone, NR_FREE_PAGES); return cnt; diff 
--git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index a92c91451559..1ee6636414b2 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -229,17 +229,16 @@ int swsusp_shrink_memory(void) size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; tmp = size; size += highmem_size; - for_each_zone (zone) - if (populated_zone(zone)) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= + for_each_populated_zone(zone) { + tmp += snapshot_additional_pages(zone); + if (is_highmem(zone)) { + highmem_size -= zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } + } else { + tmp -= zone_page_state(zone, NR_FREE_PAGES); + tmp += zone->lowmem_reserve[ZONE_NORMAL]; } + } if (highmem_size < 0) highmem_size = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3803ea8c27d..cbd532161f68 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu) unsigned long flags; struct zone *zone; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - if (!populated_zone(zone)) - continue; - pset = zone_pcp(zone, cpu); pcp = &pset->pcp; @@ -1879,10 +1876,7 @@ void show_free_areas(void) int cpu; struct zone *zone; - for_each_zone(zone) { - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -1922,12 +1916,9 @@ void show_free_areas(void) global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); - for_each_zone(zone) { + for_each_populated_zone(zone) { int i; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s" " free:%lukB" @@ -1967,12 +1958,9 @@ void show_free_areas(void) printk("\n"); } - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s: ", zone->name); @@ -2784,11 +2772,7 @@ static int __cpuinit process_zones(int cpu) node_set_state(node, N_CPU); /* this node has a cpu */ - for_each_zone(zone) { - - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, node); if (!zone_pcp(zone, cpu)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1bca60f0c527..301f057fd115 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2061,11 +2061,9 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, struct zone *zone; unsigned long ret = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { enum lru_list l; - if (!populated_zone(zone)) - continue; if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; diff --git a/mm/vmstat.c b/mm/vmstat.c index 8cd81ea1ddc1..9826766f1274 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void) int cpu; int threshold; - for_each_zone(zone) { - - if (!zone->present_pages) - continue; - + for_each_populated_zone(zone) { threshold = calculate_threshold(zone); for_each_online_cpu(cpu) @@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu) int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *p; - if (!populated_zone(zone)) - continue; - p = zone_pcp(zone, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) -- cgit v1.2.3-71-gd317 From 
e3a7cca1ef4c1af9b0acef9bd66eff6582a737b5 Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Tue, 31 Mar 2009 15:19:39 -0700 Subject: vfs: add/use account_page_dirtied() Add a helper function account_page_dirtied(). Use that from two callsites. reiser4 adds a function which adds a third callsite. Signed-off-by: Edward Shishkin Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 9 +-------- include/linux/mm.h | 1 + mm/page-writeback.c | 22 +++++++++++++++------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97cb..73abe6d8218c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -621,14 +621,7 @@ static void __set_page_dirty(struct page *page, spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/include/linux/mm.h b/include/linux/mm.h index b1ea37fc7a24..2223f8dfa568 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -834,6 +834,7 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); +void account_page_dirtied(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 40ca7cdb653e..6aa92b03c747 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1197,6 +1197,20 @@ int __set_page_dirty_no_writeback(struct page *page) return 0; } +/* + * Helper function for set_page_dirty family. + * NOTE: This relies on being atomic wrt interrupts. + */ +void account_page_dirtied(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } +} + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -1226,13 +1240,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } -- cgit v1.2.3-71-gd317 From d1d7487173eab8352125cf6cc271940f24254bd4 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Mar 2009 15:23:14 -0700 Subject: mm: remove pagevec_swap_free() pagevec_swap_free() is now unused. 
Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Rik van Riel Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 1 - mm/swap.c | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 7b2886fa7fdc..bab82f4c571c 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -24,7 +24,6 @@ void __pagevec_release(struct pagevec *pvec); void __pagevec_free(struct pagevec *pvec); void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); void pagevec_strip(struct pagevec *pvec); -void pagevec_swap_free(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, diff --git a/mm/swap.c b/mm/swap.c index 8adb9feb61e1..6e83084c1f6c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -456,29 +456,6 @@ void pagevec_strip(struct pagevec *pvec) } } -/** - * pagevec_swap_free - try to free swap space from the pages in a pagevec - * @pvec: pagevec with swapcache pages to free the swap space of - * - * The caller needs to hold an extra reference to each page and - * not hold the page lock on the pages. This function uses a - * trylock on the page lock so it may not always free the swap - * space associated with a page. - */ -void pagevec_swap_free(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - if (PageSwapCache(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); - } - } -} - /** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed -- cgit v1.2.3-71-gd317 From 610a77e04a8d9fe8764dc484e2182fa251ce1cc2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 31 Mar 2009 15:23:16 -0700 Subject: memdup_user(): introduce I notice there are many places doing copy_from_user() which follows kmalloc(): dst = kmalloc(len, GFP_KERNEL); if (!dst) return -ENOMEM; if (copy_from_user(dst, src, len)) { kfree(dst); return -EFAULT } memdup_user() is a wrapper of the above code. With this new function, we don't have to write 'len' twice, which can lead to typos/mistakes. It also produces smaller code and kernel text. A quick grep shows 250+ places where memdup_user() *may* be used. I'll prepare a patchset to do this conversion. Signed-off-by: Li Zefan Cc: KOSAKI Motohiro Cc: Americo Wang Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 1 + mm/util.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index d18fc198aa2f..8852739f36df 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -12,6 +12,7 @@ #include /* for NULL */ extern char *strndup_user(const char __user *, long); +extern void *memdup_user(const void __user *, size_t); /* * Include machine specific inline routines diff --git a/mm/util.c b/mm/util.c index 37eaccdf3054..7c122e49f769 100644 --- a/mm/util.c +++ b/mm/util.c @@ -69,6 +69,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup); +/** + * memdup_user - duplicate memory region from user space + * + * @src: source address in user space + * @len: number of bytes to copy + * + * Returns an ERR_PTR() on failure. 
+ */ +void *memdup_user(const void __user *src, size_t len) +{ + void *p; + + /* + * Always use GFP_KERNEL, since copy_from_user() can sleep and + * cause pagefault, which makes it pointless to use GFP_NOFS + * or GFP_ATOMIC. + */ + p = kmalloc_track_caller(len, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + return p; +} +EXPORT_SYMBOL(memdup_user); + /** * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. -- cgit v1.2.3-71-gd317 From 6a11f75b6a17b5d9ac5025f8d048382fd1f47377 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:17 -0700 Subject: generic debug pagealloc CONFIG_DEBUG_PAGEALLOC is now supported by x86, powerpc, sparc64, and s390. This patch implements it for the rest of the architectures by filling the pages with poison byte patterns after free_pages() and verifying the poison patterns before alloc_pages(). This generic one cannot detect invalid page accesses immediately but invalid read access may cause invalid dereference by poisoned memory and invalid write access can be detected after a long delay. Signed-off-by: Akinobu Mita Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/avr32/mm/fault.c | 18 ------ arch/powerpc/Kconfig | 3 + arch/powerpc/Kconfig.debug | 1 + arch/s390/Kconfig | 3 + arch/s390/Kconfig.debug | 1 + arch/sparc/Kconfig | 3 + arch/sparc/Kconfig.debug | 3 +- arch/x86/Kconfig | 3 + arch/x86/Kconfig.debug | 1 + include/linux/mm_types.h | 4 ++ include/linux/page-debug-flags.h | 30 +++++++++ include/linux/poison.h | 3 + lib/Kconfig.debug | 1 + mm/Kconfig.debug | 17 ++++++ mm/Makefile | 1 + mm/debug-pagealloc.c | 129 +++++++++++++++++++++++++++++++++++++++ 16 files changed, 202 insertions(+), 19 deletions(-) create mode 100644 include/linux/page-debug-flags.h create mode 100644 mm/Kconfig.debug create mode 100644 mm/debug-pagealloc.c (limited to 'include/linux') diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index ce4e4296b954..62d4abbaa654 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -250,21 +250,3 @@ asmlinkage void do_bus_error(unsigned long addr, int write_access, dump_dtlb(); die("Bus Error", regs, SIGKILL); } - -/* - * This functionality is currently not possible to implement because - * we're using segmentation to ensure a fixed mapping of the kernel - * virtual address space. - * - * It would be possible to implement this, but it would require us to - * disable segmentation at startup and load the kernel mappings into - * the TLB like any other pages. There will be lots of trickery to - * avoid recursive invocation of the TLB miss handler, though... 
- */ -#ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) -{ - -} -EXPORT_SYMBOL(kernel_map_pages); -#endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ad6b1c084fe3..45192dce65c4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -228,6 +228,9 @@ config PPC_OF_PLATFORM_PCI depends on PPC64 # not supported on 32 bits yet default n +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 22091bbfdc9b..6aa0b5e087cd 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -30,6 +30,7 @@ config DEBUG_STACK_USAGE config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL && !HIBERNATION + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2a8af5e16345..dcb667c4375a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -72,6 +72,9 @@ config PGSTE config VIRT_CPU_ACCOUNTING def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index 4599fa06bd82..7e297a3cde34 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -9,6 +9,7 @@ source "lib/Kconfig.debug" config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC help Unmap pages from the kernel linear mapping after free_pages(). This results in a slowdown, but helps to find certain types of diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c3ea215334f6..cc12cd48bbc5 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -124,6 +124,9 @@ config ARCH_NO_VIRT_TO_BUS config OF def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y if SPARC64 + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/sparc/Kconfig.debug b/arch/sparc/Kconfig.debug index b8a15e271bfa..d001b42041a5 100644 --- a/arch/sparc/Kconfig.debug +++ b/arch/sparc/Kconfig.debug @@ -24,7 +24,8 @@ config STACK_DEBUG config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on SPARC64 && DEBUG_KERNEL && !HIBERNATION + depends on DEBUG_KERNEL && !HIBERNATION + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 45161b816313..748e50a1a152 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -165,6 +165,9 @@ config AUDIT_ARCH config ARCH_SUPPORTS_OPTIMIZED_INLINING def_bool y +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + def_bool y + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fdb45df608b6..a345cb5447a8 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -75,6 +75,7 @@ config DEBUG_STACK_USAGE config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL + depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). 
This results in a large slowdown, but helps to find certain types diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d84feb7bdbf0..ddadb4defe00 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -174,6 +175,9 @@ struct vm_area_struct { #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif +#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS + unsigned long debug_flags; /* Use atomic bitops on this */ +#endif }; struct core_thread { diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h new file mode 100644 index 000000000000..b0638fd91e92 --- /dev/null +++ b/include/linux/page-debug-flags.h @@ -0,0 +1,30 @@ +#ifndef LINUX_PAGE_DEBUG_FLAGS_H +#define LINUX_PAGE_DEBUG_FLAGS_H + +/* + * page->debug_flags bits: + * + * PAGE_DEBUG_FLAG_POISON is set for poisoned pages. This is used to + * implement generic debug pagealloc feature. The pages are filled with + * poison patterns and set this flag after free_pages(). The poisoned + * pages are verified whether the patterns are not corrupted and clear + * the flag before alloc_pages(). + */ + +enum page_debug_flags { + PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ +}; + +/* + * Ensure that CONFIG_WANT_PAGE_DEBUG_FLAGS reliably + * gets turned off when no debug features are enabling it! + */ + +#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS +#if !defined(CONFIG_PAGE_POISONING) \ +/* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ +#error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! +#endif +#endif /* CONFIG_WANT_PAGE_DEBUG_FLAGS */ + +#endif /* LINUX_PAGE_DEBUG_FLAGS_H */ diff --git a/include/linux/poison.h b/include/linux/poison.h index 9f31683728fd..6729f7dcd60e 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -17,6 +17,9 @@ */ #define TIMER_ENTRY_STATIC ((void *) 0x74737461) +/********** mm/debug-pagealloc.c **********/ +#define PAGE_POISON 0xaa + /********** mm/slab.c **********/ /* * Magic nums for obj red zoning. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 58bfe7e8faba..9638d99644af 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -796,6 +796,7 @@ config SYSCTL_SYSCALL_CHECK to properly maintain and use. This enables checks that help you to keep things correct. +source mm/Kconfig.debug source kernel/trace/Kconfig config PROVIDE_OHCI1394_DMA_INIT diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug new file mode 100644 index 000000000000..c8d62d49a44e --- /dev/null +++ b/mm/Kconfig.debug @@ -0,0 +1,17 @@ +config WANT_PAGE_DEBUG_FLAGS + bool + +config PAGE_POISONING + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION + select DEBUG_PAGEALLOC + select WANT_PAGE_DEBUG_FLAGS + help + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). This results in a large slowdown, + but helps to find certain types of memory corruptions. + + This option cannot enalbe with hibernation. Otherwise, it will get + wrong messages for memory corruption because the free pages are not + saved to the suspend image. 
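For illustration only, not part of the patch: the checker added in mm/debug-pagealloc.c below fills freed pages with 0xaa and, on the next allocation, reports any byte that changed, using the power-of-two test on the XOR to tell a lone bit flip from wider corruption. A standalone userspace demo of the same idea:

#include <stdio.h>
#include <string.h>

#define PAGE_POISON	0xaa
#define DEMO_PAGE_SIZE	4096

static int single_bit_flip(unsigned char a, unsigned char b)
{
	unsigned char error = a ^ b;

	/* non-zero and a power of two: exactly one bit differs */
	return error && !(error & (error - 1));
}

int main(void)
{
	static unsigned char page[DEMO_PAGE_SIZE];
	size_t i;

	memset(page, PAGE_POISON, sizeof(page));	/* "free": poison the page */
	page[100] ^= 0x08;				/* simulate a stray single-bit write */

	for (i = 0; i < sizeof(page); i++) {		/* "alloc": verify the poison */
		if (page[i] == PAGE_POISON)
			continue;
		if (single_bit_flip(page[i], PAGE_POISON))
			printf("pagealloc demo: single bit error at byte %zu\n", i);
		else
			printf("pagealloc demo: memory corruption at byte %zu\n", i);
	}
	return 0;
}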
diff --git a/mm/Makefile b/mm/Makefile index 818569b68f46..ec73c68b6015 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_FAILSLAB) += failslab.o diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c new file mode 100644 index 000000000000..a1e3324de2b5 --- /dev/null +++ b/mm/debug-pagealloc.c @@ -0,0 +1,129 @@ +#include +#include +#include +#include + +static inline void set_page_poison(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline void clear_page_poison(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline bool page_poison(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static void poison_highpage(struct page *page) +{ + /* + * Page poisoning for highmem pages is not implemented. + * + * This can be called from interrupt contexts. + * So we need to create a new kmap_atomic slot for this + * application and it will need interrupt protection. + */ +} + +static void poison_page(struct page *page) +{ + void *addr; + + if (PageHighMem(page)) { + poison_highpage(page); + return; + } + set_page_poison(page); + addr = page_address(page); + memset(addr, PAGE_POISON, PAGE_SIZE); +} + +static void poison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + poison_page(page + i); +} + +static bool single_bit_flip(unsigned char a, unsigned char b) +{ + unsigned char error = a ^ b; + + return error && !(error & (error - 1)); +} + +static void check_poison_mem(unsigned char *mem, size_t bytes) +{ + unsigned char *start; + unsigned char *end; + + for (start = mem; start < mem + bytes; start++) { + if (*start != PAGE_POISON) + break; + } + if (start == mem + bytes) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!printk_ratelimit()) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + printk(KERN_ERR "pagealloc: single bit error\n"); + else + printk(KERN_ERR "pagealloc: memory corruption\n"); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + dump_stack(); +} + +static void unpoison_highpage(struct page *page) +{ + /* + * See comment in poison_highpage(). 
+ * Highmem pages should not be poisoned for now + */ + BUG_ON(page_poison(page)); +} + +static void unpoison_page(struct page *page) +{ + if (PageHighMem(page)) { + unpoison_highpage(page); + return; + } + if (page_poison(page)) { + void *addr = page_address(page); + + check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + } +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} -- cgit v1.2.3-71-gd317 From 704503d836042d4a4c7685b7036e7de0418fbc0f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Mar 2009 15:23:18 -0700 Subject: mm: fix proc_dointvec_userhz_jiffies "breakage" Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9838 On i386, HZ=1000, jiffies_to_clock_t() converts time in a somewhat strange way from the user's point of view: # echo 500 >/proc/sys/vm/dirty_writeback_centisecs # cat /proc/sys/vm/dirty_writeback_centisecs 499 So, we have 5000 jiffies converted to only 499 clock ticks and reported back. TICK_NSEC = 999848 ACTHZ = 256039 Keeping in-kernel variable in units passed from userspace will fix issue of course, but this probably won't be right for every sysctl. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Peter Zijlstra Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 4 ++-- kernel/sysctl.c | 2 +- mm/page-writeback.c | 20 +++++++++++--------- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 7300ecdc480c..93445477f86a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -109,8 +109,8 @@ extern int dirty_background_ratio; extern unsigned long dirty_background_bytes; extern int vm_dirty_ratio; extern unsigned long vm_dirty_bytes; -extern int dirty_writeback_interval; -extern int dirty_expire_interval; +extern unsigned int dirty_writeback_interval; +extern unsigned int dirty_expire_interval; extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c5ef44ff850f..2e490a389dd2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1010,7 +1010,7 @@ static struct ctl_table vm_table[] = { .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = &proc_dointvec_userhz_jiffies, + .proc_handler = &proc_dointvec, }, { .ctl_name = VM_NR_PDFLUSH_THREADS, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6aa92b03c747..30351f0063ac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -92,14 +92,14 @@ int vm_dirty_ratio = 20; unsigned long vm_dirty_bytes; /* - * The interval between `kupdate'-style writebacks, in jiffies + * The interval between `kupdate'-style writebacks */ -int dirty_writeback_interval = 5 * HZ; +unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ /* - * The longest number of jiffies for which data is allowed to remain dirty + * The longest time for which data is allowed to remain dirty */ -int dirty_expire_interval = 30 * HZ; +unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ /* * Flag that makes the machine dump writes/reads and block dirtyings. 
@@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - oldest_jif = jiffies - dirty_expire_interval; + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); start_jif = jiffies; - next_jif = start_jif + dirty_writeback_interval; + next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); nr_to_write = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); @@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, file, buffer, length, ppos); if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, jiffies + + msecs_to_jiffies(dirty_writeback_interval * 10)); else del_timer(&wb_timer); return 0; @@ -905,7 +906,8 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, + jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); -- cgit v1.2.3-71-gd317 From c2fdf3a9b2d52842808a8e551b53b55dd9b45030 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 31 Mar 2009 15:23:19 -0700 Subject: mm: enable hashdist by default on 64bit NUMA On PowerPC we allocate large boot time hashes on node 0. This leads to an imbalance in the free memory, for example on a 64GB box (4 x 16GB nodes): Free memory: Node 0: 97.03% Node 1: 98.54% Node 2: 98.42% Node 3: 98.53% If we switch to using vmalloc (like ia64 and x86-64) things are more balanced: Free memory: Node 0: 97.53% Node 1: 98.35% Node 2: 98.33% Node 3: 98.33% For many HPC applications we are limited by the free available memory on the smallest node, so even though the same amount of memory is used the better balancing helps. Since all 64bit NUMA capable architectures should have sufficient vmalloc space, it makes sense to enable it via CONFIG_64BIT. Signed-off-by: Anton Blanchard Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Ivan Kokshaysky Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 455d83219fae..bc3ab7073695 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -146,10 +146,10 @@ extern void *alloc_large_system_hash(const char *tablename, #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ -/* Only NUMA needs hash distribution. - * IA64 and x86_64 have sufficient vmalloc space. +/* Only NUMA needs hash distribution. 64bit NUMA architectures have + * sufficient vmalloc space. */ -#if defined(CONFIG_NUMA) && (defined(CONFIG_IA64) || defined(CONFIG_X86_64)) +#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) #define HASHDIST_DEFAULT 1 #else #define HASHDIST_DEFAULT 0 -- cgit v1.2.3-71-gd317 From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:21 -0700 Subject: mm: page_mkwrite change prototype to match fault Change the page_mkwrite prototype to take a struct vm_fault, and return VM_FAULT_xxx flags. 
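For illustration only, a converted handler ends up shaped roughly like the hedged sketch below; the myfs_* names are invented and not part of this patch, but the shape matches the conversions in this series: the page arrives via vmf->page and failures are reported as VM_FAULT_* codes instead of negative errnos.

#include <linux/mm.h>
#include <linux/pagemap.h>

static int myfs_reserve_blocks(struct page *page);	/* assumed helper, not from this patch */

static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	int err;

	lock_page(page);
	err = myfs_reserve_blocks(page);
	unlock_page(page);

	/* report errors as VM_FAULT_* rather than -errno */
	return err ? VM_FAULT_SIGBUS : 0;
}
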
There should be no functional change. This makes it possible to return much more detailed error information to the VM (and also can provide more information eg. virtual_address to the driver, which might be important in some special cases). This is required for a subsequent fix. And will also make it easier to merge page_mkwrite() with fault() in future. Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Trond Myklebust Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Mark Fasheh Cc: Joel Becker Cc: Artem Bityutskiy Cc: Felix Blyakher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 +- drivers/video/fb_defio.c | 3 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 5 ++++- fs/buffer.c | 6 +++++- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 5 ++++- fs/fuse/file.c | 3 ++- fs/gfs2/ops_file.c | 5 ++++- fs/nfs/file.c | 5 ++++- fs/ocfs2/mmap.c | 6 ++++-- fs/ubifs/file.c | 9 ++++++--- fs/xfs/linux-2.6/xfs_file.c | 4 ++-- include/linux/buffer_head.h | 2 +- include/linux/mm.h | 3 ++- mm/memory.c | 26 ++++++++++++++++++++++---- 16 files changed, 65 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4e78ce677843..76efe5b71d7d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -505,7 +505,7 @@ prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); - int (*page_mkwrite)(struct vm_area_struct *, struct page *); + int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 082026546aee..0a7a6679ee6e 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -85,8 +85,9 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* vm_ops->page_mkwrite handler */ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { + struct page *page = vmf->page; struct fb_info *info = vma->vm_private_data; struct fb_deferred_io *fbdefio = info->fbdefio; struct page *cur; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e1d4e30e9d8..7dd1b6d0bf32 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2060,7 +2060,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..ec5423790bbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4292,8 +4292,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. 
*/ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = fdentry(vma->vm_file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -4362,6 +4363,8 @@ again: out_unlock: unlock_page(page); out: + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 73abe6d8218c..6d51a3da362c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2313,9 +2313,10 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) * unlock the page. */ int -block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; @@ -2340,6 +2341,9 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page, ret = block_commit_write(page, 0, end); out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; + unlock_page(page); return ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6083bb38057b..990c94000924 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1098,7 +1098,7 @@ extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); /* ioctl.c */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71d3ecd5db79..dd82ff390067 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5146,8 +5146,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; loff_t size; unsigned long len; int ret = -EINVAL; @@ -5199,6 +5200,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out_unlock; ret = 0; out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; up_read(&inode->i_alloc_sem); return ret; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 821d10f719bd..4e340fedf768 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; /* * Don't use page->mapping as it may become NULL from a * concurrent truncate. diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3b9e8de3500b..70b9b8548945 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. 
*/ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -412,6 +413,8 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d2..cec79392e4ba 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -451,8 +451,9 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, }; -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; @@ -483,6 +484,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = pagelen; out_unlock: unlock_page(page); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index eea1d24713ea..b606496b72ec 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -154,8 +154,9 @@ out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct buffer_head *di_bh = NULL; sigset_t blocked, oldset; @@ -196,7 +197,8 @@ out: ret2 = ocfs2_vm_op_unblock_sigs(&oldset); if (ret2 < 0) mlog_errno(ret2); - + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 93b6de51f261..0ff89fe71e51 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made * writable. UBIFS must ensure page is budgeted for. 
*/ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); @@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); if (unlikely(c->ro_media)) - return -EROFS; + return VM_FAULT_SIGBUS; /* -EROFS */ /* * We have not locked @page so far so we may budget for changing the @@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (err == -ENOSPC) ubifs_warn("out of space for mmapped file " "(inode number %lu)", inode->i_ino); - return err; + return VM_FAULT_SIGBUS; } lock_page(page); @@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) out_unlock: unlock_page(page); ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; return err; } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e14c4e3aea0c..f4e255441574 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -234,9 +234,9 @@ xfs_file_mmap( STATIC int xfs_vm_page_mkwrite( struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { - return block_page_mkwrite(vma, page, xfs_get_blocks); + return block_page_mkwrite(vma, vmf, xfs_get_blocks); } const struct file_operations xfs_file_operations = { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..3d7bcde2e332 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -216,7 +216,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); -int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2223f8dfa568..aeabe953ba4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -135,6 +135,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is @@ -187,7 +188,7 @@ struct vm_operations_struct { /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); + int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware diff --git a/mm/memory.c b/mm/memory.c index 5b4ad5e4f98d..cf6873e91c6a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1945,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). 
*/ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + struct vm_fault vmf; + int tmp; + + vmf.virtual_address = (void __user *)(address & + PAGE_MASK); + vmf.pgoff = old_page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = old_page; + /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait @@ -1956,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; goto unwritable_page; + } /* * Since we dropped the lock we need to revalidate @@ -2106,7 +2119,7 @@ oom: unwritable_page: page_cache_release(old_page); - return VM_FAULT_SIGBUS; + return ret; } /* @@ -2648,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * to become writable */ if (vma->vm_ops->page_mkwrite) { + int tmp; + unlock_page(page); - if (vma->vm_ops->page_mkwrite(vma, page) < 0) { - ret = VM_FAULT_SIGBUS; + vmf.flags |= FAULT_FLAG_MKWRITE; + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; anon = 1; /* no anon but release vmf.page */ goto out_unlocked; } -- cgit v1.2.3-71-gd317 From f4112de6b679d84bd9b9681c7504be7bdfb7c7d5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:25 -0700 Subject: mm: introduce debug_kmap_atomic x86 has debug_kmap_atomic_prot() which is error checking function for kmap_atomic. It is usefull for the other architectures, although it needs CONFIG_TRACE_IRQFLAGS_SUPPORT. This patch exposes it to the other architectures. Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/highmem_32.c | 45 +-------------------------------------------- include/linux/highmem.h | 12 ++++++++++++ mm/highmem.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 522db5e3d0bf..8126e8d1a2a4 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -19,49 +19,6 @@ void kunmap(struct page *page) kunmap_high(page); } -static void debug_kmap_atomic_prot(enum km_type type) -{ -#ifdef CONFIG_DEBUG_HIGHMEM - static unsigned warn_count = 10; - - if (unlikely(warn_count == 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#endif -} - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -81,7 +38,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic_prot(type); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 13875ce9112a..7ff5c55f9b55 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -187,4 +187,16 @@ static inline void copy_highpage(struct page *to, struct page *from) kunmap_atomic(vto, KM_USER1); } +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type); + +#else + +static inline void debug_kmap_atomic(enum km_type type) +{ +} + +#endif + #endif /* _LINUX_HIGHMEM_H */ diff --git a/mm/highmem.c b/mm/highmem.c index 910198037bf5..68eb1d9b63fa 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -422,3 +422,48 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ + +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type) +{ + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == 
KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +} + +#endif -- cgit v1.2.3-71-gd317 From 33925b25d2c00a29664f1994ab350a9bff70f7a2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 31 Mar 2009 15:23:26 -0700 Subject: nommu: there is no mlock() for NOMMU, so don't provide the bits The mlock() facility does not exist for NOMMU since all mappings are effectively locked anyway, so we don't make the bits available when they're not useful. Signed-off-by: David Howells Reviewed-by: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Greg Ungerer Cc: Johannes Weiner Cc: Rik van Riel Cc: Lee Schermerhorn Cc: Enrik Berkhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 20 +++++++++++++------- mm/Kconfig | 8 ++++++++ mm/internal.h | 8 +++++--- 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 219a523ecdb0..61df1779b2a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -96,6 +96,8 @@ enum pageflags { PG_swapbacked, /* Page is backed by RAM/swap */ #ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ +#endif +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR @@ -234,20 +236,20 @@ PAGEFLAG_FALSE(SwapCache) #ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) +#else +PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) + SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) + __CLEARPAGEFLAG_NOOP(Unevictable) +#endif +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define MLOCK_PAGES 1 PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) TESTSCFLAG(Mlocked, mlocked) - #else - #define MLOCK_PAGES 0 PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) - -PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) - SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) - __CLEARPAGEFLAG_NOOP(Unevictable) #endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR @@ -367,9 +369,13 @@ static inline void __ClearPageTail(struct page *page) #ifdef CONFIG_UNEVICTABLE_LRU #define __PG_UNEVICTABLE (1 << PG_unevictable) -#define __PG_MLOCKED (1 << PG_mlocked) #else #define __PG_UNEVICTABLE 0 +#endif + +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +#define __PG_MLOCKED (1 << PG_mlocked) +#else #define __PG_MLOCKED 0 #endif diff --git a/mm/Kconfig b/mm/Kconfig index a5b77811fdf2..8c895973dfba 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -214,5 +214,13 @@ config UNEVICTABLE_LRU will use one page flag and increase the code size a little, say Y unless you know what you are doing. 
+config HAVE_MLOCK + bool + default y if MMU=y + +config HAVE_MLOCKED_PAGE_BIT + bool + default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index 478223b73a2a..987bb03fbdd8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +#ifdef CONFIG_HAVE_MLOCK extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void munlock_vma_pages_range(struct vm_area_struct *vma, @@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } +#endif #ifdef CONFIG_UNEVICTABLE_LRU /* @@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * Called only in fault path via page_evictable() for a new page * to determine if it's being mapped into a LOCKED vma. @@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page) } } -#else /* CONFIG_UNEVICTABLE_LRU */ +#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { return 0; @@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline void free_page_mlock(struct page *page) { } -#endif /* CONFIG_UNEVICTABLE_LRU */ +#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ /* * Return the mem_map entry representing the 'offset' subpage within -- cgit v1.2.3-71-gd317 From 327c0e968645f2601a43f5ea7c19c7b3a5fa0a34 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 31 Mar 2009 15:23:31 -0700 Subject: vmscan: fix it to take care of nodemask try_to_free_pages() is used for the direct reclaim of up to SWAP_CLUSTER_MAX pages when watermarks are low. The caller to alloc_pages_nodemask() can specify a nodemask of nodes that are allowed to be used but this is not passed to try_to_free_pages(). This can lead to unnecessary reclaim of pages that are unusable by the caller and int the worst case lead to allocation failure as progress was not been make where it is needed. This patch passes the nodemask used for alloc_pages_nodemask() to try_to_free_pages(). 
Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- include/linux/swap.h | 2 +- mm/page_alloc.c | 3 ++- mm/vmscan.c | 13 +++++++++++-- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 0c14f8d52ee5..c77b848c3d43 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -290,7 +290,7 @@ static void free_more_memory(void) &zone); if (zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS); + GFP_NOFS, NULL); } } diff --git a/include/linux/swap.h b/include/linux/swap.h index d30215578877..b8b0c4ce83e6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -212,7 +212,7 @@ static inline void lru_cache_add_active_file(struct page *page) /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask); + gfp_t gfp_mask, nodemask_t *mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, unsigned int swappiness); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbd532161f68..0284e528748d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1582,7 +1582,8 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); + did_some_progress = try_to_free_pages(zonelist, order, + gfp_mask, nodemask); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); diff --git a/mm/vmscan.c b/mm/vmscan.c index f4619c6cd59e..06e72693b458 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -78,6 +78,12 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; + /* Pluggable isolate pages callback */ unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, @@ -1538,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; sc->all_unreclaimable = 1; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1683,7 +1690,7 @@ out: } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, nodemask_t *nodemask) { struct scan_control sc = { .gfp_mask = gfp_mask, @@ -1694,6 +1701,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, + .nodemask = nodemask, }; return do_try_to_free_pages(zonelist, &sc); @@ -1714,6 +1722,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, + .nodemask = NULL, /* we don't care the placement */ }; struct zonelist *zonelist; -- cgit v1.2.3-71-gd317 From 9fab5619bdd7f84cdd22cc760778f759f9819a33 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 31 Mar 2009 15:23:33 -0700 Subject: shmem: writepage directly to swap Synopsis: if shmem_writepage calls swap_writepage directly, most shmem swap loads benefit, and a catastrophic interaction between SLUB and some flash storage is avoided. 
shmem_writepage() has always been peculiar in making no attempt to write: it has just transferred a shmem page from file cache to swap cache, then let that page make its way around the LRU again before being written and freed. The idea was that people use tmpfs because they want those pages to stay in RAM; so although we give it an overflow to swap, we should resist writing too soon, giving those pages a second chance before they can be reclaimed. That was always questionable, and I've toyed with this patch for years; but never had a clear justification to depart from the original design. It became more questionable in 2.6.28, when the split LRU patches classed shmem and tmpfs pages as SwapBacked rather than as file_cache: that in itself gives them more resistance to reclaim than normal file pages. I prepared this patch for 2.6.29, but the merge window arrived before I'd completed gathering statistics to justify sending it in. Then while comparing SLQB against SLUB, running SLUB on a laptop I'd habitually used with SLAB, I found SLUB to run my tmpfs kbuild swapping tests five times slower than SLAB or SLQB - other machines slower too, but nowhere near so bad. Simpler "cp -a" swapping tests showed the same. slub_max_order=0 brings sanity to all, but heavy swapping is too far from normal to justify such a tuning. The crucial factor on that laptop turns out to be that I'm using an SD card for swap. What happens is this: By default, SLUB uses order-2 pages for shmem_inode_cache (and many other fs inodes), so creating tmpfs files under memory pressure brings lumpy reclaim into play. One subpage of the order is chosen from the bottom of the LRU as usual, then the other three picked out from their random positions on the LRUs. In a tmpfs load, many of these pages will be ones which already passed through shmem_writepage, so already have swap allocated. And though their offsets on swap were probably allocated sequentially, now that the pages are picked off at random, their swap offsets are scattered. But the flash storage on the SD card is very sensitive to having its writes merged: once swap is written at scattered offsets, performance falls apart. Rotating disk seeks increase too, but less disastrously. So: stop giving shmem/tmpfs pages a second pass around the LRU, write them out to swap as soon as their swap has been allocated. It's surely possible to devise an artificial load which runs faster the old way, one whose sizing is such that the tmpfs pages on their second pass are the ones that are wanted again, and other pages not. But I've not yet found such a load: on all machines, under the loads I've tried, immediate swap_writepage speeds up shmem swapping: especially when using the SLUB allocator (and more effectively than slub_max_order=0), but also with the others; and it also reduces the variance between runs. How much faster varies widely: a factor of five is rare, 5% is common. One load which might have suffered: imagine a swapping shmem load in a limited mem_cgroup on a machine with plenty of memory. Before 2.6.29 the swapcache was not charged, and such a load would have run quickest with the shmem swapcache never written to swap. But now swapcache is charged, so even this load benefits from shmem_writepage directly to swap. Apologies for the #ifndef CONFIG_SWAP swap_writepage() stub in swap.h: it's silly because that will never get called; but refactoring shmem.c sensibly according to CONFIG_SWAP will be a separate task. 
Signed-off-by: Hugh Dickins Acked-by: Pekka Enberg Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++++ mm/shmem.c | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index b8b0c4ce83e6..62d81435347a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -382,6 +382,11 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } +static inline int swap_writepage(struct page *p, struct writeback_control *wbc) +{ + return 0; +} + static inline struct page *lookup_swap_cache(swp_entry_t swp) { return NULL; diff --git a/mm/shmem.c b/mm/shmem.c index 7ec78e24a30d..d94d2e9146bc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1068,8 +1068,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swap_duplicate(swap); BUG_ON(page_mapped(page)); page_cache_release(page); /* pagecache ref */ - set_page_dirty(page); - unlock_page(page); + swap_writepage(page, wbc); if (inode) { mutex_lock(&shmem_swaplist_mutex); /* move instead of add in case we're racing */ -- cgit v1.2.3-71-gd317 From a8af78982ff4c0b3731527b0217d286a343a3089 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 31 Mar 2009 15:23:37 -0700 Subject: pm: rework includes, remove arch ifdefs Make the following header file changes: - remove arch ifdefs and asm/suspend.h from linux/suspend.h - add asm/suspend.h to disk.c (for arch_prepare_suspend()) - add linux/io.h to swsusp.c (for ioremap()) - x86 32/64 bit compile fixes Signed-off-by: Magnus Damm Cc: Paul Mundt Acked-by: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/asm-offsets_32.c | 1 + arch/x86/kernel/asm-offsets_64.c | 1 + arch/x86/power/cpu_32.c | 1 + arch/x86/power/cpu_64.c | 1 + arch/x86/power/hibernate_64.c | 1 + include/linux/suspend.h | 3 --- kernel/power/disk.c | 1 + kernel/power/swsusp.c | 1 + 8 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index fbf2f33e3080..5a6aa1c1162f 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -18,6 +18,7 @@ #include #include #include +#include #include diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 8793ab33e2c1..e72f062fb4b5 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 274d06082f48..ce702c5b3a2c 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -12,6 +12,7 @@ #include #include #include +#include static struct saved_context saved_context; diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index e3b6cf70d62c..5343540f2607 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -15,6 +15,7 @@ #include #include #include +#include static void fix_processor_context(void); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 6dd000dd7933..65fdc86e923f 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -14,6 +14,7 @@ #include #include #include +#include /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 
c7d9bb1832ba..3e3a4364cbff 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -1,9 +1,6 @@ #ifndef _LINUX_SUSPEND_H #define _LINUX_SUSPEND_H -#if defined(CONFIG_X86) || defined(CONFIG_FRV) || defined(CONFIG_PPC32) || defined(CONFIG_PPC64) -#include -#endif #include #include #include diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e886d1332a10..f3db382c2b2d 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "power.h" diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 1ee6636414b2..78c35047586d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "power.h" -- cgit v1.2.3-71-gd317 From 53d6660836f233df66490707365ab177e5fb2bb4 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Tue, 31 Mar 2009 15:23:43 -0700 Subject: loop: add ioctl to resize a loop device Add the ability to 'resize' the loop device on the fly. One practical application is a loop file with XFS filesystem, already mounted: You can easily enlarge the file (append some bytes) and then call ioctl(fd, LOOP_SET_CAPACITY, new); The loop driver will learn about the new size and you can use xfs_growfs later on, which will allow you to use full capacity of the loop file without the need to unmount. Test app: #include #include #include #include #include #include #include #include #include #include #include #define _GNU_SOURCE #include char *me; void usage(FILE *f) { fprintf(f, "%s [options] loop_dev [backend_file]\n" "-s, --set new_size_in_bytes\n" "\twhen backend_file is given, " "it will be expanded too while keeping the original contents\n", me); } struct option opts[] = { { .name = "set", .has_arg = 1, .flag = NULL, .val = 's' }, { .name = "help", .has_arg = 0, .flag = NULL, .val = 'h' } }; void err_size(char *name, __u64 old) { fprintf(stderr, "size must be larger than current %s (%llu)\n", name, old); } int main(int argc, char *argv[]) { int fd, err, c, i, bfd; ssize_t ssz; size_t sz; __u64 old, new, append; char a[BUFSIZ]; struct stat st; FILE *out; char *backend, *dev; err = EINVAL; out = stderr; me = argv[0]; new = 0; while ((c = getopt_long(argc, argv, "s:h", opts, &i)) != -1) { switch (c) { case 's': errno = 0; new = strtoull(optarg, NULL, 0); if (errno) { err = errno; perror(argv[i]); goto out; } break; case 'h': err = 0; out = stdout; goto err; default: perror(argv[i]); goto err; } } if (optind < argc) dev = argv[optind++]; else goto err; fd = open(dev, O_RDONLY); if (fd < 0) { err = errno; perror(dev); goto out; } err = ioctl(fd, BLKGETSIZE64, &old); if (err) { err = errno; perror("ioctl BLKGETSIZE64"); goto out; } if (!new) { printf("%llu\n", old); goto out; } if (new < old) { err = EINVAL; err_size(dev, old); goto out; } if (optind < argc) { backend = argv[optind++]; bfd = open(backend, O_WRONLY|O_APPEND); if (bfd < 0) { err = errno; perror(backend); goto out; } err = fstat(bfd, &st); if (err) { err = errno; perror(backend); goto out; } if (new < st.st_size) { err = EINVAL; err_size(backend, st.st_size); goto out; } append = new - st.st_size; sz = sizeof(a); while (append > 0) { if (append < sz) sz = append; ssz = write(bfd, a, sz); if (ssz != sz) { err = errno; perror(backend); goto out; } append -= sz; } err = fsync(bfd); if (err) { err = errno; perror(backend); goto out; } } err = ioctl(fd, LOOP_SET_CAPACITY, new); if (err) { err = errno; perror("ioctl LOOP_SET_CAPACITY"); } goto out; err: usage(out); out: return err; } 
Signed-off-by: J. R. Okajima Signed-off-by: Tomas Matejicek Cc: Cc: Karel Zak Cc: Jens Axboe Cc: Al Viro Cc: Christoph Hellwig Cc: Akinobu Mita Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 30 ++++++++++++++++++++++++++++++ include/linux/loop.h | 1 + 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2621ed2ce6d2..40b17d3b55a1 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1192,6 +1192,30 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { return err; } +static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) +{ + int err; + sector_t sec; + loff_t sz; + + err = -ENXIO; + if (unlikely(lo->lo_state != Lo_bound)) + goto out; + err = figure_loop_size(lo); + if (unlikely(err)) + goto out; + sec = get_capacity(lo->lo_disk); + /* the width of sector_t may be narrow for bit-shift */ + sz = sec; + sz <<= 9; + mutex_lock(&bdev->bd_mutex); + bd_set_size(bdev, sz); + mutex_unlock(&bdev->bd_mutex); + + out: + return err; +} + static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1224,6 +1248,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_GET_STATUS64: err = loop_get_status64(lo, (struct loop_info64 __user *) arg); break; + case LOOP_SET_CAPACITY: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_capacity(lo, bdev); + break; default: err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; } @@ -1371,6 +1400,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, lo, (struct compat_loop_info __user *) arg); mutex_unlock(&lo->lo_ctl_mutex); break; + case LOOP_SET_CAPACITY: case LOOP_CLR_FD: case LOOP_GET_STATUS64: case LOOP_SET_STATUS64: diff --git a/include/linux/loop.h b/include/linux/loop.h index 6ffd6db5bb0d..40725447f5e0 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -160,5 +160,6 @@ int loop_unregister_transfer(int number); #define LOOP_SET_STATUS64 0x4C04 #define LOOP_GET_STATUS64 0x4C05 #define LOOP_CHANGE_FD 0x4C06 +#define LOOP_SET_CAPACITY 0x4C07 #endif -- cgit v1.2.3-71-gd317 From c2d7543851849a6923680cdd7e1047ed1a84a1c5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 31 Mar 2009 15:23:46 -0700 Subject: filesystem freeze: allow SysRq emergency thaw to thaw frozen filesystems Now that the filesystem freeze operation has been elevated to the VFS, and is just an ioctl away, some sort of safety net for unintentionally frozen root filesystems may be in order. The timeout thaw originally proposed did not get merged, but perhaps something like this would be useful in emergencies. For example, freeze /path/to/mountpoint may freeze your root filesystem if you forgot that you had that unmounted. I chose 'j' as the last remaining character other than 'h' which is sort of reserved for help (because help is generated on any unknown character). I've tested this on a non-root fs with multiple (nested) freezers, as well as on a system rendered unresponsive due to a frozen root fs. 
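For reference, a minimal userspace sketch of triggering the thaw when the console SysRq combination is not usable; it assumes root privileges and CONFIG_MAGIC_SYSRQ, and only the 'j' key assignment comes from this patch:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* writing 'j' to /proc/sysrq-trigger is equivalent to SysRq-J */
	int fd = open("/proc/sysrq-trigger", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "j", 1) != 1) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
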
[randy.dunlap@oracle.com: emergency thaw only if CONFIG_BLOCK enabled] Signed-off-by: Eric Sandeen Cc: Takashi Sato Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysrq.txt | 5 +++++ drivers/char/sysrq.c | 19 ++++++++++++++++++- fs/buffer.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 4 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 9e592c718afb..afa2946892da 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -81,6 +81,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'i' - Send a SIGKILL to all processes, except for init. +'j' - Forcibly "Just thaw it" - filesystems frozen by the FIFREEZE ioctl. + 'k' - Secure Access Key (SAK) Kills all programs on the current virtual console. NOTE: See important comments below in SAK section. @@ -160,6 +162,9 @@ t'E'rm and k'I'll are useful if you have some sort of runaway process you are unable to kill any other way, especially if it's spawning other processes. +"'J'ust thaw it" is useful if your system becomes unresponsive due to a frozen +(probably root) filesystem via the FIFREEZE ioctl. + * Sometimes SysRq seems to get 'stuck' after using it, what can I do? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ That happens to me, also. I've found that tapping shift, alt, and control diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 33a9351c896d..5afe7316c72e 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -346,6 +346,19 @@ static struct sysrq_key_op sysrq_moom_op = { .enable_mask = SYSRQ_ENABLE_SIGNAL, }; +#ifdef CONFIG_BLOCK +static void sysrq_handle_thaw(int key, struct tty_struct *tty) +{ + emergency_thaw_all(); +} +static struct sysrq_key_op sysrq_thaw_op = { + .handler = sysrq_handle_thaw, + .help_msg = "thaw-filesystems(J)", + .action_msg = "Emergency Thaw of all frozen filesystems", + .enable_mask = SYSRQ_ENABLE_SIGNAL, +}; +#endif + static void sysrq_handle_kill(int key, struct tty_struct *tty) { send_sig_all(SIGKILL); @@ -396,9 +409,13 @@ static struct sysrq_key_op *sysrq_key_table[36] = { &sysrq_moom_op, /* f */ /* g: May be registered by ppc for kgdb */ NULL, /* g */ - NULL, /* h */ + NULL, /* h - reserved for help */ &sysrq_kill_op, /* i */ +#ifdef CONFIG_BLOCK + &sysrq_thaw_op, /* j */ +#else NULL, /* j */ +#endif &sysrq_SAK_op, /* k */ #ifdef CONFIG_SMP &sysrq_showallcpus_op, /* l */ diff --git a/fs/buffer.c b/fs/buffer.c index c77b848c3d43..f5f8b15a6e40 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -547,6 +547,39 @@ repeat: return err; } +void do_thaw_all(unsigned long unused) +{ + struct super_block *sb; + char b[BDEVNAME_SIZE]; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + pdflush_operation(do_thaw_all, 0); +} + /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 
* @mapping: the mapping which wants those buffers written diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd7..61211ad823fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1878,6 +1878,7 @@ extern struct block_device *open_by_devnum(dev_t, fmode_t); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); +extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); extern int fsync_super(struct super_block *); -- cgit v1.2.3-71-gd317 From 311d07611e8b354cc1ee6546e4c574c01111adc8 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 31 Mar 2009 15:23:51 -0700 Subject: introduce pr_cont() macro We cover all log-levels by pr_... macros except KERN_CONT one. Add it for convenience. Signed-off-by: Cyrill Gorcunov Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f81d80f47dcb..e720b0da7751 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -353,6 +353,8 @@ static inline char *pack_hex_byte(char *buf, u8 byte) printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +#define pr_cont(fmt, ...) \ + printk(KERN_CONT fmt, ##__VA_ARGS__) /* If you are writing a driver, please use dev_dbg instead */ #if defined(DEBUG) -- cgit v1.2.3-71-gd317 From bcd0b235bf3808dec5115c381cd55568f63b85f0 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:18 -0700 Subject: eventfd: improve support for semaphore-like behavior People started using eventfd in a semaphore-like way where before they were using pipes. That is, counter-based resource access. Where a "wait()" returns immediately by decrementing the counter by one, if counter is greater than zero. Otherwise will wait. And where a "post(count)" will add count to the counter releasing the appropriate amount of waiters. If eventfd the "post" (write) part is fine, while the "wait" (read) does not dequeue 1, but the whole counter value. The problem with eventfd is that a read() on the fd returns and wipes the whole counter, making the use of it as semaphore a little bit more cumbersome. You can do a read() followed by a write() of COUNTER-1, but IMO it's pretty easy and cheap to make this work w/out extra steps. This patch introduces a new eventfd flag that tells eventfd to only dequeue 1 from the counter, allowing simple read/write to make it behave like a semaphore. 
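A minimal userspace sketch of the intended semaphore-style usage, assuming a libc that exposes the eventfd() wrapper and the EFD_SEMAPHORE flag; error handling is elided, and the full test program is linked next:

#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	uint64_t post = 3, got;
	int efd = eventfd(0, EFD_SEMAPHORE);

	write(efd, &post, sizeof(post));	/* post(3): counter becomes 3 */
	read(efd, &got, sizeof(got));		/* wait(): got == 1, counter now 2 */
	read(efd, &got, sizeof(got));		/* wait(): got == 1, counter now 1 */
	close(efd);
	return 0;
}
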
Simple test here: http://www.xmailserver.org/eventfd-sem.c To be back-compatible with earlier kernels, userspace applications should probe for the availability of this feature via #ifdef EFD_SEMAPHORE fd = eventfd2 (CNT, EFD_SEMAPHORE); if (fd == -1 && errno == EINVAL) #else #endif Signed-off-by: Davide Libenzi Cc: Tested-by: Michael Kerrisk Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventfd.c | 20 +++++++++++--------- include/linux/eventfd.h | 12 +++++++++++- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/eventfd.c b/fs/eventfd.c index 5de2c2db3aa2..91c0829a7035 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -28,6 +28,7 @@ struct eventfd_ctx { * issue a wakeup. */ __u64 count; + unsigned int flags; }; /* @@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, { struct eventfd_ctx *ctx = file->private_data; ssize_t res; - __u64 ucnt; + __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ucnt)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; - ucnt = ctx->count; - if (ucnt > 0) + if (ctx->count > 0) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (res = 0;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - ucnt = ctx->count; res = sizeof(ucnt); break; } @@ -117,8 +116,9 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (res > 0) { - ctx->count = 0; + if (likely(res > 0)) { + ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= ucnt; if (waitqueue_active(&ctx->wqh)) wake_up_locked(&ctx->wqh); } @@ -166,7 +166,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (res > 0) { + if (likely(res > 0)) { ctx->count += ucnt; if (waitqueue_active(&ctx->wqh)) wake_up_locked(&ctx->wqh); @@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); - if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) + if (flags & ~EFD_FLAGS_SET) return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); @@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; + ctx->flags = flags; /* * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + flags & EFD_SHARED_FCNTL_FLAGS); if (fd < 0) kfree(ctx); return fd; @@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count) { return sys_eventfd2(count, 0); } + diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index a667637b54e3..f45a8ae5f828 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -13,10 +13,20 @@ /* For O_CLOEXEC and O_NONBLOCK */ #include -/* Flags for eventfd2. */ +/* + * CAREFUL: Check include/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from eventfd, in order to leave a free define-space for + * shared O_* flags. 
+ */ +#define EFD_SEMAPHORE (1 << 0) #define EFD_CLOEXEC O_CLOEXEC #define EFD_NONBLOCK O_NONBLOCK +#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) +#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE) + struct file *eventfd_fget(int fd); int eventfd_signal(struct file *file, int n); -- cgit v1.2.3-71-gd317 From 4ede816ac36e027db5fe0051ad9c73f76db63772 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:20 -0700 Subject: epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key() This patchset introduces wakeup hints for some of the most popular (from epoll POV) devices, so that epoll code can avoid spurious wakeups on its waiters. The problem with epoll is that the callback-based wakeups do not, ATM, carry any information about the events the wakeup is related to. So the only choice epoll has (not being able to call f_op->poll() from inside the callback), is to add the file* to a ready-list and resolve the real events later on, at epoll_wait() (or its own f_op->poll()) time. This can cause spurious wakeups, since the wake_up() itself might be for an event the caller is not interested into. The rate of these spurious wakeup can be pretty high in case of many network sockets being monitored. By allowing devices to report the events the wakeups refer to (at least the two major classes - POLLIN/POLLOUT), we are able to spare useless wakeups by proper handling inside the epoll's poll callback. Epoll will have in any case to call f_op->poll() on the file* later on, since the change to be done in order to have the full event set sent via wakeup, is too invasive for the way our f_op->poll() system works (the full event set is calculated inside the poll function - there are too many of them to even start thinking the change - also poll/select would need change too). Epoll is changed in a way that both devices which send event hints, and the ones that don't, are correctly handled. The former will gain some efficiency though. As a general rule for devices, would be to add an event mask by using key-aware wakeup macros, when making up poll wait queues. I tested it (together with the epoll's poll fix patch Andrew has in -mm) and wakeups for the supported devices are correctly filtered. Test program available here: http://www.xmailserver.org/epoll_test.c This patch: Nothing revolutionary here. Just using the available "key" that our wakeup core already support. The __wake_up_locked_key() was no brainer, since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers around __wake_up_common(). The __wake_up_sync() function had a body, so the choice was between borrowing the body for __wake_up_sync_key() and calling it from __wake_up_sync(), or make an inline and calling it from both. I chose the former since in most archs it all resolves to "mov $0, REG; jmp ADDR". 
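A hedged sketch of how a wakeup site is expected to use this: only the __wake_up_sync_key() signature comes from this patch, while the driver context, waitqueue and function name are invented. The poll mask is passed as the opaque key so keyed waiters such as epoll can ignore wakeups for events they do not care about.

#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/wait.h>

static void mydev_data_ready(wait_queue_head_t *wq)
{
	if (waitqueue_active(wq))
		__wake_up_sync_key(wq, TASK_INTERRUPTIBLE, 1,
				   (void *) (POLLIN | POLLRDNORM));
}
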
Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 7 +++++-- kernel/sched.c | 23 +++++++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index a210ede73b56..0d2eeb03a718 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -135,8 +135,11 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); -extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, + void *key); +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); diff --git a/kernel/sched.c b/kernel/sched.c index 196d48babbef..73513f4e19df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5196,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) __wake_up_common(q, mode, 1, 0, NULL); } +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} + /** - * __wake_up_sync - wake up threads blocked on a waitqueue. + * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @q: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not @@ -5209,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) { unsigned long flags; int sync = 1; @@ -5222,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) sync = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); + __wake_up_common(q, mode, nr_exclusive, sync, key); spin_unlock_irqrestore(&q->lock, flags); } +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ /** -- cgit v1.2.3-71-gd317 From c0da37753695e010776ccf2200a5731e0f88a9f3 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:20 -0700 Subject: epoll keyed wakeups: introduce new *_poll() wakeup macros Introduce new wakeup macros that allow passing an event mask to the wakeup targets. 
They exactly mimic their non-_poll() counterparts, with the added ability to pass an event mask. I added only the ones currently requested, avoiding the _nr() and _all() variants for the moment. Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 0d2eeb03a718..5d631c17eaee 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -158,21 +158,17 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE, 1) -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* - * macro to avoid include hell + * Wakeup macros to be used to report events to the targets. */ -#define wake_up_nested(x, s) \ -do { \ - unsigned long flags; \ - \ - spin_lock_irqsave_nested(&(x)->lock, flags, (s)); \ - wake_up_locked(x); \ - spin_unlock_irqrestore(&(x)->lock, flags); \ -} while (0) -#else -#define wake_up_nested(x, s) wake_up(x) -#endif +#define wake_up_poll(x, m) \ + __wake_up(x, TASK_NORMAL, 1, (void *) (m)) +#define wake_up_locked_poll(x, m) \ + __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) +#define wake_up_interruptible_poll(x, m) \ + __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) +#define wake_up_interruptible_sync_poll(x, m) \ + __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) #define __wait_event(wq, condition) \ do { \ -- cgit v1.2.3-71-gd317 From 364fdbc00fbdd409ade63500710123fe323aa164 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:36 -0700 Subject: spi_mpc83xx: rework chip selects handling The main purpose of this patch is to pass 'struct spi_device' to the chip select handling routines. This is needed so that we can implement full-fledged OpenFirmware support for this driver. While at it, also: - Replace the two {de,}activate_cs routines with a single cs_control(). - Don't duplicate platform data callbacks in the mpc83xx_spi struct.
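As a hypothetical board-code sketch of the new single hook: only the cs_control() and fsl_spi_init() signatures come from this patch; the GPIO number, modalias and myboard_* names are made up, and the initcall registration is left out:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/gpio.h>
#include <linux/spi/spi.h>
#include <sysdev/fsl_soc.h>

#define MYBOARD_SPI_CS_GPIO	42	/* assumption: board-specific pin */

static void myboard_spi_cs_control(struct spi_device *spi, bool on)
{
	/*
	 * The controller driver already folds SPI_CS_HIGH into 'on',
	 * so the board code just drives the requested level.
	 */
	pr_debug("%s: cs %d -> %d\n", __func__, spi->chip_select, on);
	gpio_set_value(MYBOARD_SPI_CS_GPIO, on);
}

static struct spi_board_info myboard_spi_boardinfo[] = {
	{
		.modalias	= "mmc_spi",	/* illustrative slave */
		.max_speed_hz	= 50000000,
		.bus_num	= 0,
		.chip_select	= 0,
	},
};

static int __init myboard_spi_init(void)
{
	return fsl_spi_init(myboard_spi_boardinfo,
			    ARRAY_SIZE(myboard_spi_boardinfo),
			    myboard_spi_cs_control);
	/* registered via machine_device_initcall() in a real board file */
}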
Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/83xx/mpc832x_rdb.c | 16 ++++------------ arch/powerpc/sysdev/fsl_soc.c | 14 ++++++-------- arch/powerpc/sysdev/fsl_soc.h | 5 +++-- drivers/spi/spi_mpc83xx.c | 20 +++++++------------- include/linux/fsl_devices.h | 5 +++-- 5 files changed, 23 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c index 2a1295f19832..28e23cde64a1 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c @@ -39,16 +39,10 @@ #endif #ifdef CONFIG_QUICC_ENGINE -static void mpc83xx_spi_activate_cs(u8 cs, u8 polarity) +static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on) { - pr_debug("%s %d %d\n", __func__, cs, polarity); - par_io_data_set(3, 13, polarity); -} - -static void mpc83xx_spi_deactivate_cs(u8 cs, u8 polarity) -{ - pr_debug("%s %d %d\n", __func__, cs, polarity); - par_io_data_set(3, 13, !polarity); + pr_debug("%s %d %d\n", __func__, spi->chip_select, on); + par_io_data_set(3, 13, on); } static struct mmc_spi_platform_data mpc832x_mmc_pdata = { @@ -74,9 +68,7 @@ static int __init mpc832x_spi_init(void) par_io_config_pin(3, 14, 2, 0, 0, 0); /* SD_INSERT, I */ par_io_config_pin(3, 15, 2, 0, 0, 0); /* SD_PROTECT,I */ - return fsl_spi_init(&mpc832x_spi_boardinfo, 1, - mpc83xx_spi_activate_cs, - mpc83xx_spi_deactivate_cs); + return fsl_spi_init(&mpc832x_spi_boardinfo, 1, mpc83xx_spi_cs_control); } machine_device_initcall(mpc832x_rdb, mpc832x_spi_init); #endif /* CONFIG_QUICC_ENGINE */ diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index a01c89d3f9bd..a46c1c867930 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -420,8 +420,8 @@ arch_initcall(fsl_usb_of_init); static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, struct spi_board_info *board_infos, unsigned int num_board_infos, - void (*activate_cs)(u8 cs, u8 polarity), - void (*deactivate_cs)(u8 cs, u8 polarity)) + void (*cs_control)(struct spi_device *dev, + bool on)) { struct device_node *np; unsigned int i = 0; @@ -433,8 +433,7 @@ static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, struct resource res[2]; struct platform_device *pdev; struct fsl_spi_platform_data pdata = { - .activate_cs = activate_cs, - .deactivate_cs = deactivate_cs, + .cs_control = cs_control, }; memset(res, 0, sizeof(res)); @@ -501,8 +500,7 @@ next: int __init fsl_spi_init(struct spi_board_info *board_infos, unsigned int num_board_infos, - void (*activate_cs)(u8 cs, u8 polarity), - void (*deactivate_cs)(u8 cs, u8 polarity)) + void (*cs_control)(struct spi_device *spi, bool on)) { u32 sysclk = -1; int ret; @@ -518,10 +516,10 @@ int __init fsl_spi_init(struct spi_board_info *board_infos, } ret = of_fsl_spi_probe(NULL, "fsl,spi", sysclk, board_infos, - num_board_infos, activate_cs, deactivate_cs); + num_board_infos, cs_control); if (!ret) of_fsl_spi_probe("spi", "fsl_spi", sysclk, board_infos, - num_board_infos, activate_cs, deactivate_cs); + num_board_infos, cs_control); return spi_register_board_info(board_infos, num_board_infos); } diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h index 9c744e4285a0..b5f3456780b8 100644 --- a/arch/powerpc/sysdev/fsl_soc.h +++ b/arch/powerpc/sysdev/fsl_soc.h @@ -4,6 +4,8 
@@ #include +struct spi_device; + extern phys_addr_t get_immrbase(void); #if defined(CONFIG_CPM2) || defined(CONFIG_QUICC_ENGINE) || defined(CONFIG_8xx) extern u32 get_brgfreq(void); @@ -19,8 +21,7 @@ struct device_node; extern int fsl_spi_init(struct spi_board_info *board_infos, unsigned int num_board_infos, - void (*activate_cs)(u8 cs, u8 polarity), - void (*deactivate_cs)(u8 cs, u8 polarity)); + void (*cs_control)(struct spi_device *spi, bool on)); extern void fsl_rstcr_restart(char *cmd); diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index df6420029004..b95085a46f90 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -89,9 +89,6 @@ struct mpc83xx_spi { bool qe_mode; - void (*activate_cs) (u8 cs, u8 polarity); - void (*deactivate_cs) (u8 cs, u8 polarity); - u8 busy; struct workqueue_struct *workqueue; @@ -153,15 +150,14 @@ MPC83XX_SPI_TX_BUF(u32) static void mpc83xx_spi_chipselect(struct spi_device *spi, int value) { - struct mpc83xx_spi *mpc83xx_spi; - u8 pol = spi->mode & SPI_CS_HIGH ? 1 : 0; + struct mpc83xx_spi *mpc83xx_spi = spi_master_get_devdata(spi->master); + struct fsl_spi_platform_data *pdata = spi->dev.parent->platform_data; + bool pol = spi->mode & SPI_CS_HIGH; struct spi_mpc83xx_cs *cs = spi->controller_state; - mpc83xx_spi = spi_master_get_devdata(spi->master); - if (value == BITBANG_CS_INACTIVE) { - if (mpc83xx_spi->deactivate_cs) - mpc83xx_spi->deactivate_cs(spi->chip_select, pol); + if (pdata->cs_control) + pdata->cs_control(spi, !pol); } if (value == BITBANG_CS_ACTIVE) { @@ -186,8 +182,8 @@ static void mpc83xx_spi_chipselect(struct spi_device *spi, int value) mpc83xx_spi_write_reg(mode, regval); local_irq_restore(flags); } - if (mpc83xx_spi->activate_cs) - mpc83xx_spi->activate_cs(spi->chip_select, pol); + if (pdata->cs_control) + pdata->cs_control(spi, pol); } } @@ -582,8 +578,6 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev) master->cleanup = mpc83xx_spi_cleanup; mpc83xx_spi = spi_master_get_devdata(master); - mpc83xx_spi->activate_cs = pdata->activate_cs; - mpc83xx_spi->deactivate_cs = pdata->deactivate_cs; mpc83xx_spi->qe_mode = pdata->qe_mode; mpc83xx_spi->get_rx = mpc83xx_spi_rx_buf_u8; mpc83xx_spi->get_tx = mpc83xx_spi_tx_buf_u8; diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index d9051d717d27..7bc1b643d370 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -95,14 +95,15 @@ struct fsl_usb2_platform_data { #define FSL_USB2_PORT0_ENABLED 0x00000001 #define FSL_USB2_PORT1_ENABLED 0x00000002 +struct spi_device; + struct fsl_spi_platform_data { u32 initial_spmode; /* initial SPMODE value */ u16 bus_num; bool qe_mode; /* board specific information */ u16 max_chipselect; - void (*activate_cs)(u8 cs, u8 polarity); - void (*deactivate_cs)(u8 cs, u8 polarity); + void (*cs_control)(struct spi_device *spi, bool on); u32 sysclk; }; -- cgit v1.2.3-71-gd317 From 35b4b3c0c1265f1a7342574be393c157601401f0 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:37 -0700 Subject: spi_mpc83xx: add OF platform driver bindings Implement full support for OF SPI bindings. Now the driver can manage its own chip selects without any help from the board files and/or fsl_soc constructors. The "legacy" code is well isolated and could be removed as time goes by. 
Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/spi_mpc83xx.c | 330 +++++++++++++++++++++++++++++++++++++++----- include/linux/fsl_devices.h | 2 +- 2 files changed, 295 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c index b95085a46f90..f4573a96af24 100644 --- a/drivers/spi/spi_mpc83xx.c +++ b/drivers/spi/spi_mpc83xx.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,7 +25,13 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include @@ -79,7 +87,7 @@ struct mpc83xx_spi { u32(*get_tx) (struct mpc83xx_spi *); unsigned int count; - int irq; + unsigned int irq; unsigned nsecs; /* (clock cycle time)/2 */ @@ -543,36 +551,23 @@ static void mpc83xx_spi_cleanup(struct spi_device *spi) kfree(spi->controller_state); } -static int __init mpc83xx_spi_probe(struct platform_device *dev) +static struct spi_master * __devinit +mpc83xx_spi_probe(struct device *dev, struct resource *mem, unsigned int irq) { + struct fsl_spi_platform_data *pdata = dev->platform_data; struct spi_master *master; struct mpc83xx_spi *mpc83xx_spi; - struct fsl_spi_platform_data *pdata; - struct resource *r; u32 regval; int ret = 0; - /* Get resources(memory, IRQ) associated with the device */ - master = spi_alloc_master(&dev->dev, sizeof(struct mpc83xx_spi)); - + master = spi_alloc_master(dev, sizeof(struct mpc83xx_spi)); if (master == NULL) { ret = -ENOMEM; goto err; } - platform_set_drvdata(dev, master); - pdata = dev->dev.platform_data; - - if (pdata == NULL) { - ret = -ENODEV; - goto free_master; - } + dev_set_drvdata(dev, master); - r = platform_get_resource(dev, IORESOURCE_MEM, 0); - if (r == NULL) { - ret = -ENODEV; - goto free_master; - } master->setup = mpc83xx_spi_setup; master->transfer = mpc83xx_spi_transfer; master->cleanup = mpc83xx_spi_cleanup; @@ -592,18 +587,13 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev) init_completion(&mpc83xx_spi->done); - mpc83xx_spi->base = ioremap(r->start, r->end - r->start + 1); + mpc83xx_spi->base = ioremap(mem->start, mem->end - mem->start + 1); if (mpc83xx_spi->base == NULL) { ret = -ENOMEM; goto put_master; } - mpc83xx_spi->irq = platform_get_irq(dev, 0); - - if (mpc83xx_spi->irq < 0) { - ret = -ENXIO; - goto unmap_io; - } + mpc83xx_spi->irq = irq; /* Register for SPI Interrupt */ ret = request_irq(mpc83xx_spi->irq, mpc83xx_spi_irq, @@ -645,9 +635,9 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev) printk(KERN_INFO "%s: MPC83xx SPI Controller driver at 0x%p (irq = %d)\n", - dev_name(&dev->dev), mpc83xx_spi->base, mpc83xx_spi->irq); + dev_name(dev), mpc83xx_spi->base, mpc83xx_spi->irq); - return ret; + return master; unreg_master: destroy_workqueue(mpc83xx_spi->workqueue); @@ -657,18 +647,16 @@ unmap_io: iounmap(mpc83xx_spi->base); put_master: spi_master_put(master); -free_master: - kfree(master); err: - return ret; + return ERR_PTR(ret); } -static int __exit mpc83xx_spi_remove(struct platform_device *dev) +static int __devexit mpc83xx_spi_remove(struct device *dev) { struct mpc83xx_spi *mpc83xx_spi; struct spi_master *master; - master = platform_get_drvdata(dev); + master = dev_get_drvdata(dev); mpc83xx_spi = spi_master_get_devdata(master); flush_workqueue(mpc83xx_spi->workqueue); @@ -681,23 +669,293 @@ static int __exit 
mpc83xx_spi_remove(struct platform_device *dev) return 0; } +struct mpc83xx_spi_probe_info { + struct fsl_spi_platform_data pdata; + int *gpios; + bool *alow_flags; +}; + +static struct mpc83xx_spi_probe_info * +to_of_pinfo(struct fsl_spi_platform_data *pdata) +{ + return container_of(pdata, struct mpc83xx_spi_probe_info, pdata); +} + +static void mpc83xx_spi_cs_control(struct spi_device *spi, bool on) +{ + struct device *dev = spi->dev.parent; + struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(dev->platform_data); + u16 cs = spi->chip_select; + int gpio = pinfo->gpios[cs]; + bool alow = pinfo->alow_flags[cs]; + + gpio_set_value(gpio, on ^ alow); +} + +static int of_mpc83xx_spi_get_chipselects(struct device *dev) +{ + struct device_node *np = dev_archdata_get_node(&dev->archdata); + struct fsl_spi_platform_data *pdata = dev->platform_data; + struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(pdata); + unsigned int ngpios; + int i = 0; + int ret; + + ngpios = of_gpio_count(np); + if (!ngpios) { + /* + * SPI w/o chip-select line. One SPI device is still permitted + * though. + */ + pdata->max_chipselect = 1; + return 0; + } + + pinfo->gpios = kmalloc(ngpios * sizeof(pinfo->gpios), GFP_KERNEL); + if (!pinfo->gpios) + return -ENOMEM; + memset(pinfo->gpios, -1, ngpios * sizeof(pinfo->gpios)); + + pinfo->alow_flags = kzalloc(ngpios * sizeof(pinfo->alow_flags), + GFP_KERNEL); + if (!pinfo->alow_flags) { + ret = -ENOMEM; + goto err_alloc_flags; + } + + for (; i < ngpios; i++) { + int gpio; + enum of_gpio_flags flags; + + gpio = of_get_gpio_flags(np, i, &flags); + if (!gpio_is_valid(gpio)) { + dev_err(dev, "invalid gpio #%d: %d\n", i, gpio); + goto err_loop; + } + + ret = gpio_request(gpio, dev_name(dev)); + if (ret) { + dev_err(dev, "can't request gpio #%d: %d\n", i, ret); + goto err_loop; + } + + pinfo->gpios[i] = gpio; + pinfo->alow_flags[i] = flags & OF_GPIO_ACTIVE_LOW; + + ret = gpio_direction_output(pinfo->gpios[i], + pinfo->alow_flags[i]); + if (ret) { + dev_err(dev, "can't set output direction for gpio " + "#%d: %d\n", i, ret); + goto err_loop; + } + } + + pdata->max_chipselect = ngpios; + pdata->cs_control = mpc83xx_spi_cs_control; + + return 0; + +err_loop: + while (i >= 0) { + if (gpio_is_valid(pinfo->gpios[i])) + gpio_free(pinfo->gpios[i]); + i--; + } + + kfree(pinfo->alow_flags); + pinfo->alow_flags = NULL; +err_alloc_flags: + kfree(pinfo->gpios); + pinfo->gpios = NULL; + return ret; +} + +static int of_mpc83xx_spi_free_chipselects(struct device *dev) +{ + struct fsl_spi_platform_data *pdata = dev->platform_data; + struct mpc83xx_spi_probe_info *pinfo = to_of_pinfo(pdata); + int i; + + if (!pinfo->gpios) + return 0; + + for (i = 0; i < pdata->max_chipselect; i++) { + if (gpio_is_valid(pinfo->gpios[i])) + gpio_free(pinfo->gpios[i]); + } + + kfree(pinfo->gpios); + kfree(pinfo->alow_flags); + return 0; +} + +static int __devinit of_mpc83xx_spi_probe(struct of_device *ofdev, + const struct of_device_id *ofid) +{ + struct device *dev = &ofdev->dev; + struct device_node *np = ofdev->node; + struct mpc83xx_spi_probe_info *pinfo; + struct fsl_spi_platform_data *pdata; + struct spi_master *master; + struct resource mem; + struct resource irq; + const void *prop; + int ret = -ENOMEM; + + pinfo = kzalloc(sizeof(*pinfo), GFP_KERNEL); + if (!pinfo) + return -ENOMEM; + + pdata = &pinfo->pdata; + dev->platform_data = pdata; + + /* Allocate bus num dynamically. */ + pdata->bus_num = -1; + + /* SPI controller is either clocked from QE or SoC clock. 
*/ + pdata->sysclk = get_brgfreq(); + if (pdata->sysclk == -1) { + pdata->sysclk = fsl_get_sys_freq(); + if (pdata->sysclk == -1) { + ret = -ENODEV; + goto err_clk; + } + } + + prop = of_get_property(np, "mode", NULL); + if (prop && !strcmp(prop, "cpu-qe")) + pdata->qe_mode = 1; + + ret = of_mpc83xx_spi_get_chipselects(dev); + if (ret) + goto err; + + ret = of_address_to_resource(np, 0, &mem); + if (ret) + goto err; + + ret = of_irq_to_resource(np, 0, &irq); + if (!ret) { + ret = -EINVAL; + goto err; + } + + master = mpc83xx_spi_probe(dev, &mem, irq.start); + if (IS_ERR(master)) { + ret = PTR_ERR(master); + goto err; + } + + of_register_spi_devices(master, np); + + return 0; + +err: + of_mpc83xx_spi_free_chipselects(dev); +err_clk: + kfree(pinfo); + return ret; +} + +static int __devexit of_mpc83xx_spi_remove(struct of_device *ofdev) +{ + int ret; + + ret = mpc83xx_spi_remove(&ofdev->dev); + if (ret) + return ret; + of_mpc83xx_spi_free_chipselects(&ofdev->dev); + return 0; +} + +static const struct of_device_id of_mpc83xx_spi_match[] = { + { .compatible = "fsl,spi" }, + {}, +}; +MODULE_DEVICE_TABLE(of, of_mpc83xx_spi_match); + +static struct of_platform_driver of_mpc83xx_spi_driver = { + .name = "mpc83xx_spi", + .match_table = of_mpc83xx_spi_match, + .probe = of_mpc83xx_spi_probe, + .remove = __devexit_p(of_mpc83xx_spi_remove), +}; + +#ifdef CONFIG_MPC832x_RDB +/* + * XXX XXX XXX + * This is "legacy" platform driver, was used by the MPC8323E-RDB boards + * only. The driver should go away soon, since newer MPC8323E-RDB's device + * tree can work with OpenFirmware driver. But for now we support old trees + * as well. + */ +static int __devinit plat_mpc83xx_spi_probe(struct platform_device *pdev) +{ + struct resource *mem; + unsigned int irq; + struct spi_master *master; + + if (!pdev->dev.platform_data) + return -EINVAL; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem) + return -EINVAL; + + irq = platform_get_irq(pdev, 0); + if (!irq) + return -EINVAL; + + master = mpc83xx_spi_probe(&pdev->dev, mem, irq); + if (IS_ERR(master)) + return PTR_ERR(master); + return 0; +} + +static int __devexit plat_mpc83xx_spi_remove(struct platform_device *pdev) +{ + return mpc83xx_spi_remove(&pdev->dev); +} + MODULE_ALIAS("platform:mpc83xx_spi"); static struct platform_driver mpc83xx_spi_driver = { - .remove = __exit_p(mpc83xx_spi_remove), + .probe = plat_mpc83xx_spi_probe, + .remove = __exit_p(plat_mpc83xx_spi_remove), .driver = { .name = "mpc83xx_spi", .owner = THIS_MODULE, }, }; +static bool legacy_driver_failed; + +static void __init legacy_driver_register(void) +{ + legacy_driver_failed = platform_driver_register(&mpc83xx_spi_driver); +} + +static void __exit legacy_driver_unregister(void) +{ + if (legacy_driver_failed) + return; + platform_driver_unregister(&mpc83xx_spi_driver); +} +#else +static void __init legacy_driver_register(void) {} +static void __exit legacy_driver_unregister(void) {} +#endif /* CONFIG_MPC832x_RDB */ + static int __init mpc83xx_spi_init(void) { - return platform_driver_probe(&mpc83xx_spi_driver, mpc83xx_spi_probe); + legacy_driver_register(); + return of_register_platform_driver(&of_mpc83xx_spi_driver); } static void __exit mpc83xx_spi_exit(void) { - platform_driver_unregister(&mpc83xx_spi_driver); + of_unregister_platform_driver(&of_mpc83xx_spi_driver); + legacy_driver_unregister(); } module_init(mpc83xx_spi_init); diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index 7bc1b643d370..7ef1caf50269 100644 --- a/include/linux/fsl_devices.h 
+++ b/include/linux/fsl_devices.h @@ -99,7 +99,7 @@ struct spi_device; struct fsl_spi_platform_data { u32 initial_spmode; /* initial SPMODE value */ - u16 bus_num; + s16 bus_num; bool qe_mode; /* board specific information */ u16 max_chipselect; -- cgit v1.2.3-71-gd317 From 79955898f961a870cbcc58f6ae13f3741a909da5 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 31 Mar 2009 15:24:45 -0700 Subject: autofs4: fix kernel includes autofs_dev-ioctl.h is included by both the kernel module and user space tools and it includes two kernel header files. Compiles work if the kernel headers are installed but fail otherwise. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/auto_dev-ioctl.h | 7 ++++++- include/linux/auto_fs.h | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h index 91a773993a5c..850f39b33e74 100644 --- a/include/linux/auto_dev-ioctl.h +++ b/include/linux/auto_dev-ioctl.h @@ -10,8 +10,13 @@ #ifndef _LINUX_AUTO_DEV_IOCTL_H #define _LINUX_AUTO_DEV_IOCTL_H +#include + +#ifdef __KERNEL__ #include -#include +#else +#include +#endif /* __KERNEL__ */ #define AUTOFS_DEVICE_NAME "autofs" diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h index c21e5972a3e8..63265852b7d1 100644 --- a/include/linux/auto_fs.h +++ b/include/linux/auto_fs.h @@ -17,11 +17,13 @@ #ifdef __KERNEL__ #include #include +#include +#include +#else #include +#include #endif /* __KERNEL__ */ -#include - /* This file describes autofs v3 */ #define AUTOFS_PROTO_VERSION 3 -- cgit v1.2.3-71-gd317 From 78d89ef40c2ff7265df077e20c4d76be7d415204 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Mar 2009 15:24:48 -0700 Subject: rtc: convert LEAP_YEAR into an inline - the LEAP_YEAR macro is buggy - it references its arg multiple times. Fix this by turning it into a C function. - give it a more approriate name - Move it to rtc.h so that other .c files can use it, instead of copying it. Cc: dann frazier Acked-by: Alessandro Zummo Cc: stephane eranian Cc: "Luck, Tony" Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-lib.c | 7 +++---- include/linux/rtc.h | 6 ++++++ 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c index dd70bf73ce9d..773851f338b8 100644 --- a/drivers/rtc/rtc-lib.c +++ b/drivers/rtc/rtc-lib.c @@ -26,14 +26,13 @@ static const unsigned short rtc_ydays[2][13] = { }; #define LEAPS_THRU_END_OF(y) ((y)/4 - (y)/100 + (y)/400) -#define LEAP_YEAR(year) ((!(year % 4) && (year % 100)) || !(year % 400)) /* * The number of days in the month. 
*/ int rtc_month_days(unsigned int month, unsigned int year) { - return rtc_days_in_month[month] + (LEAP_YEAR(year) && month == 1); + return rtc_days_in_month[month] + (is_leap_year(year) && month == 1); } EXPORT_SYMBOL(rtc_month_days); @@ -42,7 +41,7 @@ EXPORT_SYMBOL(rtc_month_days); */ int rtc_year_days(unsigned int day, unsigned int month, unsigned int year) { - return rtc_ydays[LEAP_YEAR(year)][month] + day-1; + return rtc_ydays[is_leap_year(year)][month] + day-1; } EXPORT_SYMBOL(rtc_year_days); @@ -66,7 +65,7 @@ void rtc_time_to_tm(unsigned long time, struct rtc_time *tm) - LEAPS_THRU_END_OF(1970 - 1); if (days < 0) { year -= 1; - days += 365 + LEAP_YEAR(year); + days += 365 + is_leap_year(year); } tm->tm_year = year - 1900; tm->tm_yday = days + 1; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 4046b75563c1..60f88a7fb13d 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -99,6 +99,7 @@ struct rtc_pll_info { #ifdef __KERNEL__ +#include #include extern int rtc_month_days(unsigned int month, unsigned int year); @@ -232,6 +233,11 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); +static inline bool is_leap_year(unsigned int year) +{ + return (!(year % 4) && (year % 100)) || !(year % 400); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_RTC_H_ */ -- cgit v1.2.3-71-gd317 From 614c0dc93284404be2a4d5750c79bb95f2b6c980 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:15 -0700 Subject: cirrusfb: add accelerator constant Add an accelerator constant so almost all Cirrus are recognized as accelerators by the fbset command. Signed-off-by: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/cirrusfb.c | 11 +++++++++-- include/linux/fb.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 15e2e6bfcbff..e9a2661669eb 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -519,6 +519,7 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, int yres; /* memory size in pixels */ unsigned pixels = info->screen_size * 8 / var->bits_per_pixel; + struct cirrusfb_info *cinfo = info->par; switch (var->bits_per_pixel) { case 1: @@ -627,6 +628,9 @@ static int cirrusfb_check_var(struct fb_var_screeninfo *var, if (cirrusfb_check_pixclock(var, info)) return -EINVAL; + if (!is_laguna(cinfo)) + var->accel_flags = FB_ACCELF_TEXT; + return 0; } @@ -2029,8 +2033,12 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) | FBINFO_HWACCEL_FILLRECT | FBINFO_HWACCEL_IMAGEBLIT | FBINFO_HWACCEL_COPYAREA; - if (noaccel || is_laguna(cinfo)) + if (noaccel || is_laguna(cinfo)) { info->flags |= FBINFO_HWACCEL_DISABLED; + info->fix.accel = FB_ACCEL_NONE; + } else + info->fix.accel = FB_ACCEL_CIRRUS_ALPINE; + info->fbops = &cirrusfb_ops; if (cinfo->btype == BT_GD5480) { @@ -2056,7 +2064,6 @@ static int __devinit cirrusfb_set_fbinfo(struct fb_info *info) /* FIXME: map region at 0xB8000 if available, fill in here */ info->fix.mmio_len = 0; - info->fix.accel = FB_ACCEL_NONE; fb_alloc_cmap(&info->cmap, 256, 0); diff --git a/include/linux/fb.h b/include/linux/fb.h index 31527e17076b..fe7d0d7907ab 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -123,6 +123,7 @@ struct dentry; #define FB_ACCEL_TRIDENT_3DIMAGE 51 /* Trident 3DImage */ #define FB_ACCEL_TRIDENT_BLADE3D 52 /* Trident Blade3D */ #define 
FB_ACCEL_TRIDENT_BLADEXP 53 /* Trident BladeXP */ +#define FB_ACCEL_CIRRUS_ALPINE 53 /* Cirrus Logic 543x/544x/5480 */ #define FB_ACCEL_NEOMAGIC_NM2070 90 /* NeoMagic NM2070 */ #define FB_ACCEL_NEOMAGIC_NM2090 91 /* NeoMagic NM2090 */ #define FB_ACCEL_NEOMAGIC_NM2093 92 /* NeoMagic NM2093 */ -- cgit v1.2.3-71-gd317 From 6a7f2829b5f8be124e168265f176dbbbea8861a0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Mar 2009 15:25:19 -0700 Subject: fbdev: uninline lock_fb_info() Before: text data bss dec hex filename 3648 2910 32 6590 19be drivers/video/backlight/backlight.o 3226 2812 32 6070 17b6 drivers/video/backlight/lcd.o 30990 16688 8480 56158 db5e drivers/video/console/fbcon.o 15488 8400 24 23912 5d68 drivers/video/fbmem.o After: text data bss dec hex filename 3537 2870 32 6439 1927 drivers/video/backlight/backlight.o 3131 2772 32 5935 172f drivers/video/backlight/lcd.o 30876 16648 8480 56004 dac4 drivers/video/console/fbcon.o 15506 8400 24 23930 5d7a drivers/video/fbmem.o Cc: Andrea Righi Cc: Geert Uytterhoeven Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/fbmem.c | 11 +++++++++++ include/linux/fb.h | 10 +--------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index b64f061dd447..2ac32e6b5953 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -46,6 +46,17 @@ struct fb_info *registered_fb[FB_MAX] __read_mostly; int num_registered_fb __read_mostly; +int lock_fb_info(struct fb_info *info) +{ + mutex_lock(&info->lock); + if (!info->fbops) { + mutex_unlock(&info->lock); + return 0; + } + return 1; +} +EXPORT_SYMBOL(lock_fb_info); + /* * Helpers */ diff --git a/include/linux/fb.h b/include/linux/fb.h index fe7d0d7907ab..f563c5013932 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -961,15 +961,7 @@ extern struct fb_info *registered_fb[FB_MAX]; extern int num_registered_fb; extern struct class *fb_class; -static inline int lock_fb_info(struct fb_info *info) -{ - mutex_lock(&info->lock); - if (!info->fbops) { - mutex_unlock(&info->lock); - return 0; - } - return 1; -} +extern int lock_fb_info(struct fb_info *info); static inline void unlock_fb_info(struct fb_info *info) { -- cgit v1.2.3-71-gd317 From 6fd5c665d8fe9da5f2081f0b3ca8054f0f730b1a Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 1 Apr 2009 21:42:23 +0200 Subject: include/linux/hdreg.h: cover struct hd_driveid with #ifndef/#endif __KERNEL__ Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/hdreg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index ed21bd3dbd25..47fcb05af8b4 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -448,6 +448,7 @@ enum { #define __NEW_HD_DRIVE_ID +#ifndef __KERNEL__ /* * Structure returned by HDIO_GET_IDENTITY, as per ANSI NCITS ATA6 rev.1b spec. * @@ -699,6 +700,7 @@ struct hd_driveid { * 7:0 Signature */ }; +#endif /* __KERNEL__ */ /* * IDE "nice" flags. 
These are used on a per drive basis to determine -- cgit v1.2.3-71-gd317 From dafd01cc14a38690c87981eb2670d9c95f799ffd Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 1 Apr 2009 21:42:25 +0200 Subject: include/linux/hdreg.h: cover WIN_* and friends with #ifndef/#endif __KERNEL__ Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/hdreg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index 47fcb05af8b4..101eb91de22f 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -191,6 +191,7 @@ typedef struct hd_drive_hob_hdr { #define TASKFILE_INVALID 0x7fff #endif +#ifndef __KERNEL__ /* ATA/ATAPI Commands pre T13 Spec */ #define WIN_NOP 0x00 /* @@ -379,6 +380,7 @@ typedef struct hd_drive_hob_hdr { #define SECURITY_ERASE_UNIT 0xBD #define SECURITY_FREEZE_LOCK 0xBE #define SECURITY_DISABLE_PASSWORD 0xBF +#endif /* __KERNEL__ */ struct hd_geometry { unsigned char heads; -- cgit v1.2.3-71-gd317 From 4fe6e30645de0b7a179892d83049580bf72bcff2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 1 Apr 2009 21:42:25 +0200 Subject: include/linux/hdreg.h: remove unused defines * Move HD_IRQ define to drivers/block/hd.c (only user). * Remove unused *_STAT, *_ERR, HD_*, CD, IO, REL and TAG_MASK defines. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/block/hd.c | 2 ++ include/linux/hdreg.h | 58 --------------------------------------------------- 2 files changed, 2 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 482c0c4b964f..3c11f062a18c 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -42,6 +42,8 @@ #include #include +#define HD_IRQ 14 + #define REALLY_SLOW_IO #include #include diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index 101eb91de22f..3bc8f9f986b6 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -3,64 +3,6 @@ #ifdef __KERNEL__ #include - -/* - * This file contains some defines for the AT-hd-controller. - * Various sources. - */ - -/* ide.c has its own port definitions in "ide.h" */ - -#define HD_IRQ 14 - -/* Hd controller regs. 
Ref: IBM AT Bios-listing */ -#define HD_DATA 0x1f0 /* _CTL when writing */ -#define HD_ERROR 0x1f1 /* see err-bits */ -#define HD_NSECTOR 0x1f2 /* nr of sectors to read/write */ -#define HD_SECTOR 0x1f3 /* starting sector */ -#define HD_LCYL 0x1f4 /* starting cylinder */ -#define HD_HCYL 0x1f5 /* high byte of starting cyl */ -#define HD_CURRENT 0x1f6 /* 101dhhhh , d=drive, hhhh=head */ -#define HD_STATUS 0x1f7 /* see status-bits */ -#define HD_FEATURE HD_ERROR /* same io address, read=error, write=feature */ -#define HD_PRECOMP HD_FEATURE /* obsolete use of this port - predates IDE */ -#define HD_COMMAND HD_STATUS /* same io address, read=status, write=cmd */ - -#define HD_CMD 0x3f6 /* used for resets */ -#define HD_ALTSTATUS 0x3f6 /* same as HD_STATUS but doesn't clear irq */ - -/* remainder is shared between hd.c, ide.c, ide-cd.c, and the hdparm utility */ - -/* Bits of HD_STATUS */ -#define ERR_STAT 0x01 -#define INDEX_STAT 0x02 -#define ECC_STAT 0x04 /* Corrected error */ -#define DRQ_STAT 0x08 -#define SEEK_STAT 0x10 -#define SRV_STAT 0x10 -#define WRERR_STAT 0x20 -#define READY_STAT 0x40 -#define BUSY_STAT 0x80 - -/* Bits for HD_ERROR */ -#define MARK_ERR 0x01 /* Bad address mark */ -#define ILI_ERR 0x01 /* Illegal Length Indication (ATAPI) */ -#define TRK0_ERR 0x02 /* couldn't find track 0 */ -#define EOM_ERR 0x02 /* End Of Media (ATAPI) */ -#define ABRT_ERR 0x04 /* Command aborted */ -#define MCR_ERR 0x08 /* media change request */ -#define ID_ERR 0x10 /* ID field not found */ -#define MC_ERR 0x20 /* media changed */ -#define ECC_ERR 0x40 /* Uncorrectable ECC error */ -#define BBD_ERR 0x80 /* pre-EIDE meaning: block marked bad */ -#define ICRC_ERR 0x80 /* new meaning: CRC error during transfer */ -#define LFS_ERR 0xf0 /* Last Failed Sense (ATAPI) */ - -/* Bits of HD_NSECTOR */ -#define CD 0x01 -#define IO 0x02 -#define REL 0x04 -#define TAG_MASK 0xf8 #endif /* __KERNEL__ */ #include -- cgit v1.2.3-71-gd317 From eae6c2b6414fc6673ac5415442fe463c01005366 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 1 Apr 2009 21:42:26 +0200 Subject: remove include from All users that need have been fixed to include it directly. Cc: Christoph Hellwig Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/hdreg.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index 3bc8f9f986b6..29ee2873f4a8 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -1,10 +1,6 @@ #ifndef _LINUX_HDREG_H #define _LINUX_HDREG_H -#ifdef __KERNEL__ -#include -#endif /* __KERNEL__ */ - #include /* -- cgit v1.2.3-71-gd317 From d9de451989a88a2003ca06e524aca4665c0c7f06 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Wed, 1 Apr 2009 15:47:02 +0200 Subject: dw_dmac: add cyclic API to DW DMA driver This patch adds a cyclic DMA interface to the DW DMA driver. This is very useful if you want to use the DMA controller in combination with a sound device which uses cyclic buffers. Using a DMA channel for cyclic DMA will disable the possibility to use it as a normal DMA engine until the user calls the cyclic free function on the DMA channel. Also a cyclic DMA list can not be prepared if the channel is already active. 
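A rough usage sketch of the cyclic API, loosely modelled on an audio playback buffer. It assumes the caller has already obtained a DW DMA channel (with a struct dw_dma_slave hung off chan->private) and a DMA-mapped buffer; only the dw_dma_cyclic_*() signatures and the period_callback fields come from this patch, the my_* pieces are illustrative:

#include <linux/dmaengine.h>
#include <linux/dma-mapping.h>
#include <linux/dw_dmac.h>
#include <linux/interrupt.h>
#include <linux/err.h>

struct my_stream {
	unsigned long periods_done;	/* bumped once per period */
};

/* Runs from the DMA controller's tasklet once per period; keep it short. */
static void my_period_elapsed(void *param)
{
	struct my_stream *stream = param;

	stream->periods_done++;
}

static int my_start_playback(struct dma_chan *chan, struct my_stream *stream,
			     dma_addr_t buf, size_t buf_len, size_t period_len)
{
	struct dw_cyclic_desc *cdesc;
	int ret;

	cdesc = dw_dma_cyclic_prep(chan, buf, buf_len, period_len,
				   DMA_TO_DEVICE);
	if (IS_ERR(cdesc))
		return PTR_ERR(cdesc);

	cdesc->period_callback = my_period_elapsed;
	cdesc->period_callback_param = stream;

	/* dw_dma_cyclic_start() must run with softirqs disabled. */
	local_bh_disable();
	ret = dw_dma_cyclic_start(chan);
	local_bh_enable();

	return ret;
}

static void my_stop_playback(struct dma_chan *chan)
{
	/* dw_dma_cyclic_stop() has the same softirq requirement. */
	local_bh_disable();
	dw_dma_cyclic_stop(chan);
	local_bh_enable();

	/* After this the channel is usable as a normal DMA engine again. */
	dw_dma_cyclic_free(chan);
}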
Signed-off-by: Hans-Christian Egtvedt Acked-by: Haavard Skinnemoen Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/dw_dmac.c | 332 ++++++++++++++++++++++++++++++++++++++++++++- drivers/dma/dw_dmac_regs.h | 7 +- include/linux/dw_dmac.h | 19 +++ 3 files changed, 356 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 862fc9ce9d86..0b8aada08aa8 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -363,6 +363,82 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc) dwc_descriptor_complete(dwc, bad_desc); } +/* --------------------- Cyclic DMA API extensions -------------------- */ + +inline dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + return channel_readl(dwc, SAR); +} +EXPORT_SYMBOL(dw_dma_get_src_addr); + +inline dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + return channel_readl(dwc, DAR); +} +EXPORT_SYMBOL(dw_dma_get_dst_addr); + +/* called with dwc->lock held and all DMAC interrupts disabled */ +static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc, + u32 status_block, u32 status_err, u32 status_xfer) +{ + if (status_block & dwc->mask) { + void (*callback)(void *param); + void *callback_param; + + dev_vdbg(chan2dev(&dwc->chan), "new cyclic period llp 0x%08x\n", + channel_readl(dwc, LLP)); + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + + callback = dwc->cdesc->period_callback; + callback_param = dwc->cdesc->period_callback_param; + if (callback) { + spin_unlock(&dwc->lock); + callback(callback_param); + spin_lock(&dwc->lock); + } + } + + /* + * Error and transfer complete are highly unlikely, and will most + * likely be due to a configuration error by the user. + */ + if (unlikely(status_err & dwc->mask) || + unlikely(status_xfer & dwc->mask)) { + int i; + + dev_err(chan2dev(&dwc->chan), "cyclic DMA unexpected %s " + "interrupt, stopping DMA transfer\n", + status_xfer ? 
"xfer" : "error"); + dev_err(chan2dev(&dwc->chan), + " SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n", + channel_readl(dwc, SAR), + channel_readl(dwc, DAR), + channel_readl(dwc, LLP), + channel_readl(dwc, CTL_HI), + channel_readl(dwc, CTL_LO)); + + channel_clear_bit(dw, CH_EN, dwc->mask); + while (dma_readl(dw, CH_EN) & dwc->mask) + cpu_relax(); + + /* make sure DMA does not restart by loading a new list */ + channel_writel(dwc, LLP, 0); + channel_writel(dwc, CTL_LO, 0); + channel_writel(dwc, CTL_HI, 0); + + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + dma_writel(dw, CLEAR.ERROR, dwc->mask); + dma_writel(dw, CLEAR.XFER, dwc->mask); + + for (i = 0; i < dwc->cdesc->periods; i++) + dwc_dump_lli(dwc, &dwc->cdesc->desc[i]->lli); + } +} + +/* ------------------------------------------------------------------------- */ + static void dw_dma_tasklet(unsigned long data) { struct dw_dma *dw = (struct dw_dma *)data; @@ -382,7 +458,10 @@ static void dw_dma_tasklet(unsigned long data) for (i = 0; i < dw->dma.chancnt; i++) { dwc = &dw->chan[i]; spin_lock(&dwc->lock); - if (status_err & (1 << i)) + if (test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) + dwc_handle_cyclic(dw, dwc, status_block, status_err, + status_xfer); + else if (status_err & (1 << i)) dwc_handle_error(dw, dwc); else if ((status_block | status_xfer) & (1 << i)) dwc_scan_descriptors(dw, dwc); @@ -883,6 +962,257 @@ static void dwc_free_chan_resources(struct dma_chan *chan) dev_vdbg(chan2dev(chan), "free_chan_resources done\n"); } +/* --------------------- Cyclic DMA API extensions -------------------- */ + +/** + * dw_dma_cyclic_start - start the cyclic DMA transfer + * @chan: the DMA channel to start + * + * Must be called with soft interrupts disabled. Returns zero on success or + * -errno on failure. + */ +int dw_dma_cyclic_start(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + + if (!test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) { + dev_err(chan2dev(&dwc->chan), "missing prep for cyclic DMA\n"); + return -ENODEV; + } + + spin_lock(&dwc->lock); + + /* assert channel is idle */ + if (dma_readl(dw, CH_EN) & dwc->mask) { + dev_err(chan2dev(&dwc->chan), + "BUG: Attempted to start non-idle channel\n"); + dev_err(chan2dev(&dwc->chan), + " SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n", + channel_readl(dwc, SAR), + channel_readl(dwc, DAR), + channel_readl(dwc, LLP), + channel_readl(dwc, CTL_HI), + channel_readl(dwc, CTL_LO)); + spin_unlock(&dwc->lock); + return -EBUSY; + } + + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + dma_writel(dw, CLEAR.ERROR, dwc->mask); + dma_writel(dw, CLEAR.XFER, dwc->mask); + + /* setup DMAC channel registers */ + channel_writel(dwc, LLP, dwc->cdesc->desc[0]->txd.phys); + channel_writel(dwc, CTL_LO, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN); + channel_writel(dwc, CTL_HI, 0); + + channel_set_bit(dw, CH_EN, dwc->mask); + + spin_unlock(&dwc->lock); + + return 0; +} +EXPORT_SYMBOL(dw_dma_cyclic_start); + +/** + * dw_dma_cyclic_stop - stop the cyclic DMA transfer + * @chan: the DMA channel to stop + * + * Must be called with soft interrupts disabled. 
+ */ +void dw_dma_cyclic_stop(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + + spin_lock(&dwc->lock); + + channel_clear_bit(dw, CH_EN, dwc->mask); + while (dma_readl(dw, CH_EN) & dwc->mask) + cpu_relax(); + + spin_unlock(&dwc->lock); +} +EXPORT_SYMBOL(dw_dma_cyclic_stop); + +/** + * dw_dma_cyclic_prep - prepare the cyclic DMA transfer + * @chan: the DMA channel to prepare + * @buf_addr: physical DMA address where the buffer starts + * @buf_len: total number of bytes for the entire buffer + * @period_len: number of bytes for each period + * @direction: transfer direction, to or from device + * + * Must be called before trying to start the transfer. Returns a valid struct + * dw_cyclic_desc if successful or an ERR_PTR(-errno) if not successful. + */ +struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, + dma_addr_t buf_addr, size_t buf_len, size_t period_len, + enum dma_data_direction direction) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_cyclic_desc *cdesc; + struct dw_cyclic_desc *retval = NULL; + struct dw_desc *desc; + struct dw_desc *last = NULL; + struct dw_dma_slave *dws = chan->private; + unsigned long was_cyclic; + unsigned int reg_width; + unsigned int periods; + unsigned int i; + + spin_lock_bh(&dwc->lock); + if (!list_empty(&dwc->queue) || !list_empty(&dwc->active_list)) { + spin_unlock_bh(&dwc->lock); + dev_dbg(chan2dev(&dwc->chan), + "queue and/or active list are not empty\n"); + return ERR_PTR(-EBUSY); + } + + was_cyclic = test_and_set_bit(DW_DMA_IS_CYCLIC, &dwc->flags); + spin_unlock_bh(&dwc->lock); + if (was_cyclic) { + dev_dbg(chan2dev(&dwc->chan), + "channel already prepared for cyclic DMA\n"); + return ERR_PTR(-EBUSY); + } + + retval = ERR_PTR(-EINVAL); + reg_width = dws->reg_width; + periods = buf_len / period_len; + + /* Check for too big/unaligned periods and unaligned DMA buffer. 
*/ + if (period_len > (DWC_MAX_COUNT << reg_width)) + goto out_err; + if (unlikely(period_len & ((1 << reg_width) - 1))) + goto out_err; + if (unlikely(buf_addr & ((1 << reg_width) - 1))) + goto out_err; + if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE)))) + goto out_err; + + retval = ERR_PTR(-ENOMEM); + + if (periods > NR_DESCS_PER_CHANNEL) + goto out_err; + + cdesc = kzalloc(sizeof(struct dw_cyclic_desc), GFP_KERNEL); + if (!cdesc) + goto out_err; + + cdesc->desc = kzalloc(sizeof(struct dw_desc *) * periods, GFP_KERNEL); + if (!cdesc->desc) + goto out_err_alloc; + + for (i = 0; i < periods; i++) { + desc = dwc_desc_get(dwc); + if (!desc) + goto out_err_desc_get; + + switch (direction) { + case DMA_TO_DEVICE: + desc->lli.dar = dws->tx_reg; + desc->lli.sar = buf_addr + (period_len * i); + desc->lli.ctllo = (DWC_DEFAULT_CTLLO + | DWC_CTLL_DST_WIDTH(reg_width) + | DWC_CTLL_SRC_WIDTH(reg_width) + | DWC_CTLL_DST_FIX + | DWC_CTLL_SRC_INC + | DWC_CTLL_FC_M2P + | DWC_CTLL_INT_EN); + break; + case DMA_FROM_DEVICE: + desc->lli.dar = buf_addr + (period_len * i); + desc->lli.sar = dws->rx_reg; + desc->lli.ctllo = (DWC_DEFAULT_CTLLO + | DWC_CTLL_SRC_WIDTH(reg_width) + | DWC_CTLL_DST_WIDTH(reg_width) + | DWC_CTLL_DST_INC + | DWC_CTLL_SRC_FIX + | DWC_CTLL_FC_P2M + | DWC_CTLL_INT_EN); + break; + default: + break; + } + + desc->lli.ctlhi = (period_len >> reg_width); + cdesc->desc[i] = desc; + + if (last) { + last->lli.llp = desc->txd.phys; + dma_sync_single_for_device(chan2parent(chan), + last->txd.phys, sizeof(last->lli), + DMA_TO_DEVICE); + } + + last = desc; + } + + /* lets make a cyclic list */ + last->lli.llp = cdesc->desc[0]->txd.phys; + dma_sync_single_for_device(chan2parent(chan), last->txd.phys, + sizeof(last->lli), DMA_TO_DEVICE); + + dev_dbg(chan2dev(&dwc->chan), "cyclic prepared buf 0x%08x len %zu " + "period %zu periods %d\n", buf_addr, buf_len, + period_len, periods); + + cdesc->periods = periods; + dwc->cdesc = cdesc; + + return cdesc; + +out_err_desc_get: + while (i--) + dwc_desc_put(dwc, cdesc->desc[i]); +out_err_alloc: + kfree(cdesc); +out_err: + clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags); + return (struct dw_cyclic_desc *)retval; +} +EXPORT_SYMBOL(dw_dma_cyclic_prep); + +/** + * dw_dma_cyclic_free - free a prepared cyclic DMA transfer + * @chan: the DMA channel to free + */ +void dw_dma_cyclic_free(struct dma_chan *chan) +{ + struct dw_dma_chan *dwc = to_dw_dma_chan(chan); + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + struct dw_cyclic_desc *cdesc = dwc->cdesc; + int i; + + dev_dbg(chan2dev(&dwc->chan), "cyclic free\n"); + + if (!cdesc) + return; + + spin_lock_bh(&dwc->lock); + + channel_clear_bit(dw, CH_EN, dwc->mask); + while (dma_readl(dw, CH_EN) & dwc->mask) + cpu_relax(); + + dma_writel(dw, CLEAR.BLOCK, dwc->mask); + dma_writel(dw, CLEAR.ERROR, dwc->mask); + dma_writel(dw, CLEAR.XFER, dwc->mask); + + spin_unlock_bh(&dwc->lock); + + for (i = 0; i < cdesc->periods; i++) + dwc_desc_put(dwc, cdesc->desc[i]); + + kfree(cdesc->desc); + kfree(cdesc); + + clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags); +} +EXPORT_SYMBOL(dw_dma_cyclic_free); + /*----------------------------------------------------------------------*/ static void dw_dma_off(struct dw_dma *dw) diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h index b252b202c5cf..13a580767031 100644 --- a/drivers/dma/dw_dmac_regs.h +++ b/drivers/dma/dw_dmac_regs.h @@ -126,6 +126,10 @@ struct dw_dma_regs { #define DW_REGLEN 0x400 +enum dw_dmac_flags { + DW_DMA_IS_CYCLIC = 0, +}; + struct dw_dma_chan { struct 
dma_chan chan; void __iomem *ch_regs; @@ -134,10 +138,12 @@ struct dw_dma_chan { spinlock_t lock; /* these other elements are all protected by lock */ + unsigned long flags; dma_cookie_t completed; struct list_head active_list; struct list_head queue; struct list_head free_list; + struct dw_cyclic_desc *cdesc; unsigned int descs_allocated; }; @@ -158,7 +164,6 @@ static inline struct dw_dma_chan *to_dw_dma_chan(struct dma_chan *chan) return container_of(chan, struct dw_dma_chan, chan); } - struct dw_dma { struct dma_device dma; void __iomem *regs; diff --git a/include/linux/dw_dmac.h b/include/linux/dw_dmac.h index d797dde247f7..c8aad713a046 100644 --- a/include/linux/dw_dmac.h +++ b/include/linux/dw_dmac.h @@ -74,4 +74,23 @@ struct dw_dma_slave { #define DWC_CFGL_HS_DST_POL (1 << 18) /* dst handshake active low */ #define DWC_CFGL_HS_SRC_POL (1 << 19) /* src handshake active low */ +/* DMA API extensions */ +struct dw_cyclic_desc { + struct dw_desc **desc; + unsigned long periods; + void (*period_callback)(void *param); + void *period_callback_param; +}; + +struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, + dma_addr_t buf_addr, size_t buf_len, size_t period_len, + enum dma_data_direction direction); +void dw_dma_cyclic_free(struct dma_chan *chan); +int dw_dma_cyclic_start(struct dma_chan *chan); +void dw_dma_cyclic_stop(struct dma_chan *chan); + +dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan); + +dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan); + #endif /* DW_DMAC_H */ -- cgit v1.2.3-71-gd317 From 45194e4f89fbdd97a2b7d2698c05f0b00c19e820 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Thu, 2 Apr 2009 19:55:28 +0100 Subject: dm target: remove struct tt_internal The tt_internal is really just a list_head to manage registered target_type in a double linked list, Here embed the list_head into target_type directly, 1. to avoid kmalloc/kfree; 2. 
then tt_internal is really unneeded; Cc: stable@kernel.org Signed-off-by: Cheng Renquan Signed-off-by: Alasdair G Kergon Reviewed-by: Alasdair G Kergon --- drivers/md/dm-target.c | 90 +++++++++++++++---------------------------- drivers/md/dm.h | 2 +- include/linux/device-mapper.h | 3 ++ 3 files changed, 34 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index db72c9497bb4..04feccf2a997 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -14,40 +14,34 @@ #define DM_MSG_PREFIX "target" -struct tt_internal { - struct target_type tt; - - struct list_head list; -}; - static LIST_HEAD(_targets); static DECLARE_RWSEM(_lock); #define DM_MOD_NAME_SIZE 32 -static inline struct tt_internal *__find_target_type(const char *name) +static inline struct target_type *__find_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; - list_for_each_entry (ti, &_targets, list) - if (!strcmp(name, ti->tt.name)) - return ti; + list_for_each_entry(tt, &_targets, list) + if (!strcmp(name, tt->name)) + return tt; return NULL; } -static struct tt_internal *get_target_type(const char *name) +static struct target_type *get_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - ti = __find_target_type(name); - if (ti && !try_module_get(ti->tt.module)) - ti = NULL; + tt = __find_target_type(name); + if (tt && !try_module_get(tt->module)) + tt = NULL; up_read(&_lock); - return ti; + return tt; } static void load_module(const char *name) @@ -57,83 +51,59 @@ static void load_module(const char *name) struct target_type *dm_get_target_type(const char *name) { - struct tt_internal *ti = get_target_type(name); + struct target_type *tt = get_target_type(name); - if (!ti) { + if (!tt) { load_module(name); - ti = get_target_type(name); + tt = get_target_type(name); } - return ti ? 
&ti->tt : NULL; + return tt; } -void dm_put_target_type(struct target_type *t) +void dm_put_target_type(struct target_type *tt) { - struct tt_internal *ti = (struct tt_internal *) t; - down_read(&_lock); - module_put(ti->tt.module); + module_put(tt->module); up_read(&_lock); - - return; } -static struct tt_internal *alloc_target(struct target_type *t) -{ - struct tt_internal *ti = kzalloc(sizeof(*ti), GFP_KERNEL); - - if (ti) - ti->tt = *t; - - return ti; -} - - int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - list_for_each_entry (ti, &_targets, list) - iter_func(&ti->tt, param); + list_for_each_entry(tt, &_targets, list) + iter_func(tt, param); up_read(&_lock); return 0; } -int dm_register_target(struct target_type *t) +int dm_register_target(struct target_type *tt) { int rv = 0; - struct tt_internal *ti = alloc_target(t); - - if (!ti) - return -ENOMEM; down_write(&_lock); - if (__find_target_type(t->name)) + if (__find_target_type(tt->name)) rv = -EEXIST; else - list_add(&ti->list, &_targets); + list_add(&tt->list, &_targets); up_write(&_lock); - if (rv) - kfree(ti); return rv; } -void dm_unregister_target(struct target_type *t) +void dm_unregister_target(struct target_type *tt) { - struct tt_internal *ti; - down_write(&_lock); - if (!(ti = __find_target_type(t->name))) { - DMCRIT("Unregistering unrecognised target: %s", t->name); + if (!__find_target_type(tt->name)) { + DMCRIT("Unregistering unrecognised target: %s", tt->name); BUG(); } - list_del(&ti->list); - kfree(ti); + list_del(&tt->list); up_write(&_lock); } @@ -142,17 +112,17 @@ void dm_unregister_target(struct target_type *t) * io-err: always fails an io, useful for bringing * up LVs that have holes in them. */ -static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) +static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) { return 0; } -static void io_err_dtr(struct dm_target *ti) +static void io_err_dtr(struct dm_target *tt) { /* empty */ } -static int io_err_map(struct dm_target *ti, struct bio *bio, +static int io_err_map(struct dm_target *tt, struct bio *bio, union map_info *map_context) { return -EIO; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 20194e000c5a..b48397c0abbd 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -60,7 +60,7 @@ int dm_table_barrier_ok(struct dm_table *t); int dm_target_init(void); void dm_target_exit(void); struct target_type *dm_get_target_type(const char *name); -void dm_put_target_type(struct target_type *t); +void dm_put_target_type(struct target_type *tt); int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 8209e08969f9..66ec05a57955 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -139,6 +139,9 @@ struct target_type { dm_ioctl_fn ioctl; dm_merge_fn merge; dm_busy_fn busy; + + /* For internal device-mapper use. */ + struct list_head list; }; struct io_restrictions { -- cgit v1.2.3-71-gd317 From ec44ab9d6681ddf9026b593e866bec9c0e075e1d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 2 Apr 2009 19:55:30 +0100 Subject: dm log: remove struct dm_dirty_log_internal Remove the 'dm_dirty_log_internal' structure. The resulting cleanup eliminates extra memory allocations. 
Therefore exposing the internal list_head to the external 'dm_dirty_log_type' structure is a worthwhile compromise. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 58 +++++++++++--------------------------------- include/linux/dm-dirty-log.h | 3 +++ 2 files changed, 17 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 094c8f0e0097..be233bc4d917 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -16,34 +16,28 @@ #define DM_MSG_PREFIX "dirty region log" -struct dm_dirty_log_internal { - struct dm_dirty_log_type *type; - - struct list_head list; -}; - static LIST_HEAD(_log_types); static DEFINE_SPINLOCK(_lock); -static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name) +static struct dm_dirty_log_type *__find_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; list_for_each_entry(log_type, &_log_types, list) - if (!strcmp(name, log_type->type->name)) + if (!strcmp(name, log_type->name)) return log_type; return NULL; } -static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) +static struct dm_dirty_log_type *_get_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; spin_lock(&_lock); log_type = __find_dirty_log_type(name); - if (log_type && !try_module_get(log_type->type->module)) + if (log_type && !try_module_get(log_type->module)) log_type = NULL; spin_unlock(&_lock); @@ -71,14 +65,14 @@ static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) static struct dm_dirty_log_type *get_type(const char *type_name) { char *p, *type_name_dup; - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; if (!type_name) return NULL; log_type = _get_dirty_log_type(type_name); if (log_type) - return log_type->type; + return log_type; type_name_dup = kstrdup(type_name, GFP_KERNEL); if (!type_name_dup) { @@ -100,19 +94,16 @@ static struct dm_dirty_log_type *get_type(const char *type_name) kfree(type_name_dup); - return log_type ? 
log_type->type : NULL; + return log_type; } static void put_type(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - if (!type) return; spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) + if (!__find_dirty_log_type(type->name)) goto out; module_put(type->module); @@ -121,32 +112,15 @@ out: spin_unlock(&_lock); } -static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type) -{ - struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type), - GFP_KERNEL); - - if (log_type) - log_type->type = type; - - return log_type; -} - int dm_dirty_log_type_register(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type); int r = 0; - if (!log_type) - return -ENOMEM; - spin_lock(&_lock); if (!__find_dirty_log_type(type->name)) - list_add(&log_type->list, &_log_types); - else { - kfree(log_type); + list_add(&type->list, &_log_types); + else r = -EEXIST; - } spin_unlock(&_lock); return r; @@ -155,20 +129,16 @@ EXPORT_SYMBOL(dm_dirty_log_type_register); int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) { + if (!__find_dirty_log_type(type->name)) { spin_unlock(&_lock); return -EINVAL; } - list_del(&log_type->list); + list_del(&type->list); spin_unlock(&_lock); - kfree(log_type); return 0; } diff --git a/include/linux/dm-dirty-log.h b/include/linux/dm-dirty-log.h index 600c5fb2daad..727602b686d4 100644 --- a/include/linux/dm-dirty-log.h +++ b/include/linux/dm-dirty-log.h @@ -28,6 +28,9 @@ struct dm_dirty_log_type { const char *name; struct module *module; + /* For internal device-mapper use */ + struct list_head list; + int (*ctr)(struct dm_dirty_log *log, struct dm_target *ti, unsigned argc, char **argv); void (*dtr)(struct dm_dirty_log *log); -- cgit v1.2.3-71-gd317 From 7513c2a761d69d2a93f17146b3563527d3618ba0 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 2 Apr 2009 19:55:30 +0100 Subject: dm raid1: add is_remote_recovering hook for clusters The logging API needs an extra function to make cluster mirroring possible. This new function allows us to check whether a mirror region is being recovered on another machine in the cluster. This helps us prevent simultaneous recovery I/O and process I/O to the same locations on disk. Cluster-aware log modules will implement this function. Single machine log modules will not. So, there is no performance penalty for single machine mirrors. 
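A minimal sketch of how a cluster-aware log module might wire up the new hook. Only the is_remote_recovering() signature and the dm_dirty_log_type registration come from this patch; the cluster query is a made-up stub and the type name is illustrative:

#include <linux/module.h>
#include <linux/dm-dirty-log.h>

struct cluster_log {
	void *cluster_handle;	/* per-log cluster state would live here */
};

/*
 * Stand-in for the real cluster round-trip.  The API explicitly allows
 * this call to block while other nodes are consulted.
 */
static int cluster_query_remote_recovery(struct cluster_log *cl,
					 region_t region)
{
	return 0;	/* pretend no other node is recovering this region */
}

static int cluster_is_remote_recovering(struct dm_dirty_log *log,
					region_t region)
{
	struct cluster_log *cl = log->context;

	return cluster_query_remote_recovery(cl, region);
}

static struct dm_dirty_log_type cluster_log_type = {
	.name			= "clustered-disk",	/* illustrative name */
	.module			= THIS_MODULE,
	/* .ctr, .dtr, .mark_region, etc. omitted for brevity */
	.is_remote_recovering	= cluster_is_remote_recovering,
};

static int __init cluster_log_init(void)
{
	return dm_dirty_log_type_register(&cluster_log_type);
}

static void __exit cluster_log_exit(void)
{
	dm_dirty_log_type_unregister(&cluster_log_type);
}

module_init(cluster_log_init);
module_exit(cluster_log_exit);
MODULE_LICENSE("GPL");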
Signed-off-by: Jonathan Brassow Acked-by: Heinz Mauelshagen Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 25 +++++++++++++++++++++++-- include/linux/dm-dirty-log.h | 10 ++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 62d594889ac3..536ef0bef154 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -588,6 +588,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) int state; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; + struct bio_list requeue; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region; if (!writes->head) return; @@ -598,10 +601,18 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&sync); bio_list_init(&nosync); bio_list_init(&recover); + bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - state = dm_rh_get_state(ms->rh, - dm_rh_bio_to_region(ms->rh, bio), 1); + region = dm_rh_bio_to_region(ms->rh, bio); + + if (log->type->is_remote_recovering && + log->type->is_remote_recovering(log, region)) { + bio_list_add(&requeue, bio); + continue; + } + + state = dm_rh_get_state(ms->rh, region, 1); switch (state) { case DM_RH_CLEAN: case DM_RH_DIRTY: @@ -620,6 +631,16 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_add(this_list, bio); } + /* + * Add bios that are delayed due to remote recovery + * back on to the write queue + */ + if (unlikely(requeue.head)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->writes, &requeue); + spin_unlock_irq(&ms->lock); + } + /* * Increment the pending counts for any regions that will * be written to (writes to recover regions are going to diff --git a/include/linux/dm-dirty-log.h b/include/linux/dm-dirty-log.h index 727602b686d4..5e8b11d88f6f 100644 --- a/include/linux/dm-dirty-log.h +++ b/include/linux/dm-dirty-log.h @@ -116,6 +116,16 @@ struct dm_dirty_log_type { */ int (*status)(struct dm_dirty_log *log, status_type_t status_type, char *result, unsigned maxlen); + + /* + * is_remote_recovering is necessary for cluster mirroring. It provides + * a way to detect recovery on another node, so we aren't writing + * concurrently. This function is likely to block (when a cluster log + * is used). 
+ * + * Returns: 0, 1 + */ + int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region); }; int dm_dirty_log_type_register(struct dm_dirty_log_type *type); -- cgit v1.2.3-71-gd317 From ee3b4290aec03022cfb67c9adba9f1b3215245f0 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 2 Apr 2009 16:56:30 -0700 Subject: generic debug pagealloc: build fix This fixes a build failure with generic debug pagealloc: mm/debug-pagealloc.c: In function 'set_page_poison': mm/debug-pagealloc.c:8: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: In function 'clear_page_poison': mm/debug-pagealloc.c:13: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: In function 'page_poison': mm/debug-pagealloc.c:18: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: At top level: mm/debug-pagealloc.c:120: error: redefinition of 'kernel_map_pages' include/linux/mm.h:1278: error: previous definition of 'kernel_map_pages' was here mm/debug-pagealloc.c: In function 'kernel_map_pages': mm/debug-pagealloc.c:122: error: 'debug_pagealloc_enabled' undeclared (first use in this function) by fixing - debug_flags should be in struct page - define DEBUG_PAGEALLOC config option for all architectures Signed-off-by: Akinobu Mita Reported-by: Alexander Beregalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig.debug | 10 ---------- arch/s390/Kconfig.debug | 9 --------- arch/sparc/Kconfig.debug | 9 --------- arch/x86/Kconfig.debug | 9 --------- include/linux/mm_types.h | 6 +++--- mm/Kconfig.debug | 9 +++++++++ 6 files changed, 12 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 6aa0b5e087cd..a1098e23221f 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -27,16 +27,6 @@ config DEBUG_STACK_USAGE This option will slow down process creation somewhat. -config DEBUG_PAGEALLOC - bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !HIBERNATION - depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC - help - Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruptions. - - config HCALL_STATS bool "Hypervisor call instrumentation" depends on PPC_PSERIES && DEBUG_FS diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index 7e297a3cde34..2283933a9a93 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -6,13 +6,4 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config DEBUG_PAGEALLOC - bool "Debug page memory allocations" - depends on DEBUG_KERNEL - depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC - help - Unmap pages from the kernel linear mapping after free_pages(). - This results in a slowdown, but helps to find certain types of - memory corruptions. - endmenu diff --git a/arch/sparc/Kconfig.debug b/arch/sparc/Kconfig.debug index d001b42041a5..90d5fe223a74 100644 --- a/arch/sparc/Kconfig.debug +++ b/arch/sparc/Kconfig.debug @@ -22,15 +22,6 @@ config DEBUG_DCFLUSH config STACK_DEBUG bool "Stack Overflow Detection Support" -config DEBUG_PAGEALLOC - bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !HIBERNATION - depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC - help - Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruptions. 
- config MCOUNT bool depends on SPARC64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index a345cb5447a8..d8359e73317f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -72,15 +72,6 @@ config DEBUG_STACK_USAGE This option will slow down process creation somewhat. -config DEBUG_PAGEALLOC - bool "Debug page memory allocations" - depends on DEBUG_KERNEL - depends on ARCH_SUPPORTS_DEBUG_PAGEALLOC - ---help--- - Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruptions. - config DEBUG_PER_CPU_MAPS bool "Debug access to per_cpu maps" depends on DEBUG_KERNEL diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ddadb4defe00..0e80e26ecf21 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -95,6 +95,9 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS + unsigned long debug_flags; /* Use atomic bitops on this */ +#endif }; /* @@ -175,9 +178,6 @@ struct vm_area_struct { #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif -#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS - unsigned long debug_flags; /* Use atomic bitops on this */ -#endif }; struct core_thread { diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index c8d62d49a44e..bb01e298f260 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -1,3 +1,12 @@ +config DEBUG_PAGEALLOC + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION || !PPC && !SPARC + ---help--- + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + config WANT_PAGE_DEBUG_FLAGS bool -- cgit v1.2.3-71-gd317 From 33e5d76979cf01e3834814fe0aea569d1d602c1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Apr 2009 16:56:32 -0700 Subject: nommu: fix a number of issues with the per-MM VMA patch Fix a number of issues with the per-MM VMA patch: (1) Make mmap_pages_allocated an atomic_long_t, just in case this is used on a NOMMU system with more than 2G pages. Makes no difference on a 32-bit system. (2) Report vma->vm_pgoff * PAGE_SIZE as a 64-bit value, not a 32-bit value, lest it overflow. (3) Move the allocation of the vm_area_struct slab back for fork.c. (4) Use KMEM_CACHE() for both vm_area_struct and vm_region slabs. (5) Use BUG_ON() rather than if () BUG(). (6) Make the default validate_nommu_regions() a static inline rather than a #define. (7) Make free_page_series()'s objection to pages with a refcount != 1 more informative. (8) Adjust the __put_nommu_region() banner comment to indicate that the semaphore must be held for writing. (9) Limit the number of warnings about munmaps of non-mmapped regions. 
Reported-by: Andrew Morton Signed-off-by: David Howells Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- fs/proc/task_nommu.c | 4 ++-- include/linux/mm.h | 2 +- kernel/fork.c | 1 + mm/mmap.c | 3 --- mm/nommu.c | 52 +++++++++++++++++++++++++--------------------------- 6 files changed, 30 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 43d23948384a..74ea974f5ca6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram-i.freehigh), #endif #ifndef CONFIG_MMU - K((unsigned long) atomic_read(&mmap_pages_allocated)), + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), #endif K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc8..370be0a2c909 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -136,14 +136,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/include/linux/mm.h b/include/linux/mm.h index aeabe953ba4f..bff1f0d475c7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1079,7 +1079,7 @@ static inline void setup_per_cpu_pageset(void) {} #endif /* nommu.c */ -extern atomic_t mmap_pages_allocated; +extern atomic_long_t mmap_pages_allocated; /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); diff --git a/kernel/fork.c b/kernel/fork.c index 47c15840a381..51d1aa21483b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1488,6 +1488,7 @@ void __init proc_caches_init(void) mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); } diff --git a/mm/mmap.c b/mm/mmap.c index 1abb9185a686..4a3841186c11 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2481,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); } diff --git a/mm/nommu.c b/mm/nommu.c index 2fcf47d449b4..72eda4aee2cb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; -atomic_t mmap_pages_allocated; +atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); @@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { - vm_region_jar = kmem_cache_create("vm_region_jar", - sizeof(struct vm_region), 0, - SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } /* @@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(last->vm_end <= last->vm_start)) - 
BUG(); - if (unlikely(last->vm_top < last->vm_end)) - BUG(); + BUG_ON(unlikely(last->vm_end <= last->vm_start)); + BUG_ON(unlikely(last->vm_top < last->vm_end)); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(region->vm_end <= region->vm_start)) - BUG(); - if (unlikely(region->vm_top < region->vm_end)) - BUG(); - if (unlikely(region->vm_start < last->vm_top)) - BUG(); + BUG_ON(unlikely(region->vm_end <= region->vm_start)); + BUG_ON(unlikely(region->vm_top < region->vm_end)); + BUG_ON(unlikely(region->vm_start < last->vm_top)); lastp = p; } } #else -#define validate_nommu_regions() do {} while(0) +static void validate_nommu_regions(void) +{ +} #endif /* @@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to) struct page *page = virt_to_page(from); kdebug("- free %lx", from); - atomic_dec(&mmap_pages_allocated); + atomic_long_dec(&mmap_pages_allocated); if (page_count(page) != 1) - kdebug("free page %p [%d]", page, page_count(page)); + kdebug("free page %p: refcount not one: %d", + page, page_count(page)); put_page(page); } } /* * release a reference to a region - * - the caller must hold the region semaphore, which this releases + * - the caller must hold the region semaphore for writing, which this releases * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ @@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma, goto enomem; total = 1 << order; - atomic_add(total, &mmap_pages_allocated); + atomic_long_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; @@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma, order = ilog2(total - point); n = 1 << order; kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); + atomic_long_sub(n, &mmap_pages_allocated); total -= n; set_page_refcounted(pages + total); __free_pages(pages + total, order); @@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d (%s):" - " 0x%lx-0x%lx\n", - current->pid, current->comm, start, start + len - 1); + static int limit = 0; + if (limit < 5) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d" + " (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } return -EINVAL; } -- cgit v1.2.3-71-gd317 From 8e2c3795c78d5c4e2e1f14ce751e9d08decbe9d3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Apr 2009 16:56:44 -0700 Subject: add fiemap.h to header-y Include fiemap.h in header-y; it defines the interface for the FS_IOC_FIEMAP file mapping ioctl. 
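For reference, the kind of userspace code that needs this header looks roughly like the sketch below (minimal error handling, fixed extent count); it only illustrates why fiemap.h must be exported and is not a complete tool.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FS_IOC_FIEMAP */
#include <linux/fiemap.h>       /* struct fiemap, struct fiemap_extent */

int main(int argc, char **argv)
{
        struct fiemap *fm;
        unsigned int i;
        int fd;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0)
                return 1;

        /* room for up to 32 extents; a real tool would loop */
        fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
        if (!fm)
                return 1;
        fm->fm_start = 0;
        fm->fm_length = ~0ULL;          /* map the whole file */
        fm->fm_extent_count = 32;

        if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
                perror("FS_IOC_FIEMAP");
                return 1;
        }

        for (i = 0; i < fm->fm_mapped_extents; i++)
                printf("extent %u: logical %llu physical %llu length %llu\n",
                       i,
                       (unsigned long long)fm->fm_extents[i].fe_logical,
                       (unsigned long long)fm->fm_extents[i].fe_physical,
                       (unsigned long long)fm->fm_extents[i].fe_length);

        free(fm);
        close(fd);
        return 0;
}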
Signed-off-by: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index a67b6227d272..ca9b9b9bd331 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -67,6 +67,7 @@ header-y += falloc.h header-y += fd.h header-y += fdreg.h header-y += fib_rules.h +header-y += fiemap.h header-y += firewire-cdev.h header-y += firewire-constants.h header-y += fuse.h -- cgit v1.2.3-71-gd317 From 9a896c9a48ac6704c0ce8ee081b836644d0afe40 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 2 Apr 2009 16:56:45 -0700 Subject: mm: define a UNIQUE value for AS_UNEVICTABLE flag A new "address_space flag"--AS_MM_ALL_LOCKS--was defined to use the next available AS flag while the Unevictable LRU was under development. The Unevictable LRU was using the same flag and "no one" noticed. Current mainline, since 2.6.28, has same value for two symbolic flag names. So, define a unique flag value for AS_UNEVICTABLE--up close to the other flags, [at the cost of an additional #ifdef] so we'll notice next time. Note that #ifdef is not actually required, if we don't mind having the unused flag value defined. Replace #defines with an enum. Signed-off-by: Lee Schermerhorn Cc: [2.6.28.x, 2.6.29.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 01ca0856caff..076a7dc67c2b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,9 +18,14 @@ * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page * allocation mode flags. */ -#define AS_EIO (__GFP_BITS_SHIFT + 0) /* IO error on async write */ -#define AS_ENOSPC (__GFP_BITS_SHIFT + 1) /* ENOSPC on async write */ -#define AS_MM_ALL_LOCKS (__GFP_BITS_SHIFT + 2) /* under mm_take_all_locks() */ +enum mapping_flags { + AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ + AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ + AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ +#ifdef CONFIG_UNEVICTABLE_LRU + AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ +#endif +}; static inline void mapping_set_error(struct address_space *mapping, int error) { @@ -33,7 +38,6 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } #ifdef CONFIG_UNEVICTABLE_LRU -#define AS_UNEVICTABLE (__GFP_BITS_SHIFT + 2) /* e.g., ramdisk, SHM_LOCK */ static inline void mapping_set_unevictable(struct address_space *mapping) { -- cgit v1.2.3-71-gd317 From bf6aede712334d7338d5c47a5ee5ba3883c82a61 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 2 Apr 2009 16:56:54 -0700 Subject: workqueue: add to_delayed_work() helper function It is a fairly common operation to have a pointer to a work and to need a pointer to the delayed work it is contained in. In particular, all delayed works which want to rearm themselves will have to do that. So it would seem fair to offer a helper function for this operation. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: "David S. 
Miller" Cc: Herbert Xu Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Greg KH Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/vio.c | 2 +- drivers/crypto/hifn_795x.c | 2 +- drivers/input/mouse/hgpk.c | 2 +- drivers/net/dm9000.c | 2 +- drivers/net/mlx4/en_netdev.c | 2 +- drivers/net/mlx4/en_rx.c | 2 +- drivers/net/mlx4/sense.c | 2 +- drivers/net/phy/phy.c | 3 +-- drivers/s390/scsi/zfcp_fc.c | 2 +- drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c | 8 ++++---- drivers/staging/rtl8187se/r8180_core.c | 8 ++++---- drivers/usb/wusbcore/devconnect.c | 2 +- include/linux/workqueue.h | 5 +++++ mm/slab.c | 3 +-- 14 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index d3694498f3af..819e59f6f7c7 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -482,7 +482,7 @@ static void vio_cmo_balance(struct work_struct *work) cmo->excess.size = cmo->entitled - cmo->reserve.size; cmo->excess.free = cmo->excess.size - need; - cancel_delayed_work(container_of(work, struct delayed_work, work)); + cancel_delayed_work(to_delayed_work(work)); spin_unlock_irqrestore(&vio_cmo.lock, flags); } diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index 0c79fe7f1567..4d85402a9e4a 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -1882,7 +1882,7 @@ static void hifn_clear_rings(struct hifn_device *dev, int error) static void hifn_work(struct work_struct *work) { - struct delayed_work *dw = container_of(work, struct delayed_work, work); + struct delayed_work *dw = to_delayed_work(work); struct hifn_device *dev = container_of(dw, struct hifn_device, work); unsigned long flags; int reset = 0; diff --git a/drivers/input/mouse/hgpk.c b/drivers/input/mouse/hgpk.c index 81e6ebf323e9..55cd0fa68339 100644 --- a/drivers/input/mouse/hgpk.c +++ b/drivers/input/mouse/hgpk.c @@ -381,7 +381,7 @@ static void hgpk_disconnect(struct psmouse *psmouse) static void hgpk_recalib_work(struct work_struct *work) { - struct delayed_work *w = container_of(work, struct delayed_work, work); + struct delayed_work *w = to_delayed_work(work); struct hgpk_data *priv = container_of(w, struct hgpk_data, recalib_wq); struct psmouse *psmouse = priv->psmouse; diff --git a/drivers/net/dm9000.c b/drivers/net/dm9000.c index 254ec62b5f58..d8350860c0f8 100644 --- a/drivers/net/dm9000.c +++ b/drivers/net/dm9000.c @@ -559,7 +559,7 @@ static void dm9000_show_carrier(board_info_t *db, static void dm9000_poll_work(struct work_struct *w) { - struct delayed_work *dw = container_of(w, struct delayed_work, work); + struct delayed_work *dw = to_delayed_work(w); board_info_t *db = container_of(dw, board_info_t, phy_poll); struct net_device *ndev = db->ndev; diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index 9f6644a44030..303c23de6cac 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -505,7 +505,7 @@ out: static void mlx4_en_do_get_stats(struct work_struct *work) { - struct delayed_work *delay = container_of(work, struct delayed_work, work); + struct delayed_work *delay = to_delayed_work(work); struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, stats_task); struct mlx4_en_dev *mdev = priv->mdev; diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c index a4130e764991..7e40741fb7d8 100644 --- a/drivers/net/mlx4/en_rx.c +++ b/drivers/net/mlx4/en_rx.c @@ -298,7 +298,7 @@ 
static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv, void mlx4_en_rx_refill(struct work_struct *work) { - struct delayed_work *delay = container_of(work, struct delayed_work, work); + struct delayed_work *delay = to_delayed_work(work); struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, refill_task); struct mlx4_en_dev *mdev = priv->mdev; diff --git a/drivers/net/mlx4/sense.c b/drivers/net/mlx4/sense.c index 6d5089ecb5af..f36ae691cab3 100644 --- a/drivers/net/mlx4/sense.c +++ b/drivers/net/mlx4/sense.c @@ -103,7 +103,7 @@ void mlx4_do_sense_ports(struct mlx4_dev *dev, static void mlx4_sense_port(struct work_struct *work) { - struct delayed_work *delay = container_of(work, struct delayed_work, work); + struct delayed_work *delay = to_delayed_work(work); struct mlx4_sense *sense = container_of(delay, struct mlx4_sense, sense_poll); struct mlx4_dev *dev = sense->dev; diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 58b73b08dde0..3ff1f425f1bb 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -757,8 +757,7 @@ EXPORT_SYMBOL(phy_start); */ static void phy_state_machine(struct work_struct *work) { - struct delayed_work *dwork = - container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct phy_device *phydev = container_of(dwork, struct phy_device, state_queue); int needs_aneg = 0; diff --git a/drivers/s390/scsi/zfcp_fc.c b/drivers/s390/scsi/zfcp_fc.c index aab8123c5966..e8d032b9dfbd 100644 --- a/drivers/s390/scsi/zfcp_fc.c +++ b/drivers/s390/scsi/zfcp_fc.c @@ -94,7 +94,7 @@ static int zfcp_wka_port_get(struct zfcp_wka_port *wka_port) static void zfcp_wka_port_offline(struct work_struct *work) { - struct delayed_work *dw = container_of(work, struct delayed_work, work); + struct delayed_work *dw = to_delayed_work(work); struct zfcp_wka_port *wka_port = container_of(dw, struct zfcp_wka_port, work); diff --git a/drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c b/drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c index e5752f615e09..80f9cc7137c2 100644 --- a/drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c +++ b/drivers/staging/rtl8187se/ieee80211/ieee80211_softmac.c @@ -719,7 +719,7 @@ void ieee80211_softmac_scan(struct ieee80211_device *ieee) #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void ieee80211_softmac_scan_wq(struct work_struct *work) { - struct delayed_work *dwork = container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork, struct ieee80211_device, softmac_scan_wq); #else void ieee80211_softmac_scan_wq(struct ieee80211_device *ieee) @@ -777,7 +777,7 @@ out: #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void ieee80211_softmac_scan_wq(struct work_struct *work) { - struct delayed_work *dwork = container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(work, struct ieee80211_device, softmac_scan_wq); #else void ieee80211_softmac_scan_wq(struct ieee80211_device *ieee) @@ -2980,7 +2980,7 @@ void ieee80211_start_monitor_mode(struct ieee80211_device *ieee) #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void ieee80211_start_ibss_wq(struct work_struct *work) { - struct delayed_work *dwork = container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork, struct ieee80211_device, start_ibss_wq); #else 
void ieee80211_start_ibss_wq(struct ieee80211_device *ieee) @@ -3162,7 +3162,7 @@ void ieee80211_disassociate(struct ieee80211_device *ieee) #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void ieee80211_associate_retry_wq(struct work_struct *work) { - struct delayed_work *dwork = container_of(work, struct delayed_work, work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork, struct ieee80211_device, associate_retry_wq); #else void ieee80211_associate_retry_wq(struct ieee80211_device *ieee) diff --git a/drivers/staging/rtl8187se/r8180_core.c b/drivers/staging/rtl8187se/r8180_core.c index 66de5cc8ddf1..ff1f23f99f27 100644 --- a/drivers/staging/rtl8187se/r8180_core.c +++ b/drivers/staging/rtl8187se/r8180_core.c @@ -5438,7 +5438,7 @@ void rtl8180_hw_wakeup_wq (struct work_struct *work) // struct r8180_priv *priv = container_of(work, struct r8180_priv, watch_dog_wq); // struct ieee80211_device * ieee = (struct ieee80211_device*) // container_of(work, struct ieee80211_device, watch_dog_wq); - struct delayed_work *dwork = container_of(work,struct delayed_work,work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork,struct ieee80211_device,hw_wakeup_wq); struct net_device *dev = ieee->dev; #else @@ -5459,7 +5459,7 @@ void rtl8180_hw_sleep_wq (struct work_struct *work) // struct r8180_priv *priv = container_of(work, struct r8180_priv, watch_dog_wq); // struct ieee80211_device * ieee = (struct ieee80211_device*) // container_of(work, struct ieee80211_device, watch_dog_wq); - struct delayed_work *dwork = container_of(work,struct delayed_work,work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(dwork,struct ieee80211_device,hw_sleep_wq); struct net_device *dev = ieee->dev; #else @@ -6407,7 +6407,7 @@ priv->txnpring)/8); void rtl8180_tx_irq_wq(struct work_struct *work) { //struct r8180_priv *priv = container_of(work, struct r8180_priv, reset_wq); - struct delayed_work *dwork = container_of(work,struct delayed_work,work); + struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device * ieee = (struct ieee80211_device*) container_of(dwork, struct ieee80211_device, watch_dog_wq); struct net_device *dev = ieee->dev; @@ -6691,7 +6691,7 @@ lizhaoming--------------------------- RF power on/power off ----------------- #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)) void GPIOChangeRFWorkItemCallBack(struct work_struct *work) { - //struct delayed_work *dwork = container_of(work, struct delayed_work, work); + //struct delayed_work *dwork = to_delayed_work(work); struct ieee80211_device *ieee = container_of(work, struct ieee80211_device, GPIOChangeRFWorkItem.work); struct net_device *dev = ieee->dev; struct r8180_priv *priv = ieee80211_priv(dev); diff --git a/drivers/usb/wusbcore/devconnect.c b/drivers/usb/wusbcore/devconnect.c index f0aac0cf315a..386eaa22d215 100644 --- a/drivers/usb/wusbcore/devconnect.c +++ b/drivers/usb/wusbcore/devconnect.c @@ -471,7 +471,7 @@ static void __wusbhc_keep_alive(struct wusbhc *wusbhc) */ static void wusbhc_keep_alive_run(struct work_struct *ws) { - struct delayed_work *dw = container_of(ws, struct delayed_work, work); + struct delayed_work *dw = to_delayed_work(ws); struct wusbhc *wusbhc = container_of(dw, struct wusbhc, keep_alive_timer); mutex_lock(&wusbhc->mutex); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 3cd51e579ab1..13e1adf55c4c 100644 --- 
a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -41,6 +41,11 @@ struct delayed_work { struct timer_list timer; }; +static inline struct delayed_work *to_delayed_work(struct work_struct *work) +{ + return container_of(work, struct delayed_work, work); +} + struct execute_work { struct work_struct work; }; diff --git a/mm/slab.c b/mm/slab.c index 825c606f691d..208323fd37bc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3992,8 +3992,7 @@ static void cache_reap(struct work_struct *w) struct kmem_cache *searchp; struct kmem_list3 *l3; int node = numa_node_id(); - struct delayed_work *work = - container_of(w, struct delayed_work, work); + struct delayed_work *work = to_delayed_work(w); if (!mutex_trylock(&cache_chain_mutex)) /* Give up. Setup the next iteration. */ -- cgit v1.2.3-71-gd317 From 06c421ee0d5af95c8c6749ca0ba620cd5010707f Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Thu, 2 Apr 2009 16:56:56 -0700 Subject: memory_accessor: new interface for reading/writing persistent memory Add an interface by which other kernel code can read/write persistent memory such as I2C or SPI EEPROMs, or devices which provide NVRAM. Use cases include storage of board-specific configuration data like Ethernet addresses and sensor calibrations. Original idea, review and improvement suggestions by David Brownell. Acked-by: David Brownell Signed-off-by: Kevin Hilman Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 3fdc10806d31..42767d1a62e7 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -99,4 +99,15 @@ enum mem_add_context { BOOT, HOTPLUG }; #define hotplug_memory_notifier(fn, pri) do { } while (0) #endif +/* + * 'struct memory_accessor' is a generic interface to provide + * in-kernel access to persistent memory such as i2c or SPI EEPROMs + */ +struct memory_accessor { + ssize_t (*read)(struct memory_accessor *, char *buf, off_t offset, + size_t count); + ssize_t (*write)(struct memory_accessor *, const char *buf, + off_t offset, size_t count); +}; + #endif /* _LINUX_MEMORY_H_ */ -- cgit v1.2.3-71-gd317 From 7274ec8bd71e99018642f474528ea7de4bb3ae25 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Thu, 2 Apr 2009 16:56:57 -0700 Subject: memory_accessor: implement the new memory_accessor interface for I2C EEPROM In the case of at24, the platform code registers a 'setup' callback with the at24_platform_data. When the at24 driver detects an EEPROM, it fills out the read and write functions of the memory_accessor and calls the setup callback passing the memory_accessor struct. The platform code can then use the read/write functions in the memory_accessor struct for reading and writing the EEPROM. 
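To illustrate the consumer side (not part of this patch; the board names, EEPROM size and MAC offset are invented), platform code could hook in roughly like this:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/i2c.h>
#include <linux/memory.h>
#include <linux/i2c/at24.h>

static struct memory_accessor *board_ee_macc;

static void board_ee_setup(struct memory_accessor *macc, void *context)
{
        char mac[6];

        board_ee_macc = macc;           /* keep for later use if needed */

        /* assume the Ethernet address lives at offset 0 of the EEPROM */
        if (macc->read(macc, mac, 0, sizeof(mac)) != sizeof(mac))
                pr_warning("board: could not read MAC from EEPROM\n");
        /* otherwise hand 'mac' to the network device setup */
}

static struct at24_platform_data board_eeprom = {
        .byte_len       = 4096,         /* e.g. a 24c32 */
        .page_size      = 32,
        .setup          = board_ee_setup,
        .context        = NULL,
};

static struct i2c_board_info board_i2c_devices[] __initdata = {
        {
                I2C_BOARD_INFO("24c32", 0x50),
                .platform_data  = &board_eeprom,
        },
};

The i2c_board_info array would be registered from the machine init code with i2c_register_board_info(), as usual.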
Signed-off-by: Kevin Hilman Cc: David Brownell Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/eeprom/at24.c | 67 +++++++++++++++++++++++++++++++++++++--------- include/linux/i2c/at24.h | 4 +++ 2 files changed, 58 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c index d4775528abc6..d184dfab9631 100644 --- a/drivers/misc/eeprom/at24.c +++ b/drivers/misc/eeprom/at24.c @@ -53,6 +53,7 @@ struct at24_data { struct at24_platform_data chip; + struct memory_accessor macc; bool use_smbus; /* @@ -225,14 +226,11 @@ static ssize_t at24_eeprom_read(struct at24_data *at24, char *buf, return status; } -static ssize_t at24_bin_read(struct kobject *kobj, struct bin_attribute *attr, +static ssize_t at24_read(struct at24_data *at24, char *buf, loff_t off, size_t count) { - struct at24_data *at24; ssize_t retval = 0; - at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); - if (unlikely(!count)) return count; @@ -262,12 +260,14 @@ static ssize_t at24_bin_read(struct kobject *kobj, struct bin_attribute *attr, return retval; } +static ssize_t at24_bin_read(struct kobject *kobj, struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + struct at24_data *at24; -/* - * REVISIT: export at24_bin{read,write}() to let other kernel code use - * eeprom data. For example, it might hold a board's Ethernet address, or - * board-specific calibration data generated on the manufacturing floor. - */ + at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); + return at24_read(at24, buf, off, count); +} /* @@ -347,14 +347,11 @@ static ssize_t at24_eeprom_write(struct at24_data *at24, char *buf, return -ETIMEDOUT; } -static ssize_t at24_bin_write(struct kobject *kobj, struct bin_attribute *attr, +static ssize_t at24_write(struct at24_data *at24, char *buf, loff_t off, size_t count) { - struct at24_data *at24; ssize_t retval = 0; - at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); - if (unlikely(!count)) return count; @@ -384,6 +381,39 @@ static ssize_t at24_bin_write(struct kobject *kobj, struct bin_attribute *attr, return retval; } +static ssize_t at24_bin_write(struct kobject *kobj, struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + struct at24_data *at24; + + at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); + return at24_write(at24, buf, off, count); +} + +/*-------------------------------------------------------------------------*/ + +/* + * This lets other kernel code access the eeprom data. For example, it + * might hold a board's Ethernet address, or board-specific calibration + * data generated on the manufacturing floor. + */ + +static ssize_t at24_macc_read(struct memory_accessor *macc, char *buf, + off_t offset, size_t count) +{ + struct at24_data *at24 = container_of(macc, struct at24_data, macc); + + return at24_read(at24, buf, offset, count); +} + +static ssize_t at24_macc_write(struct memory_accessor *macc, char *buf, + off_t offset, size_t count) +{ + struct at24_data *at24 = container_of(macc, struct at24_data, macc); + + return at24_write(at24, buf, offset, count); +} + /*-------------------------------------------------------------------------*/ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) @@ -413,6 +443,9 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) * is recommended anyhow. 
*/ chip.page_size = 1; + + chip.setup = NULL; + chip.context = NULL; } if (!is_power_of_2(chip.byte_len)) @@ -463,6 +496,8 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) at24->bin.read = at24_bin_read; at24->bin.size = chip.byte_len; + at24->macc.read = at24_macc_read; + writable = !(chip.flags & AT24_FLAG_READONLY); if (writable) { if (!use_smbus || i2c_check_functionality(client->adapter, @@ -470,6 +505,8 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) unsigned write_max = chip.page_size; + at24->macc.write = at24_macc_write; + at24->bin.write = at24_bin_write; at24->bin.attr.mode |= S_IWUSR; @@ -520,6 +557,10 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) at24->write_max, use_smbus ? ", use_smbus" : ""); + /* export data to kernel code */ + if (chip.setup) + chip.setup(&at24->macc, chip.context); + return 0; err_clients: diff --git a/include/linux/i2c/at24.h b/include/linux/i2c/at24.h index f6edd522a929..8ace93024d60 100644 --- a/include/linux/i2c/at24.h +++ b/include/linux/i2c/at24.h @@ -2,6 +2,7 @@ #define _LINUX_AT24_H #include +#include /* * As seen through Linux I2C, differences between the most common types of I2C @@ -23,6 +24,9 @@ struct at24_platform_data { #define AT24_FLAG_READONLY 0x40 /* sysfs-entry will be read-only */ #define AT24_FLAG_IRUGO 0x20 /* sysfs-entry will be world-readable */ #define AT24_FLAG_TAKE8ADDR 0x10 /* take always 8 addresses (24c00) */ + + void (*setup)(struct memory_accessor *, void *context); + void *context; }; #endif /* _LINUX_AT24_H */ -- cgit v1.2.3-71-gd317 From 14dd1ff0f9e75dd4ae2f1ff8e48becb76d14f4ab Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 2 Apr 2009 16:56:58 -0700 Subject: memory_accessor: implement the new memory_accessor interfaces for SPI EEPROMs - Define new setup() hook to export the accessor - Implement accessor methods Moves some error checking out of the sysfs interface code into the layer below it, which is now shared by both sysfs and memory access code. 
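The SPI wiring is analogous: a board passes the new setup()/context members through its spi_eeprom platform data, roughly as below (hypothetical chip, sizes and names):

#include <linux/init.h>
#include <linux/spi/spi.h>
#include <linux/spi/eeprom.h>

static void board_at25_setup(struct memory_accessor *mem, void *context)
{
        /* stash 'mem' or read board data here, as in the at24 case */
}

static struct spi_eeprom board_at25 = {
        .byte_len       = 32768,        /* e.g. an AT25256 (256 Kbit) */
        .name           = "at25256",
        .page_size      = 64,
        .flags          = EE_ADDR2,     /* 16 bit addressing */
        .setup          = board_at25_setup,
};

static struct spi_board_info board_spi_devices[] __initdata = {
        {
                .modalias       = "at25",
                .platform_data  = &board_at25,
                .max_speed_hz   = 1000000,
                .bus_num        = 0,
                .chip_select    = 0,
        },
};

spi_register_board_info() then registers the table from machine init, just as for any other SPI slave.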
Signed-off-by: David Brownell Signed-off-by: Kevin Hilman Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/eeprom/at25.c | 58 +++++++++++++++++++++++++++++++++++----------- include/linux/spi/eeprom.h | 6 +++++ 2 files changed, 50 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index 290dbe99647a..6bc0dac5c1e8 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -30,6 +30,7 @@ struct at25_data { struct spi_device *spi; + struct memory_accessor mem; struct mutex lock; struct spi_eeprom chip; struct bin_attribute bin; @@ -75,6 +76,13 @@ at25_ee_read( struct spi_transfer t[2]; struct spi_message m; + if (unlikely(offset >= at25->bin.size)) + return 0; + if ((offset + count) > at25->bin.size) + count = at25->bin.size - offset; + if (unlikely(!count)) + return count; + cp = command; *cp++ = AT25_READ; @@ -127,13 +135,6 @@ at25_bin_read(struct kobject *kobj, struct bin_attribute *bin_attr, dev = container_of(kobj, struct device, kobj); at25 = dev_get_drvdata(dev); - if (unlikely(off >= at25->bin.size)) - return 0; - if ((off + count) > at25->bin.size) - count = at25->bin.size - off; - if (unlikely(!count)) - return count; - return at25_ee_read(at25, buf, off, count); } @@ -146,6 +147,13 @@ at25_ee_write(struct at25_data *at25, char *buf, loff_t off, size_t count) unsigned buf_size; u8 *bounce; + if (unlikely(off >= at25->bin.size)) + return -EFBIG; + if ((off + count) > at25->bin.size) + count = at25->bin.size - off; + if (unlikely(!count)) + return count; + /* Temp buffer starts with command and address */ buf_size = at25->chip.page_size; if (buf_size > io_limit) @@ -253,18 +261,31 @@ at25_bin_write(struct kobject *kobj, struct bin_attribute *bin_attr, dev = container_of(kobj, struct device, kobj); at25 = dev_get_drvdata(dev); - if (unlikely(off >= at25->bin.size)) - return -EFBIG; - if ((off + count) > at25->bin.size) - count = at25->bin.size - off; - if (unlikely(!count)) - return count; - return at25_ee_write(at25, buf, off, count); } /*-------------------------------------------------------------------------*/ +/* Let in-kernel code access the eeprom data. */ + +static ssize_t at25_mem_read(struct memory_accessor *mem, char *buf, + off_t offset, size_t count) +{ + struct at25_data *at25 = container_of(mem, struct at25_data, mem); + + return at25_ee_read(at25, buf, offset, count); +} + +static ssize_t at25_mem_write(struct memory_accessor *mem, char *buf, + off_t offset, size_t count) +{ + struct at25_data *at25 = container_of(mem, struct at25_data, mem); + + return at25_ee_write(at25, buf, offset, count); +} + +/*-------------------------------------------------------------------------*/ + static int at25_probe(struct spi_device *spi) { struct at25_data *at25 = NULL; @@ -317,6 +338,10 @@ static int at25_probe(struct spi_device *spi) at25->addrlen = addrlen; /* Export the EEPROM bytes through sysfs, since that's convenient. + * And maybe to other kernel code; it might hold a board's Ethernet + * address, or board-specific calibration data generated on the + * manufacturing floor. + * * Default to root-only access to the data; EEPROMs often hold data * that's sensitive for read and/or write, like ethernet addresses, * security codes, board-specific manufacturing calibrations, etc. 
@@ -324,17 +349,22 @@ static int at25_probe(struct spi_device *spi) at25->bin.attr.name = "eeprom"; at25->bin.attr.mode = S_IRUSR; at25->bin.read = at25_bin_read; + at25->mem.read = at25_mem_read; at25->bin.size = at25->chip.byte_len; if (!(chip->flags & EE_READONLY)) { at25->bin.write = at25_bin_write; at25->bin.attr.mode |= S_IWUSR; + at25->mem.write = at25_mem_write; } err = sysfs_create_bin_file(&spi->dev.kobj, &at25->bin); if (err) goto fail; + if (chip->setup) + chip->setup(&at25->mem, chip->context); + dev_info(&spi->dev, "%Zd %s %s eeprom%s, pagesize %u\n", (at25->bin.size < 1024) ? at25->bin.size diff --git a/include/linux/spi/eeprom.h b/include/linux/spi/eeprom.h index 1085212c446e..306e7b1c69ed 100644 --- a/include/linux/spi/eeprom.h +++ b/include/linux/spi/eeprom.h @@ -1,6 +1,8 @@ #ifndef __LINUX_SPI_EEPROM_H #define __LINUX_SPI_EEPROM_H +#include + /* * Put one of these structures in platform_data for SPI EEPROMS handled * by the "at25" driver. On SPI, most EEPROMS understand the same core @@ -17,6 +19,10 @@ struct spi_eeprom { #define EE_ADDR2 0x0002 /* 16 bit addrs */ #define EE_ADDR3 0x0004 /* 24 bit addrs */ #define EE_READONLY 0x0008 /* disallow writes */ + + /* for exporting this chip's data to other kernel code */ + void (*setup)(struct memory_accessor *mem, void *context); + void *context; }; #endif /* __LINUX_SPI_EEPROM_H */ -- cgit v1.2.3-71-gd317 From 6f2c55b843836d26528c56a0968689accaedbc67 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 2 Apr 2009 16:56:59 -0700 Subject: Simplify copy_thread() First argument unused since 2.3.11. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 2 +- arch/arm/kernel/process.c | 2 +- arch/avr32/kernel/process.c | 2 +- arch/blackfin/kernel/process.c | 2 +- arch/cris/arch-v10/kernel/process.c | 2 +- arch/cris/arch-v32/kernel/process.c | 2 +- arch/frv/kernel/process.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/m32r/kernel/process.c | 2 +- arch/m68k/kernel/process.c | 2 +- arch/m68knommu/kernel/process.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/mn10300/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/s390/kernel/process.c | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sh/kernel/process_64.c | 2 +- arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- arch/um/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/xtensa/kernel/process.c | 2 +- include/linux/sched.h | 3 ++- kernel/fork.c | 2 +- 27 files changed, 28 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 8d0097f10208..3a2fb7a02db4 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -272,7 +272,7 @@ alpha_vfork(struct pt_regs *regs) */ int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 2de14e2afdc5..c3265a2e7cd4 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -301,7 +301,7 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int -copy_thread(int nr, unsigned long 
clone_flags, unsigned long stack_start, +copy_thread(unsigned long clone_flags, unsigned long stack_start, unsigned long stk_sz, struct task_struct *p, struct pt_regs *regs) { struct thread_info *thread = task_thread_info(p); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 43ae555ecb33..1bbe1da54869 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -332,7 +332,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 33e2e8993f7f..f49427293ca1 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -193,7 +193,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index bd9b3ff63f6c..c4c69cf721e5 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -115,7 +115,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) */ asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index ced5b725d9bd..120e7f796fea 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -131,7 +131,7 @@ kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) extern asmlinkage void ret_from_fork(void); int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 9583a338e9d6..0de50df74970 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -204,7 +204,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * set up the kernel stack and exception frames for a new process */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index a8ef654a5a0b..e2f33d0f9969 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -191,7 +191,7 @@ asmlinkage int h8300_clone(struct pt_regs *regs) } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index c57162705147..5d7c0e5b9e76 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -413,7 +413,7 @@ ia64_load_extra (struct task_struct *task) * so there is nothing to worry about. 
*/ int -copy_thread (int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long user_stack_base, unsigned long user_stack_size, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 7103d91e1a2f..3e876f0baebc 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -225,7 +225,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) return 0; /* Task didn't use the fpu at all. */ } -int copy_thread(int nr, unsigned long clone_flags, unsigned long spu, +int copy_thread(unsigned long clone_flags, unsigned long spu, unsigned long unused, struct task_struct *tsk, struct pt_regs *regs) { struct pt_regs *childregs = task_pt_regs(tsk); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 632ce016014d..ec37fb56c127 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -233,7 +233,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) parent_tidptr, child_tidptr); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 3f2d7745f31e..1e96c6eb6312 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -199,7 +199,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, NULL, NULL); } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index ca2e4026ad20..1eaaa450e20c 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -99,7 +99,7 @@ void flush_thread(void) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { struct thread_info *ti = task_thread_info(p); diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index b28c9a60445b..234cf344cdce 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -193,7 +193,7 @@ void prepare_to_copy(struct task_struct *tsk) * set up the kernel stack for a new thread and copy arch-specific thread * control information */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long c_usp, unsigned long ustk_size, struct task_struct *p, struct pt_regs *kregs) { diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index b80e02a4d81d..8aa591ed9127 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -263,7 +263,7 @@ sys_vfork(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, /* in ia64 this is "user_stack_size" */ struct task_struct * p, struct pt_regs * pregs) { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index eac064948780..7b44a33f03c2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -598,7 +598,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * Copy a thread.. 
*/ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index b48e961a38f6..a3acd8e60aff 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -160,7 +160,7 @@ void release_thread(struct task_struct *dead_task) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp, +int copy_thread(unsigned long clone_flags, unsigned long new_stackp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index ddafbbbab2ab..694bc15f84fd 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -170,7 +170,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index c90c7e5e5fee..96be839040f8 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -425,7 +425,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index f4bee35a1b46..2830b415e214 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -455,7 +455,7 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags, */ extern void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index a73954b87f0a..4041f94e7724 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -561,7 +561,7 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, * Parent --> %o0 == childs pid, %o1 == 0 * Child --> %o0 == parents pid, %o1 == 1 */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index a1c6d07cac3e..4a28a1568d85 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -179,7 +179,7 @@ void fork_handler(void) userspace(¤t->thread.regs.regs); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long stack_top, struct task_struct * p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 14014d766cad..76f8f84043a2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -245,7 +245,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, 
unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index abb7e6a7f0c6..b751a41392b1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -278,7 +278,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 9185597eb6a0..031f36685710 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -172,7 +172,7 @@ void prepare_to_copy(struct task_struct *tsk) * childregs. */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 481fad3a9b42..9186f8c5d5f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1975,7 +1975,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ extern struct mm_struct *dup_mm(struct task_struct *tsk); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(unsigned long, unsigned long, unsigned long, + struct task_struct *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index 51d1aa21483b..d7eb727eb535 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1125,7 +1125,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; - retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); + retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3-71-gd317 From 96615841e170f0108832e64a90d51b469573a472 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Apr 2009 16:57:01 -0700 Subject: rtc-v3020: add ability to access v3020 chip with GPIOs The v3020 RTC can be connected to GPIOs as well as to memory-like interface. Add ability to use GPIO bit-bang for v3020 read-write access. 
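For illustration, board code selecting the new GPIO mode would look roughly like the sketch below. The GPIO numbers are invented and the platform device name is assumed to match the driver's; memory-mapped users keep passing an MMIO resource and leftshift as before.

#include <linux/platform_device.h>
#include <linux/rtc-v3020.h>

static struct v3020_platform_data board_v3020_pdata = {
        .use_gpio       = 1,
        .gpio_cs        = 10,
        .gpio_wr        = 11,
        .gpio_rd        = 12,
        .gpio_io        = 13,
};

static struct platform_device board_v3020_device = {
        .name           = "v3020",      /* assumed driver name */
        .id             = -1,
        .dev            = {
                .platform_data  = &board_v3020_pdata,
        },
};

platform_device_register(&board_v3020_device) would then be called from the machine init code.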
[akpm@linux-foundation.org: fix off-by-one in error path] Signed-off-by: Mike Rapoport Acked-by: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-v3020.c | 190 +++++++++++++++++++++++++++++++++++++++++----- include/linux/rtc-v3020.h | 6 ++ 2 files changed, 176 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index 66955cc9c746..ad164056feb6 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -27,17 +27,162 @@ #include #include #include +#include #include #undef DEBUG +struct v3020; + +struct v3020_chip_ops { + int (*map_io)(struct v3020 *chip, struct platform_device *pdev, + struct v3020_platform_data *pdata); + void (*unmap_io)(struct v3020 *chip); + unsigned char (*read_bit)(struct v3020 *chip); + void (*write_bit)(struct v3020 *chip, unsigned char bit); +}; + +#define V3020_CS 0 +#define V3020_WR 1 +#define V3020_RD 2 +#define V3020_IO 3 + +struct v3020_gpio { + const char *name; + unsigned int gpio; +}; + struct v3020 { + /* MMIO access */ void __iomem *ioaddress; int leftshift; + + /* GPIO access */ + struct v3020_gpio *gpio; + + struct v3020_chip_ops *ops; + struct rtc_device *rtc; }; + +static int v3020_mmio_map(struct v3020 *chip, struct platform_device *pdev, + struct v3020_platform_data *pdata) +{ + if (pdev->num_resources != 1) + return -EBUSY; + + if (pdev->resource[0].flags != IORESOURCE_MEM) + return -EBUSY; + + chip->leftshift = pdata->leftshift; + chip->ioaddress = ioremap(pdev->resource[0].start, 1); + if (chip->ioaddress == NULL) + return -EBUSY; + + return 0; +} + +static void v3020_mmio_unmap(struct v3020 *chip) +{ + iounmap(chip->ioaddress); +} + +static void v3020_mmio_write_bit(struct v3020 *chip, unsigned char bit) +{ + writel(bit << chip->leftshift, chip->ioaddress); +} + +static unsigned char v3020_mmio_read_bit(struct v3020 *chip) +{ + return readl(chip->ioaddress) & (1 << chip->leftshift); +} + +static struct v3020_chip_ops v3020_mmio_ops = { + .map_io = v3020_mmio_map, + .unmap_io = v3020_mmio_unmap, + .read_bit = v3020_mmio_read_bit, + .write_bit = v3020_mmio_write_bit, +}; + +static struct v3020_gpio v3020_gpio[] = { + { "RTC CS", 0 }, + { "RTC WR", 0 }, + { "RTC RD", 0 }, + { "RTC IO", 0 }, +}; + +static int v3020_gpio_map(struct v3020 *chip, struct platform_device *pdev, + struct v3020_platform_data *pdata) +{ + int i, err; + + v3020_gpio[V3020_CS].gpio = pdata->gpio_cs; + v3020_gpio[V3020_WR].gpio = pdata->gpio_wr; + v3020_gpio[V3020_RD].gpio = pdata->gpio_rd; + v3020_gpio[V3020_IO].gpio = pdata->gpio_io; + + for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++) { + err = gpio_request(v3020_gpio[i].gpio, v3020_gpio[i].name); + if (err) + goto err_request; + + gpio_direction_output(v3020_gpio[i].gpio, 1); + } + + chip->gpio = v3020_gpio; + + return 0; + +err_request: + while (--i >= 0) + gpio_free(v3020_gpio[i].gpio); + + return err; +} + +static void v3020_gpio_unmap(struct v3020 *chip) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++) + gpio_free(v3020_gpio[i].gpio); +} + +static void v3020_gpio_write_bit(struct v3020 *chip, unsigned char bit) +{ + gpio_direction_output(chip->gpio[V3020_IO].gpio, bit); + gpio_set_value(chip->gpio[V3020_CS].gpio, 0); + gpio_set_value(chip->gpio[V3020_WR].gpio, 0); + udelay(1); + gpio_set_value(chip->gpio[V3020_WR].gpio, 1); + gpio_set_value(chip->gpio[V3020_CS].gpio, 1); +} + +static unsigned char v3020_gpio_read_bit(struct v3020 *chip) +{ + int bit; + + 
gpio_direction_input(chip->gpio[V3020_IO].gpio); + gpio_set_value(chip->gpio[V3020_CS].gpio, 0); + gpio_set_value(chip->gpio[V3020_RD].gpio, 0); + udelay(1); + bit = !!gpio_get_value(chip->gpio[V3020_IO].gpio); + udelay(1); + gpio_set_value(chip->gpio[V3020_RD].gpio, 1); + gpio_set_value(chip->gpio[V3020_CS].gpio, 1); + + return bit; +} + +static struct v3020_chip_ops v3020_gpio_ops = { + .map_io = v3020_gpio_map, + .unmap_io = v3020_gpio_unmap, + .read_bit = v3020_gpio_read_bit, + .write_bit = v3020_gpio_write_bit, +}; + static void v3020_set_reg(struct v3020 *chip, unsigned char address, unsigned char data) { @@ -46,7 +191,7 @@ static void v3020_set_reg(struct v3020 *chip, unsigned char address, tmp = address; for (i = 0; i < 4; i++) { - writel((tmp & 1) << chip->leftshift, chip->ioaddress); + chip->ops->write_bit(chip, (tmp & 1)); tmp >>= 1; udelay(1); } @@ -54,7 +199,7 @@ static void v3020_set_reg(struct v3020 *chip, unsigned char address, /* Commands dont have data */ if (!V3020_IS_COMMAND(address)) { for (i = 0; i < 8; i++) { - writel((data & 1) << chip->leftshift, chip->ioaddress); + chip->ops->write_bit(chip, (data & 1)); data >>= 1; udelay(1); } @@ -67,14 +212,14 @@ static unsigned char v3020_get_reg(struct v3020 *chip, unsigned char address) int i; for (i = 0; i < 4; i++) { - writel((address & 1) << chip->leftshift, chip->ioaddress); + chip->ops->write_bit(chip, (address & 1)); address >>= 1; udelay(1); } for (i = 0; i < 8; i++) { data >>= 1; - if (readl(chip->ioaddress) & (1 << chip->leftshift)) + if (chip->ops->read_bit(chip)) data |= 0x80; udelay(1); } @@ -164,25 +309,23 @@ static int rtc_probe(struct platform_device *pdev) int i; int temp; - if (pdev->num_resources != 1) - return -EBUSY; - - if (pdev->resource[0].flags != IORESOURCE_MEM) - return -EBUSY; - chip = kzalloc(sizeof *chip, GFP_KERNEL); if (!chip) return -ENOMEM; - chip->leftshift = pdata->leftshift; - chip->ioaddress = ioremap(pdev->resource[0].start, 1); - if (chip->ioaddress == NULL) + if (pdata->use_gpio) + chip->ops = &v3020_gpio_ops; + else + chip->ops = &v3020_mmio_ops; + + retval = chip->ops->map_io(chip, pdev, pdata); + if (retval) goto err_chip; /* Make sure the v3020 expects a communication cycle * by reading 8 times */ for (i = 0; i < 8; i++) - temp = readl(chip->ioaddress); + temp = chip->ops->read_bit(chip); /* Test chip by doing a write/read sequence * to the chip ram */ @@ -196,10 +339,17 @@ static int rtc_probe(struct platform_device *pdev) * are all disabled */ v3020_set_reg(chip, V3020_STATUS_0, 0x0); - dev_info(&pdev->dev, "Chip available at physical address 0x%llx," - "data connected to D%d\n", - (unsigned long long)pdev->resource[0].start, - chip->leftshift); + if (pdata->use_gpio) + dev_info(&pdev->dev, "Chip available at GPIOs " + "%d, %d, %d, %d\n", + chip->gpio[V3020_CS].gpio, chip->gpio[V3020_WR].gpio, + chip->gpio[V3020_RD].gpio, chip->gpio[V3020_IO].gpio); + else + dev_info(&pdev->dev, "Chip available at " + "physical address 0x%llx," + "data connected to D%d\n", + (unsigned long long)pdev->resource[0].start, + chip->leftshift); platform_set_drvdata(pdev, chip); @@ -214,7 +364,7 @@ static int rtc_probe(struct platform_device *pdev) return 0; err_io: - iounmap(chip->ioaddress); + chip->ops->unmap_io(chip); err_chip: kfree(chip); @@ -229,7 +379,7 @@ static int rtc_remove(struct platform_device *dev) if (rtc) rtc_device_unregister(rtc); - iounmap(chip->ioaddress); + chip->ops->unmap_io(chip); kfree(chip); return 0; diff --git a/include/linux/rtc-v3020.h b/include/linux/rtc-v3020.h index 
bf74e63c98fe..8ba646e610d9 100644 --- a/include/linux/rtc-v3020.h +++ b/include/linux/rtc-v3020.h @@ -14,6 +14,12 @@ * is used depends on the board. */ struct v3020_platform_data { int leftshift; /* (1<<(leftshift)) & readl() */ + + int use_gpio:1; + unsigned int gpio_cs; + unsigned int gpio_wr; + unsigned int gpio_rd; + unsigned int gpio_io; }; #define V3020_STATUS_0 0x00 -- cgit v1.2.3-71-gd317 From bfb9bcdbda9a61bca469bf899a589918c60c4c18 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Thu, 2 Apr 2009 16:57:07 -0700 Subject: spi-gpio: allow operation without CS signal Change spi-gpio so that it is possible to drive SPI communications over GPIO without the need for a chipselect signal. This is useful in very small setups where there's only one slave device on the bus. This patch does not affect existing setups. I use this for a tiny communication channel between an embedded device and a microcontroller. There are not enough GPIOs available for chipselect and it's not needed anyway in this case. Signed-off-by: Michael Buesch Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/spi_gpio.c | 21 +++++++++++++-------- include/linux/spi/spi_gpio.h | 6 ++++++ 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi_gpio.c b/drivers/spi/spi_gpio.c index d2866c293dee..26bd03e61855 100644 --- a/drivers/spi/spi_gpio.c +++ b/drivers/spi/spi_gpio.c @@ -178,8 +178,10 @@ static void spi_gpio_chipselect(struct spi_device *spi, int is_active) if (is_active) setsck(spi, spi->mode & SPI_CPOL); - /* SPI is normally active-low */ - gpio_set_value(cs, (spi->mode & SPI_CS_HIGH) ? is_active : !is_active); + if (cs != SPI_GPIO_NO_CHIPSELECT) { + /* SPI is normally active-low */ + gpio_set_value(cs, (spi->mode & SPI_CS_HIGH) ? is_active : !is_active); + } } static int spi_gpio_setup(struct spi_device *spi) @@ -191,15 +193,17 @@ static int spi_gpio_setup(struct spi_device *spi) return -EINVAL; if (!spi->controller_state) { - status = gpio_request(cs, dev_name(&spi->dev)); - if (status) - return status; - status = gpio_direction_output(cs, spi->mode & SPI_CS_HIGH); + if (cs != SPI_GPIO_NO_CHIPSELECT) { + status = gpio_request(cs, dev_name(&spi->dev)); + if (status) + return status; + status = gpio_direction_output(cs, spi->mode & SPI_CS_HIGH); + } } if (!status) status = spi_bitbang_setup(spi); if (status) { - if (!spi->controller_state) + if (!spi->controller_state && cs != SPI_GPIO_NO_CHIPSELECT) gpio_free(cs); } return status; @@ -209,7 +213,8 @@ static void spi_gpio_cleanup(struct spi_device *spi) { unsigned long cs = (unsigned long) spi->controller_data; - gpio_free(cs); + if (cs != SPI_GPIO_NO_CHIPSELECT) + gpio_free(cs); spi_bitbang_cleanup(spi); } diff --git a/include/linux/spi/spi_gpio.h b/include/linux/spi/spi_gpio.h index 0f01a0f1f40c..ca6782ee4b9f 100644 --- a/include/linux/spi/spi_gpio.h +++ b/include/linux/spi/spi_gpio.h @@ -25,10 +25,16 @@ * ... * }; * + * If chipselect is not used (there's only one device on the bus), assign + * SPI_GPIO_NO_CHIPSELECT to the controller_data: + * .controller_data = (void *) SPI_GPIO_NO_CHIPSELECT; + * * If the bitbanged bus is later switched to a "native" controller, * that platform_device and controller_data should be removed. 
*/ +#define SPI_GPIO_NO_CHIPSELECT ((unsigned long)-1l) + /** * struct spi_gpio_platform_data - parameter for bitbanged SPI master * @sck: number of the GPIO used for clock output -- cgit v1.2.3-71-gd317 From 039fd8ce6258e01ec29f1637f9bf1868dd877c55 Mon Sep 17 00:00:00 2001 From: Cyrus Massoumi Date: Thu, 2 Apr 2009 16:57:12 -0700 Subject: ext3: remove the BKL in ext3/ioctl.c Reformat ext3/ioctl.c to make it look more like ext4/ioctl.c and remove the BKL around ext3_ioctl(). Signed-off-by: Cyrus Massoumi Cc: Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/dir.c | 2 +- fs/ext3/file.c | 2 +- fs/ext3/ioctl.c | 59 +++++++++++++++++-------------------------------- include/linux/ext3_fs.h | 5 ++--- 4 files changed, 24 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 5853f4440af4..3d724a95882f 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ext3_readdir, /* we take BKL. needed?*/ - .ioctl = ext3_ioctl, /* BKL held */ + .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, #endif diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 3be1e0689c9a..521f8238b2fa 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -112,7 +112,7 @@ const struct file_operations ext3_file_operations = { .write = do_sync_write, .aio_read = generic_file_aio_read, .aio_write = ext3_file_write, - .ioctl = ext3_ioctl, + .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, #endif diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 5e86ce9a86e0..88974814783a 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -15,12 +15,11 @@ #include #include #include -#include #include -int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, - unsigned long arg) +long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; struct ext3_inode_info *ei = EXT3_I(inode); unsigned int flags; unsigned short rsv_window_size; @@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned int oldflags; unsigned int jflag; + if (!is_owner_or_cap(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); if (err) return err; - if (!is_owner_or_cap(inode)) { - err = -EACCES; - goto flags_out; - } - - if (get_user(flags, (int __user *) arg)) { - err = -EFAULT; - goto flags_out; - } - flags = ext3_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + err = -EPERM; + if (IS_NOQUOTA(inode)) goto flags_out; - } + oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + if (!capable(CAP_LINUX_IMMUTABLE)) goto flags_out; - } } /* @@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. 
*/ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { - mutex_unlock(&inode->i_mutex); - err = -EPERM; + if (!capable(CAP_SYS_RESOURCE)) goto flags_out; - } } - handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { - mutex_unlock(&inode->i_mutex); err = PTR_ERR(handle); goto flags_out; } @@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext3_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext3_journal_stop(handle); - if (err) { - mutex_unlock(&inode->i_mutex); - return err; - } + if (err) + goto flags_out; if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); - mutex_unlock(&inode->i_mutex); flags_out: + mutex_unlock(&inode->i_mutex); mnt_drop_write(filp->f_path.mnt); return err; } @@ -140,6 +125,7 @@ flags_out: if (!is_owner_or_cap(inode)) return -EPERM; + err = mnt_want_write(filp->f_path.mnt); if (err) return err; @@ -147,6 +133,7 @@ flags_out: err = -EFAULT; goto setversion_out; } + handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -299,9 +286,6 @@ group_add_out: #ifdef CONFIG_COMPAT long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT3_IOC32_GETFLAGS: @@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) default: return -ENOIOCTLCMD; } - lock_kernel(); - ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); - return ret; + return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index dd495b8c3091..e263acaa405b 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -893,9 +893,8 @@ extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); /* ioctl.c */ -extern int ext3_ioctl (struct inode *, struct file *, unsigned int, - unsigned long); -extern long ext3_compat_ioctl (struct file *, unsigned int, unsigned long); +extern long ext3_ioctl(struct file *, unsigned int, unsigned long); +extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long); /* namei.c */ extern int ext3_orphan_add(handle_t *, struct inode *); -- cgit v1.2.3-71-gd317 From d20a390a0ee2bf2f692c539c6ce1c829e1080bb5 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 2 Apr 2009 16:57:22 -0700 Subject: cgroups: fix cgroup.h comments Fix the style of some multi-line comments in cgroup.h to match Documentation/CodingStyle Signed-off-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 499900d0cee7..bb8feb9feccd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -47,14 +47,18 @@ enum cgroup_subsys_id { /* Per-subsystem/per-cgroup state maintained by the system. */ struct cgroup_subsys_state { - /* The cgroup that this subsystem is attached to. Useful + /* + * The cgroup that this subsystem is attached to. 
Useful * for subsystems that want to know about the cgroup - * hierarchy structure */ + * hierarchy structure + */ struct cgroup *cgroup; - /* State maintained by the cgroup system to allow subsystems + /* + * State maintained by the cgroup system to allow subsystems * to be "busy". Should be accessed via css_get(), - * css_tryget() and and css_put(). */ + * css_tryget() and and css_put(). + */ atomic_t refcnt; @@ -120,8 +124,10 @@ static inline void css_put(struct cgroup_subsys_state *css) enum { /* Control Group is dead */ CGRP_REMOVED, - /* Control Group has previously had a child cgroup or a task, - * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ + /* + * Control Group has previously had a child cgroup or a task, + * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) + */ CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, @@ -130,9 +136,10 @@ enum { struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - /* count users of this cgroup. >0 means busy, but doesn't - * necessarily indicate the number of tasks in the - * cgroup */ + /* + * count users of this cgroup. >0 means busy, but doesn't + * necessarily indicate the number of tasks in the cgroup + */ atomic_t count; /* @@ -142,7 +149,7 @@ struct cgroup { struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct cgroup *parent; /* my parent */ + struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ @@ -177,11 +184,12 @@ struct cgroup { struct rcu_head rcu_head; }; -/* A css_set is a structure holding pointers to a set of +/* + * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct * object and speeds up fork()/exit(), since a single inc/dec and a - * list_add()/del() can bump the reference count on the entire - * cgroup set for a task. + * list_add()/del() can bump the reference count on the entire cgroup + * set for a task. */ struct css_set { @@ -226,13 +234,8 @@ struct cgroup_map_cb { void *state; }; -/* struct cftype: - * - * The files in the cgroup filesystem mostly have a very simple read/write - * handling, some common function will take care of it. Nevertheless some cases - * (read tasks) are special and therefore I define this structure for every - * kind of file. - * +/* + * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: * - the cgroup to use is file->f_dentry->d_parent->d_fsdata @@ -241,8 +244,10 @@ struct cgroup_map_cb { #define MAX_CFTYPE_NAME 64 struct cftype { - /* By convention, the name should begin with the name of the - * subsystem, followed by a period */ + /* + * By convention, the name should begin with the name of the + * subsystem, followed by a period + */ char name[MAX_CFTYPE_NAME]; int private; @@ -321,13 +326,17 @@ struct cgroup_scanner { struct ptr_heap *heap; }; -/* Add a new file to the given cgroup directory. Should only be - * called by subsystems from within a populate() method */ +/* + * Add a new file to the given cgroup directory. Should only be + * called by subsystems from within a populate() method + */ int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype *cft); -/* Add a set of new files to the given cgroup directory. 
Should - * only be called by subsystems from within a populate() method */ +/* + * Add a set of new files to the given cgroup directory. Should + * only be called by subsystems from within a populate() method + */ int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype cft[], @@ -419,7 +428,8 @@ struct cgroup_iter { struct list_head *task; }; -/* To iterate across the tasks in a cgroup: +/* + * To iterate across the tasks in a cgroup: * * 1) call cgroup_iter_start to intialize an iterator * @@ -428,9 +438,10 @@ struct cgroup_iter { * * 3) call cgroup_iter_end() to destroy the iterator. * - * Or, call cgroup_scan_tasks() to iterate through every task in a cpuset. - * - cgroup_scan_tasks() holds the css_set_lock when calling the test_task() - * callback, but not while calling the process_task() callback. + * Or, call cgroup_scan_tasks() to iterate through every task in a + * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling + * the test_task() callback, but not while calling the process_task() + * callback. */ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); struct task_struct *cgroup_iter_next(struct cgroup *cgrp, -- cgit v1.2.3-71-gd317 From 313e924c0852943e67335fad9d2608701f0dfe8e Mon Sep 17 00:00:00 2001 From: Grzegorz Nosek Date: Thu, 2 Apr 2009 16:57:23 -0700 Subject: cgroups: relax ns_can_attach checks to allow attaching to grandchild cgroups The ns_proxy cgroup allows moving processes to child cgroups only one level deep at a time. This commit relaxes this restriction and makes it possible to attach tasks directly to grandchild cgroups, e.g.: ($pid is in the root cgroup) echo $pid > /cgroup/CG1/CG2/tasks Previously this operation would fail with -EPERM and would have to be performed as two steps: echo $pid > /cgroup/CG1/tasks echo $pid > /cgroup/CG1/CG2/tasks Also, the target cgroup no longer needs to be empty to move a task there. Signed-off-by: Grzegorz Nosek Acked-by: Serge Hallyn Reviewed-by: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 11 ++++++----- kernel/ns_cgroup.c | 14 ++++---------- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bb8feb9feccd..788c4964c142 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -348,8 +348,8 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); -/* Return true if the cgroup is a descendant of the current cgroup */ -int cgroup_is_descendant(const struct cgroup *cgrp); +/* Return true if cgrp is a descendant of the task's cgroup */ +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); /* Control Group subsystem type. See Documentation/cgroups.txt for details */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c500ca7239b2..27792bcb0758 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3084,18 +3084,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, } /** - * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp + * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp * @cgrp: the cgroup in question + * @task: the task in question * - * See if @cgrp is a descendant of the current task's cgroup in - * the appropriate hierarchy. + * See if @cgrp is a descendant of @task's cgroup in the appropriate + * hierarchy. 
* * If we are sending in dummytop, then presumably we are creating * the top cgroup in the subsystem. * * Called only by the ns (nsproxy) cgroup. */ -int cgroup_is_descendant(const struct cgroup *cgrp) +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; @@ -3105,7 +3106,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp) return 1; get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(current, subsys_id); + target = task_cgroup(task, subsys_id); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 78bc3fdac0d2..5aa854f9e5ae 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) /* * Rules: - * 1. you can only enter a cgroup which is a child of your current + * 1. you can only enter a cgroup which is a descendant of your current * cgroup * 2. you can only place another process into a cgroup if * a. you have CAP_SYS_ADMIN @@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, struct task_struct *task) { - struct cgroup *orig; - if (current != task) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!cgroup_is_descendant(new_cgroup)) + if (!cgroup_is_descendant(new_cgroup, current)) return -EPERM; } - if (atomic_read(&new_cgroup->count) != 0) - return -EPERM; - - orig = task_cgroup(task, ns_subsys_id); - if (orig && orig != new_cgroup->parent) + if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; return 0; @@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (!cgroup_is_descendant(cgroup)) + if (!cgroup_is_descendant(cgroup, current)) return ERR_PTR(-EPERM); ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); -- cgit v1.2.3-71-gd317 From 38460b48d06440de46b34cb778bd6c4855030754 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:25 -0700 Subject: cgroup: CSS ID support Patch for Per-CSS(Cgroup Subsys State) ID and private hierarchy code. This patch attaches unique ID to each css and provides following. - css_lookup(subsys, id) returns pointer to struct cgroup_subysys_state of id. - css_get_next(subsys, id, rootid, depth, foundid) returns the next css under "root" by scanning When cgroup_subsys->use_id is set, an id for css is maintained. The cgroup framework only parepares - css_id of root css for subsys - id is automatically attached at creation of css. - id is *not* freed automatically. Because the cgroup framework don't know lifetime of cgroup_subsys_state. free_css_id() function is provided. This must be called by subsys. There are several reasons to develop this. - Saving space .... For example, memcg's swap_cgroup is array of pointers to cgroup. But it is not necessary to be very fast. By replacing pointers(8bytes per ent) to ID (2byes per ent), we can reduce much amount of memory usage. - Scanning without lock. CSS_ID provides "scan id under this ROOT" function. By this, scanning css under root can be written without locks. ex) do { rcu_read_lock(); next = cgroup_get_next(subsys, id, root, &found); /* check sanity of next here */ css_tryget(); rcu_read_unlock(); id = found + 1 } while(...) Characteristics: - Each css has unique ID under subsys. - Lifetime of ID is controlled by subsys. 
- css ID contains "ID" and "Depth in hierarchy" and stack of hierarchy - Allowed ID is 1-65535, ID 0 is UNUSED ID. Design Choices: - scan-by-ID v.s. scan-by-tree-walk. As /proc's pid scan does, scan-by-ID is robust when scanning is done by following kind of routine. scan -> rest a while(release a lock) -> conitunue from interrupted memcg's hierarchical reclaim does this. - When subsys->use_id is set, # of css in the system is limited to 65535. [bharata@linux.vnet.ibm.com: remove rcu_read_lock() from css_get_next()] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Bharata B Rao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 50 +++++++++ include/linux/idr.h | 1 + kernel/cgroup.c | 286 ++++++++++++++++++++++++++++++++++++++++++++++++- lib/idr.c | 46 ++++++++ 4 files changed, 382 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 788c4964c142..9a23bb098205 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -22,6 +23,7 @@ struct cgroupfs_root; struct cgroup_subsys; struct inode; struct cgroup; +struct css_id; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -63,6 +65,8 @@ struct cgroup_subsys_state { atomic_t refcnt; unsigned long flags; + /* ID for this css, if possible */ + struct css_id *id; }; /* bits in struct cgroup_subsys_state flags field */ @@ -373,6 +377,11 @@ struct cgroup_subsys { int active; int disabled; int early_init; + /* + * True if this subsys uses ID. ID is not available before cgroup_init() + * (not available in early_init time.) + */ + bool use_id; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; @@ -395,6 +404,9 @@ struct cgroup_subsys { */ struct cgroupfs_root *root; struct list_head sibling; + /* used when use_id == true */ + struct idr idr; + spinlock_t id_lock; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; @@ -450,6 +462,44 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); +/* + * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works + * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. + * CSS ID is assigned at cgroup allocation (create) automatically + * and removed when subsys calls free_css_id() function. This is because + * the lifetime of cgroup_subsys_state is subsys's matter. + * + * Looking up and scanning function should be called under rcu_read_lock(). + * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. + * But the css returned by this routine can be "not populated yet" or "being + * destroyed". The caller should check css and cgroup's status. + */ + +/* + * Typically Called at ->destroy(), or somewhere the subsys frees + * cgroup_subsys_state. + */ +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); + +/* Find a cgroup_subsys_state which has given ID */ + +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); + +/* + * Get a cgroup whose id is greater than or equal to id under tree of root. + * Returning a cgroup_subsys_state or NULL. 
+ */ +struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, + struct cgroup_subsys_state *root, int *foundid); + +/* Returns true if root is ancestor of cg */ +bool css_is_ancestor(struct cgroup_subsys_state *cg, + struct cgroup_subsys_state *root); + +/* Get id and depth of css */ +unsigned short css_id(struct cgroup_subsys_state *css); +unsigned short css_depth(struct cgroup_subsys_state *css); + #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } diff --git a/include/linux/idr.h b/include/linux/idr.h index dd846df8cd32..e968db71e33a 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *ptr, int *id); int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id); int idr_for_each(struct idr *idp, int (*fn)(int id, void *p, void *data), void *data); +void *idr_get_next(struct idr *idp, int *nextid); void *idr_replace(struct idr *idp, void *ptr, int id); void idr_remove(struct idr *idp, int id); void idr_remove_all(struct idr *idp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 27792bcb0758..d3c521137425 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -94,7 +94,6 @@ struct cgroupfs_root { char release_agent_path[PATH_MAX]; }; - /* * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the * subsystems that are otherwise unattached - it never has more than a @@ -102,6 +101,39 @@ struct cgroupfs_root { */ static struct cgroupfs_root rootnode; +/* + * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when + * cgroup_subsys->use_id != 0. + */ +#define CSS_ID_MAX (65535) +struct css_id { + /* + * The css to which this ID points. This pointer is set to valid value + * after cgroup is populated. If cgroup is removed, this will be NULL. + * This pointer is expected to be RCU-safe because destroy() + * is called after synchronize_rcu(). But for safe use, css_is_removed() + * css_tryget() should be used for avoiding race. + */ + struct cgroup_subsys_state *css; + /* + * ID of this css. + */ + unsigned short id; + /* + * Depth in hierarchy which this ID belongs to. + */ + unsigned short depth; + /* + * ID is freed by RCU. (and lookup routine is RCU safe.) + */ + struct rcu_head rcu_head; + /* + * Hierarchy of CSS ID belongs to. + */ + unsigned short stack[0]; /* Array of Length (depth+1) */ +}; + + /* The list of hierarchy roots */ static LIST_HEAD(roots); @@ -185,6 +217,8 @@ struct cg_cgroup_link { static struct css_set init_css_set; static struct cg_cgroup_link init_css_set_link; +static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); + /* css_set_lock protects the list of css_set objects, and the * chain of tasks off each css_set. Nests outside task->alloc_lock * due to cgroup_iter_start() */ @@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; +static int alloc_css_id(struct cgroup_subsys *ss, + struct cgroup *parent, struct cgroup *child); + static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -2327,6 +2364,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp) if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) return err; } + /* This cgroup is ready now */ + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + /* + * Update id->css pointer and make this css visible from + * CSS ID functions. 
This pointer will be dereferened + * from RCU-read-side without locks. + */ + if (css->id) + rcu_assign_pointer(css->id->css, css); + } return 0; } @@ -2338,6 +2386,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->cgroup = cgrp; atomic_set(&css->refcnt, 1); css->flags = 0; + css->id = NULL; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); BUG_ON(cgrp->subsys[ss->subsys_id]); @@ -2413,6 +2462,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_destroy; } init_cgroup_css(css, ss, cgrp); + if (ss->use_id) + if (alloc_css_id(ss, parent, cgrp)) + goto err_destroy; + /* At error, ->destroy() callback has to free assigned ID. */ } cgroup_lock_hierarchy(root); @@ -2708,6 +2761,8 @@ int __init cgroup_init(void) struct cgroup_subsys *ss = subsys[i]; if (!ss->early_init) cgroup_init_subsys(ss); + if (ss->use_id) + cgroup_subsys_init_idr(ss); } /* Add init_css_set to the hash table */ @@ -3242,3 +3297,232 @@ static int __init cgroup_disable(char *str) return 1; } __setup("cgroup_disable=", cgroup_disable); + +/* + * Functons for CSS ID. + */ + +/* + *To get ID other than 0, this should be called when !cgroup_is_removed(). + */ +unsigned short css_id(struct cgroup_subsys_state *css) +{ + struct css_id *cssid = rcu_dereference(css->id); + + if (cssid) + return cssid->id; + return 0; +} + +unsigned short css_depth(struct cgroup_subsys_state *css) +{ + struct css_id *cssid = rcu_dereference(css->id); + + if (cssid) + return cssid->depth; + return 0; +} + +bool css_is_ancestor(struct cgroup_subsys_state *child, + struct cgroup_subsys_state *root) +{ + struct css_id *child_id = rcu_dereference(child->id); + struct css_id *root_id = rcu_dereference(root->id); + + if (!child_id || !root_id || (child_id->depth < root_id->depth)) + return false; + return child_id->stack[root_id->depth] == root_id->id; +} + +static void __free_css_id_cb(struct rcu_head *head) +{ + struct css_id *id; + + id = container_of(head, struct css_id, rcu_head); + kfree(id); +} + +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) +{ + struct css_id *id = css->id; + /* When this is called before css_id initialization, id can be NULL */ + if (!id) + return; + + BUG_ON(!ss->use_id); + + rcu_assign_pointer(id->css, NULL); + rcu_assign_pointer(css->id, NULL); + spin_lock(&ss->id_lock); + idr_remove(&ss->idr, id->id); + spin_unlock(&ss->id_lock); + call_rcu(&id->rcu_head, __free_css_id_cb); +} + +/* + * This is called by init or create(). Then, calls to this function are + * always serialized (By cgroup_mutex() at create()). + */ + +static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) +{ + struct css_id *newid; + int myid, error, size; + + BUG_ON(!ss->use_id); + + size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); + newid = kzalloc(size, GFP_KERNEL); + if (!newid) + return ERR_PTR(-ENOMEM); + /* get id */ + if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { + error = -ENOMEM; + goto err_out; + } + spin_lock(&ss->id_lock); + /* Don't use 0. 
allocates an ID of 1-65535 */ + error = idr_get_new_above(&ss->idr, newid, 1, &myid); + spin_unlock(&ss->id_lock); + + /* Returns error when there are no free spaces for new ID.*/ + if (error) { + error = -ENOSPC; + goto err_out; + } + if (myid > CSS_ID_MAX) + goto remove_idr; + + newid->id = myid; + newid->depth = depth; + return newid; +remove_idr: + error = -ENOSPC; + spin_lock(&ss->id_lock); + idr_remove(&ss->idr, myid); + spin_unlock(&ss->id_lock); +err_out: + kfree(newid); + return ERR_PTR(error); + +} + +static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) +{ + struct css_id *newid; + struct cgroup_subsys_state *rootcss; + + spin_lock_init(&ss->id_lock); + idr_init(&ss->idr); + + rootcss = init_css_set.subsys[ss->subsys_id]; + newid = get_new_cssid(ss, 0); + if (IS_ERR(newid)) + return PTR_ERR(newid); + + newid->stack[0] = newid->id; + newid->css = rootcss; + rootcss->id = newid; + return 0; +} + +static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, + struct cgroup *child) +{ + int subsys_id, i, depth = 0; + struct cgroup_subsys_state *parent_css, *child_css; + struct css_id *child_id, *parent_id = NULL; + + subsys_id = ss->subsys_id; + parent_css = parent->subsys[subsys_id]; + child_css = child->subsys[subsys_id]; + depth = css_depth(parent_css) + 1; + parent_id = parent_css->id; + + child_id = get_new_cssid(ss, depth); + if (IS_ERR(child_id)) + return PTR_ERR(child_id); + + for (i = 0; i < depth; i++) + child_id->stack[i] = parent_id->stack[i]; + child_id->stack[depth] = child_id->id; + /* + * child_id->css pointer will be set after this cgroup is available + * see cgroup_populate_dir() + */ + rcu_assign_pointer(child_css->id, child_id); + + return 0; +} + +/** + * css_lookup - lookup css by id + * @ss: cgroup subsys to be looked into. + * @id: the id + * + * Returns pointer to cgroup_subsys_state if there is valid one with id. + * NULL if not. Should be called under rcu_read_lock() + */ +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) +{ + struct css_id *cssid = NULL; + + BUG_ON(!ss->use_id); + cssid = idr_find(&ss->idr, id); + + if (unlikely(!cssid)) + return NULL; + + return rcu_dereference(cssid->css); +} + +/** + * css_get_next - lookup next cgroup under specified hierarchy. + * @ss: pointer to subsystem + * @id: current position of iteration. + * @root: pointer to css. search tree under this. + * @foundid: position of found object. + * + * Search next css under the specified hierarchy of rootid. Calling under + * rcu_read_lock() is necessary. Returns NULL if it reaches the end. + */ +struct cgroup_subsys_state * +css_get_next(struct cgroup_subsys *ss, int id, + struct cgroup_subsys_state *root, int *foundid) +{ + struct cgroup_subsys_state *ret = NULL; + struct css_id *tmp; + int tmpid; + int rootid = css_id(root); + int depth = css_depth(root); + + if (!rootid) + return NULL; + + BUG_ON(!ss->use_id); + /* fill start point for scan */ + tmpid = id; + while (1) { + /* + * scan next entry from bitmap(tree), tmpid is updated after + * idr_get_next(). 
+ */ + spin_lock(&ss->id_lock); + tmp = idr_get_next(&ss->idr, &tmpid); + spin_unlock(&ss->id_lock); + + if (!tmp) + break; + if (tmp->depth >= depth && tmp->stack[depth] == rootid) { + ret = rcu_dereference(tmp->css); + if (ret) { + *foundid = tmpid; + break; + } + } + /* continue to scan from next id */ + tmpid = tmpid + 1; + } + return ret; +} + diff --git a/lib/idr.c b/lib/idr.c index dab4bca86f5d..80ca9aca038b 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -578,6 +578,52 @@ int idr_for_each(struct idr *idp, } EXPORT_SYMBOL(idr_for_each); +/** + * idr_get_next - lookup next object of id to given id. + * @idp: idr handle + * @id: pointer to lookup key + * + * Returns pointer to registered object with id, which is next number to + * given id. + */ + +void *idr_get_next(struct idr *idp, int *nextidp) +{ + struct idr_layer *p, *pa[MAX_LEVEL]; + struct idr_layer **paa = &pa[0]; + int id = *nextidp; + int n, max; + + /* find first ent */ + n = idp->layers * IDR_BITS; + max = 1 << n; + p = rcu_dereference(idp->top); + if (!p) + return NULL; + + while (id < max) { + while (n > 0 && p) { + n -= IDR_BITS; + *paa++ = p; + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]); + } + + if (p) { + *nextidp = id; + return p; + } + + id += 1 << n; + while (n < fls(id)) { + n += IDR_BITS; + p = *--paa; + } + } + return NULL; +} + + + /** * idr_replace - replace pointer for given id * @idp: idr handle -- cgit v1.2.3-71-gd317 From ec64f51545fffbc4cb968f0cea56341a4b07e85a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:26 -0700 Subject: cgroup: fix frequent -EBUSY at rmdir In following situation, with memory subsystem, /groupA use_hierarchy==1 /01 some tasks /02 some tasks /03 some tasks /04 empty When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim is triggered and the kernel walks tree under groupA. In this case, rmdir /groupA/04 fails with -EBUSY frequently because of temporal refcnt from the kernel. In general. cgroup can be rmdir'd if there are no children groups and no tasks. Frequent fails of rmdir() is not useful to users. (And the reason for -EBUSY is unknown to users.....in most cases) This patch tries to modify above behavior, by - retries if css_refcnt is got by someone. - add "return value" to pre_destroy() and allows subsystem to say "we're really busy!" Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 6 ++- include/linux/cgroup.h | 6 ++- kernel/cgroup.c | 81 ++++++++++++++++++++++++++++++++------- mm/memcontrol.c | 5 ++- 4 files changed, 79 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 93feb8444489..cdc46a501b85 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -476,11 +476,13 @@ cgroup->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); Called before checking the reference count on each subsystem. This may be useful for subsystems which have some extra references even if -there are not tasks in the cgroup. +there are not tasks in the cgroup. If pre_destroy() returns error code, +rmdir() will fail with it. 
From this behavior, pre_destroy() can be +called multiple times against a cgroup. int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *task) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9a23bb098205..7d824b80b3d7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -135,6 +135,10 @@ enum { CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, + /* + * A thread in rmdir() is wating for this cgroup. + */ + CGRP_WAIT_ON_RMDIR, }; struct cgroup { @@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); + int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d3c521137425..fc5e4a48582f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ -static void cgroup_call_pre_destroy(struct cgroup *cgrp) +static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; + int ret = 0; + for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) - ss->pre_destroy(ss, cgrp); - return; + if (ss->pre_destroy) { + ret = ss->pre_destroy(ss, cgrp); + if (ret) + break; + } + return ret; } static void free_cgroup_rcu(struct rcu_head *obj) @@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } +/* + * A queue for waiters to do rmdir() cgroup. A tasks will sleep when + * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some + * reference to css->refcnt. In general, this refcnt is expected to goes down + * to zero, soon. + * + * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + */ +DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); + +static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +{ + if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + wake_up_all(&cgroup_rmdir_waitq); +} + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); put_css_set(cg); + + /* + * wake up rmdir() waiter. the rmdir should fail since the cgroup + * is no longer empty. + */ + cgroup_wakeup_rmdir_waiters(cgrp); return 0; } @@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; + DEFINE_WAIT(wait); + int ret; /* the vfs holds both inode->i_mutex already */ - +again: mutex_lock(&cgroup_mutex); if (atomic_read(&cgrp->count) != 0) { mutex_unlock(&cgroup_mutex); @@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. 
*/ - cgroup_call_pre_destroy(cgrp); + ret = cgroup_call_pre_destroy(cgrp); + if (ret) + return ret; mutex_lock(&cgroup_mutex); parent = cgrp->parent; - - if (atomic_read(&cgrp->count) - || !list_empty(&cgrp->children) - || !cgroup_clear_css_refs(cgrp)) { + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } + /* + * css_put/get is provided for subsys to grab refcnt to css. In typical + * case, subsystem has no reference after pre_destroy(). But, under + * hierarchy management, some *temporal* refcnt can be hold. + * To avoid returning -EBUSY to a user, waitqueue is used. If subsys + * is really busy, it should return -EBUSY at pre_destroy(). wake_up + * is called when css_put() is called and refcnt goes down to 0. + */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); + + if (!cgroup_clear_css_refs(cgrp)) { + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + if (signal_pending(current)) + return -EINTR; + goto again; + } + /* NO css_tryget() can success after here. */ + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); @@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if ((atomic_dec_return(&css->refcnt) == 1) && - notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (atomic_dec_return(&css->refcnt) == 1) { + if (notify_on_release(cgrp)) { + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } + cgroup_wakeup_rmdir_waiters(cgrp); } rcu_read_unlock(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e4be9cb2a6a..8ffec674c5ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2272,11 +2272,12 @@ free_out: return ERR_PTR(-ENOMEM); } -static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, +static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - mem_cgroup_force_empty(mem, false); + + return mem_cgroup_force_empty(mem, false); } static void mem_cgroup_destroy(struct cgroup_subsys *ss, -- cgit v1.2.3-71-gd317 From 099fca3225b39f7a3ed853036038054172b55581 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:29 -0700 Subject: cgroups: show correct file mode We have some read-only files and write-only files, but currently they are all set to 0644, which is counter-intuitive and cause trouble for some cgroup tools like libcgroup. This patch adds 'mode' to struct cftype to allow cgroup subsys to set it's own files' file mode, and for the most cases cft->mode can be default to 0 and cgroup will figure out proper mode. 
Acked-by: Paul Menage Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 5 +++++ kernel/cgroup.c | 38 ++++++++++++++++++++++++++++++++++---- kernel/cpuset.c | 1 + 3 files changed, 40 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7d824b80b3d7..b2816fba5306 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -258,6 +258,11 @@ struct cftype { */ char name[MAX_CFTYPE_NAME]; int private; + /* + * If not 0, file mode is set to this value, otherwise it will + * be figured out automatically + */ + mode_t mode; /* * If non-zero, defines the maximum length of string that can diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9a6c2bfa1d9f..fea11c5c990c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1686,7 +1686,7 @@ static struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; -static int cgroup_create_file(struct dentry *dentry, int mode, +static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { static const struct dentry_operations cgroup_dops = { @@ -1732,7 +1732,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode, * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - int mode) + mode_t mode) { struct dentry *parent; int error = 0; @@ -1750,6 +1750,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, return error; } +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write hander + */ +static mode_t cgroup_file_mode(const struct cftype *cft) +{ + mode_t mode = 0; + + if (cft->mode) + return cft->mode; + + if (cft->read || cft->read_u64 || cft->read_s64 || + cft->read_map || cft->read_seq_string) + mode |= S_IRUGO; + + if (cft->write || cft->write_u64 || cft->write_s64 || + cft->write_string || cft->trigger) + mode |= S_IWUSR; + + return mode; +} + int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype *cft) @@ -1757,6 +1784,7 @@ int cgroup_add_file(struct cgroup *cgrp, struct dentry *dir = cgrp->dentry; struct dentry *dentry; int error; + mode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { @@ -1767,7 +1795,8 @@ int cgroup_add_file(struct cgroup *cgrp, BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); dentry = lookup_one_len(name, dir, strlen(name)); if (!IS_ERR(dentry)) { - error = cgroup_create_file(dentry, 0644 | S_IFREG, + mode = cgroup_file_mode(cft); + error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) dentry->d_fsdata = (void *)cft; @@ -2349,6 +2378,7 @@ static struct cftype files[] = { .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, + .mode = S_IRUGO | S_IWUSR, }, { @@ -2449,7 +2479,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) * Must be called with the mutex on the parent inode held */ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - int mode) + mode_t mode) { struct cgroup *cgrp; struct cgroupfs_root *root = parent->root; diff --git 
a/kernel/cpuset.c b/kernel/cpuset.c index f76db9dcaa05..ee5ec386aa8b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1706,6 +1706,7 @@ static struct cftype files[] = { .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE, + .mode = S_IRUGO, }, { -- cgit v1.2.3-71-gd317 From 0b7f569e45bb6be142d87017030669a6a7d327a1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:38 -0700 Subject: memcg: fix OOM killer under memcg This patch tries to fix OOM Killer problems caused by hierarchy. Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to kill a task in memcg. But, when hierarchy is used, it's broken and correct task cannot be killed. For example, in following cgroup /groupA/ hierarchy=1, limit=1G, 01 nolimit 02 nolimit All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to groupA's 1Gbytes but OOM Killer just kills tasks in groupA. This patch provides makes the bad process be selected from all tasks under hierarchy. BTW, currently, oom_jiffies is updated against groupA in above case. oom_jiffies of tree should be updated. To see how oom_jiffies is used, please check mem_cgroup_oom_called() callers. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: const fix] Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memcg_test.txt | 20 +++++++++++++++++++- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 2 +- mm/memcontrol.c | 30 ++++++++++++++++++++++++++++-- 4 files changed, 49 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 523a9c16c400..8a11caf417a0 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -1,5 +1,5 @@ Memory Resource Controller(Memcg) Implementation Memo. -Last Updated: 2009/1/19 +Last Updated: 2009/1/20 Base Kernel Version: based on 2.6.29-rc2. Because VM is getting complex (one of reasons is memcg...), memcg's behavior @@ -360,3 +360,21 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. # kill malloc task. Of course, tmpfs v.s. swapoff test should be tested, too. + + 9.8 OOM-Killer + Out-of-memory caused by memcg's limit will kill tasks under + the memcg. When hierarchy is used, a task under hierarchy + will be killed by the kernel. + In this case, panic_on_oom shouldn't be invoked and tasks + in other groups shouldn't be killed. + + It's not difficult to cause OOM under memcg as following. + Case A) when you can swapoff + #swapoff -a + #echo 50M > /memory.limit_in_bytes + run 51M of malloc + + Case B) when you use mem+swap limitation. 
+ #echo 50M > memory.limit_in_bytes + #echo 50M > memory.memsw.limit_in_bytes + run 51M of malloc diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b2816fba5306..43763bd772b9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -503,7 +503,7 @@ struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, - struct cgroup_subsys_state *root); + const struct cgroup_subsys_state *root); /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f2a3f5c9936c..382109b5baeb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3405,7 +3405,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) } bool css_is_ancestor(struct cgroup_subsys_state *child, - struct cgroup_subsys_state *root) + const struct cgroup_subsys_state *root) { struct css_id *child_id = rcu_dereference(child->id); struct css_id *root_id = rcu_dereference(root->id); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f6a575e77ad..025f8abfae2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -295,6 +295,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; + + if (!mm) + return NULL; /* * Because we have no locks, mm->owner's may be being moved to other * cgroup. We use css_tryget() here even if this looks @@ -486,10 +489,20 @@ void mem_cgroup_move_lists(struct page *page, int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; + struct mem_cgroup *curr = NULL; task_lock(task); - ret = task->mm && mm_match_cgroup(task->mm, mem); + rcu_read_lock(); + curr = try_get_mem_cgroup_from_mm(task->mm); + rcu_read_unlock(); task_unlock(task); + if (!curr) + return 0; + if (curr->use_hierarchy) + ret = css_is_ancestor(&curr->css, &mem->css); + else + ret = (curr == mem); + css_put(&curr->css); return ret; } @@ -820,6 +833,19 @@ bool mem_cgroup_oom_called(struct task_struct *task) rcu_read_unlock(); return ret; } + +static int record_last_oom_cb(struct mem_cgroup *mem, void *data) +{ + mem->last_oom_jiffies = jiffies; + return 0; +} + +static void record_last_oom(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); +} + + /* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. @@ -902,7 +928,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, mutex_lock(&memcg_tasklist); mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); mutex_unlock(&memcg_tasklist); - mem_over_limit->last_oom_jiffies = jiffies; + record_last_oom(mem_over_limit); } goto nomem; } -- cgit v1.2.3-71-gd317 From e222432bfa7dcf6ec008622a978c9f284ed5e3a9 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 2 Apr 2009 16:57:39 -0700 Subject: memcg: show memcg information during OOM Add RSS and swap to OOM output from memcg Display memcg values like failcnt, usage and limit when an OOM occurs due to memcg. Thanks to Johannes Weiner, Li Zefan, David Rientjes, Kamezawa Hiroyuki, Daisuke Nishimura and KOSAKI Motohiro for review. 
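The kB figures in the sample output below are plain byte counts read via res_counter_read_u64() and shifted right by 10. A minimal user-space sketch of that conversion (not part of the patch) follows; it assumes the "unlimited" memory+swap limit is LLONG_MAX bytes, which is what the 9007199254740991kB figure works out to.

#include <stdio.h>
#include <limits.h>

int main(void)
{
	/* a cgroup that has hit its 1 GiB limit, as in the sample below */
	unsigned long long usage = 1ULL << 30;
	unsigned long long limit = 1ULL << 30;
	/* assumed "unlimited" memory+swap limit */
	unsigned long long memsw_limit = LLONG_MAX;

	printf("memory: usage %llukB, limit %llukB\n", usage >> 10, limit >> 10);
	printf("memory+swap: limit %llukB\n", memsw_limit >> 10);
	return 0;
}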
Sample output ------------- Task in /a/x killed as a result of limit of /a memory: usage 1048576kB, limit 1048576kB, failcnt 4183 memory+swap: usage 1400964kB, limit 9007199254740991kB, failcnt 0 [akpm@linux-foundation.org: compilation fix] [akpm@linux-foundation.org: fix kerneldoc and whitespace] [akpm@linux-foundation.org: add printk facility level] Signed-off-by: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Li Zefan Cc: Paul Menage Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++++ mm/memcontrol.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++ mm/oom_kill.c | 1 + 3 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 326f45c86530..7aba9f264622 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -104,6 +104,8 @@ struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone); struct zone_reclaim_stat* mem_cgroup_get_reclaim_stat_from_page(struct page *page); +extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, + struct task_struct *p); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; @@ -270,6 +272,11 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; } +static inline void +mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +{ +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 025f8abfae2d..2bdb6149faeb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -721,6 +722,74 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) (*val)++; return 0; } + +/** + * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. + * @memcg: The memory cgroup that went over limit + * @p: Task that is going to be killed + * + * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is + * enabled + */ +void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +{ + struct cgroup *task_cgrp; + struct cgroup *mem_cgrp; + /* + * Need a buffer in BSS, can't rely on allocations. The code relies + * on the assumption that OOM is serialized for memory controller. + * If this assumption is broken, revisit this code. 
+ */ + static char memcg_name[PATH_MAX]; + int ret; + + if (!memcg) + return; + + + rcu_read_lock(); + + mem_cgrp = memcg->css.cgroup; + task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + + ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + /* + * Unfortunately, we are unable to convert to a useful name + * But we'll still print out the usage information + */ + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + + printk(KERN_INFO "Task in %s killed", memcg_name); + + rcu_read_lock(); + ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + + /* + * Continues from above, so we don't need an KERN_ level + */ + printk(KERN_CONT " as a result of limit of %s\n", memcg_name); +done: + + printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->res, RES_FAILCNT)); + printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " + "failcnt %llu\n", + res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); +} + /* * This function returns the number of memcg under hierarchy tree. Returns * 1(self count) if no children. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d3b9bac085b5..2f3166e308d9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -394,6 +394,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); + mem_cgroup_print_oom_info(mem, current); show_mem(); if (sysctl_oom_dump_tasks) dump_tasks(mem); -- cgit v1.2.3-71-gd317 From c137b5ece4b111e46981aae7da77315b9909809f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Apr 2009 16:57:40 -0700 Subject: memcg: remove mem_cgroup_calc_mapped_ratio() Currently, mem_cgroup_calc_mapped_ratio() is unused at all. it can be removed and KAMEZAWA-san suggested it. Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ mm/memcontrol.c | 17 ----------------- 2 files changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7aba9f264622..4562d09ab964 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -88,7 +88,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem, /* * For memory reclaim. */ -extern int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem); extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem); extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem); @@ -211,11 +210,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, { } -static inline int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) -{ - return 0; -} - static inline int mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bdb6149faeb..7bb14fdc780c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -507,23 +507,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * Calculate mapped_ratio under memory controller. This will be used in - * vmscan.c for deteremining we have to reclaim mapped pages. 
- */ -int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) -{ - long total, rss; - - /* - * usage is recorded in bytes. But, here, we assume the number of - * physical pages can be represented by "long" on any arch. - */ - total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; - rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); - return (int)((rss * 100L) / total); -} - /* * prev_priority control...this will be used in memory reclaim path. */ -- cgit v1.2.3-71-gd317 From 3918b96e03b2b8dd05889320623f6870e81d35ec Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Apr 2009 16:57:41 -0700 Subject: memcg: remove mem_cgroup_reclaim_imbalance() remnants commit 4f98a2fee8acdb4ac84545df98cccecfd130f8db (vmscan: split LRU lists into anon & file sets) removed mem_cgroup_reclaim_imbalance(), but there are some leftovers in memcontrol.h. Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4562d09ab964..18146c980b68 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -88,8 +88,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem, /* * For memory reclaim. */ -extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem); - extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem); extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority); @@ -210,11 +208,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, { } -static inline int mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) -{ - return 0; -} - static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) { return 0; -- cgit v1.2.3-71-gd317 From a3b2d692690aef228e493b1beaafe5364cab3237 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:45 -0700 Subject: cgroups: use css id in swap cgroup for saving memory v5 Try to use CSS ID for records in swap_cgroup. By this, on 64bit machine, size of swap_cgroup goes down to 2 bytes from 8bytes. This means, when 2GB of swap is equipped, (assume the page size is 4096bytes) From size of swap_cgroup = 2G/4k * 8 = 4Mbytes. To size of swap_cgroup = 2G/4k * 2 = 1Mbytes. Reduction is large. Of course, there are trade-offs. This CSS ID will add overhead to swap-in/swap-out/swap-free. But in general, - swap is a resource which the user tend to avoid use. - If swap is never used, swap_cgroup area is not used. - Reading traditional manuals, size of swap should be proportional to size of memory. Memory size of machine is increasing now. I think reducing size of swap_cgroup makes sense. Note: - ID->CSS lookup routine has no locks, it's under RCU-Read-Side. - memcg can be obsolete at rmdir() but not freed while refcnt from swap_cgroup is available. Changelog v4->v5: - reworked on to memcg-charge-swapcache-to-proper-memcg.patch Changlog ->v4: - fixed not configured case. - deleted unnecessary comments. - fixed NULL pointer bug. - fixed message in dmesg. 
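The saving quoted above is easy to sanity-check. The following standalone sketch is not part of the patch; it assumes a 64-bit build (8-byte pointers) and 4096-byte pages, as the commit message does, and reproduces the 4 MB versus 1 MB figures for 2 GB of swap.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t swap_bytes = 2ULL << 30;		/* 2 GiB of swap */
	const uint64_t page_size = 4096;		/* assumed page size */
	const uint64_t slots = swap_bytes / page_size;	/* one record per swap slot */

	/* before: struct swap_cgroup holds a struct mem_cgroup pointer (8 bytes) */
	const uint64_t with_pointer = slots * 8;
	/* after: struct swap_cgroup holds an unsigned short CSS ID (2 bytes) */
	const uint64_t with_css_id = slots * 2;

	printf("swap slots:      %llu\n", (unsigned long long)slots);
	printf("pointer records: %llu bytes\n", (unsigned long long)with_pointer);
	printf("CSS ID records:  %llu bytes\n", (unsigned long long)with_css_id);
	return 0;
}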
[nishimura@mxp.nes.nec.co.jp: css_tryget can be called twice in !PageCgroupUsed case] Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Paul Menage Cc: Hugh Dickins Signed-off-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 13 ++++---- mm/memcontrol.c | 74 +++++++++++++++++++++++++++++++++++++-------- mm/page_cgroup.c | 32 +++++++++----------- 3 files changed, 82 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 602cc1fdee90..7339c7bf7331 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -91,24 +91,23 @@ static inline void page_cgroup_init(void) #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP #include -extern struct mem_cgroup * -swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem); -extern struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent); +extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); +extern unsigned short lookup_swap_cgroup(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); #else #include static inline -struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { - return NULL; + return 0; } static inline -struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup(swp_entry_t ent) { - return NULL; + return 0; } static inline int diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81b0ae8183d0..55dea5968464 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -991,10 +991,31 @@ nomem: return -ENOMEM; } + +/* + * A helper function to get mem_cgroup from ID. must be called under + * rcu_read_lock(). The caller must check css_is_removed() or some if + * it's concern. (dropping refcnt from swap can be called against removed + * memcg.) + */ +static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) +{ + struct cgroup_subsys_state *css; + + /* ID 0 is unused ID */ + if (!id) + return NULL; + css = css_lookup(&mem_cgroup_subsys, id); + if (!css) + return NULL; + return container_of(css, struct mem_cgroup, css); +} + static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) { struct mem_cgroup *mem; struct page_cgroup *pc; + unsigned short id; swp_entry_t ent; VM_BUG_ON(!PageLocked(page)); @@ -1006,16 +1027,19 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) /* * Used bit of swapcache is solid under page lock. */ - if (PageCgroupUsed(pc)) + if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; - else { + if (mem && !css_tryget(&mem->css)) + mem = NULL; + } else { ent.val = page_private(page); - mem = lookup_swap_cgroup(ent); + id = lookup_swap_cgroup(ent); + rcu_read_lock(); + mem = mem_cgroup_lookup(id); + if (mem && !css_tryget(&mem->css)) + mem = NULL; + rcu_read_unlock(); } - if (!mem) - return NULL; - if (!css_tryget(&mem->css)) - return NULL; return mem; } @@ -1276,12 +1300,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (do_swap_account && !ret && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + unsigned short id; /* avoid double counting */ - mem = swap_cgroup_record(ent, NULL); + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + mem = mem_cgroup_lookup(id); if (mem) { + /* + * We did swap-in. 
Then, this entry is doubly counted + * both in mem and memsw. We uncharge it, here. + * Recorded ID can be obsolete. We avoid calling + * css_tryget() + */ res_counter_uncharge(&mem->memsw, PAGE_SIZE); mem_cgroup_put(mem); } + rcu_read_unlock(); } return ret; } @@ -1346,13 +1380,21 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + unsigned short id; struct mem_cgroup *memcg; - memcg = swap_cgroup_record(ent, NULL); + + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * This recorded memcg can be obsolete one. So, avoid + * calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } - + rcu_read_unlock(); } /* add this page(page_cgroup) to the LRU we want. */ @@ -1473,7 +1515,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) MEM_CGROUP_CHARGE_TYPE_SWAPOUT); /* record memcg information */ if (do_swap_account && memcg) { - swap_cgroup_record(ent, memcg); + swap_cgroup_record(ent, css_id(&memcg->css)); mem_cgroup_get(memcg); } if (memcg) @@ -1488,15 +1530,23 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) void mem_cgroup_uncharge_swap(swp_entry_t ent) { struct mem_cgroup *memcg; + unsigned short id; if (!do_swap_account) return; - memcg = swap_cgroup_record(ent, NULL); + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * We uncharge this because swap is freed. + * This memcg can be obsolete one. We avoid calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } + rcu_read_unlock(); } #endif diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ceecfbb143fa..ebf81074bed4 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -285,12 +285,8 @@ struct swap_cgroup_ctrl { struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; -/* - * This 8bytes seems big..maybe we can reduce this when we can use "id" for - * cgroup rather than pointer. - */ struct swap_cgroup { - struct mem_cgroup *val; + unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) #define SC_POS_MASK (SC_PER_PAGE - 1) @@ -342,10 +338,10 @@ not_enough_page: * @ent: swap entry to be recorded into * @mem: mem_cgroup to be recorded * - * Returns old value at success, NULL at failure. - * (Of course, old value can be NULL.) + * Returns old value at success, 0 at failure. + * (Of course, old value can be 0.) */ -struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { int type = swp_type(ent); unsigned long offset = swp_offset(ent); @@ -354,18 +350,18 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) struct swap_cgroup_ctrl *ctrl; struct page *mappage; struct swap_cgroup *sc; - struct mem_cgroup *old; + unsigned short old; if (!do_swap_account) - return NULL; + return 0; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; - old = sc->val; - sc->val = mem; + old = sc->id; + sc->id = id; return old; } @@ -374,9 +370,9 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry * @ent: swap entry to be looked up. * - * Returns pointer to mem_cgroup at success. NULL at failure. 
+ * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ -struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup(swp_entry_t ent) { int type = swp_type(ent); unsigned long offset = swp_offset(ent); @@ -385,16 +381,16 @@ struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) struct swap_cgroup_ctrl *ctrl; struct page *mappage; struct swap_cgroup *sc; - struct mem_cgroup *ret; + unsigned short ret; if (!do_swap_account) - return NULL; + return 0; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; - ret = sc->val; + ret = sc->id; return ret; } @@ -432,7 +428,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) printk(KERN_INFO "swap_cgroup: uses %ld bytes of vmalloc for pointer array space" - " and %ld bytes to hold mem_cgroup pointers on swap\n", + " and %ld bytes to hold mem_cgroup information per swap ents\n", array_size, length * PAGE_SIZE); printk(KERN_INFO "swap_cgroup can be disabled by noswapaccount boot option.\n"); -- cgit v1.2.3-71-gd317 From bd1a8ab73edd449fecda633449cc277b856ad4f5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:50 -0700 Subject: cgroups: add 'data' field to struct cgroup_scanner We need to pass some data to test_task() or process_task() in some cases. Will be used later. Signed-off-by: Li Zefan Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 43763bd772b9..4316a546beb5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -337,6 +337,7 @@ struct cgroup_scanner { void (*process_task)(struct task_struct *p, struct cgroup_scanner *scan); struct ptr_heap *heap; + void *data; }; /* -- cgit v1.2.3-71-gd317 From a1bc5a4eee990a1f290735c8694d0aebdad095fa Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 2 Apr 2009 16:57:54 -0700 Subject: cpusets: replace zone allowed functions with node allowed The cpuset_zone_allowed() variants are actually only a function of the zone's node. 
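A minimal user-space model of that reduction is sketched below; struct zone, zone_to_nid() and the mems_allowed mask are stand-ins for the kernel objects (the names are made up), and the point is only that the zone-based check carries no information beyond the zone's node, so it can forward to a node-based helper exactly as the patch does.

#include <stdio.h>

struct zone { int node_id; };			/* stand-in for the kernel's struct zone */

static int zone_to_nid(const struct zone *z)	/* stand-in for zone_to_nid() */
{
	return z->node_id;
}

/* stand-in for current->mems_allowed: nodes 0 and 2 allowed */
static const unsigned long mems_allowed = (1UL << 0) | (1UL << 2);

static int node_allowed(int node)
{
	return (int)((mems_allowed >> node) & 1);
}

/* the old zone-based interface survives only as a trivial wrapper */
static int zone_allowed(const struct zone *z)
{
	return node_allowed(zone_to_nid(z));
}

int main(void)
{
	struct zone z0 = { .node_id = 0 }, z1 = { .node_id = 1 };

	printf("zone on node 0 allowed: %d\n", zone_allowed(&z0));
	printf("zone on node 1 allowed: %d\n", zone_allowed(&z1));
	return 0;
}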
Cc: Paul Menage Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: David Rientjes Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 33 +++++++++++++++++++++++----- kernel/cpuset.c | 59 +++++++++++++++++++++----------------------------- 2 files changed, 52 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2e0d79678deb..05ea1dd7d681 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_CPUSETS @@ -29,19 +30,29 @@ void cpuset_init_current_mems_allowed(void); void cpuset_update_task_memory_state(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask); -extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask); +extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); +extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); -static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { return number_of_cpusets <= 1 || - __cpuset_zone_allowed_softwall(z, gfp_mask); + __cpuset_node_allowed_softwall(node, gfp_mask); } -static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { return number_of_cpusets <= 1 || - __cpuset_zone_allowed_hardwall(z, gfp_mask); + __cpuset_node_allowed_hardwall(node, gfp_mask); +} + +static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +{ + return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask); +} + +static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +{ + return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask); } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -112,6 +123,16 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } +static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +{ + return 1; +} + +static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) +{ + return 1; +} + static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) { return 1; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0619f109d38d..3ff910eb30d3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2181,26 +2181,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) } /** - * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? + * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * @node: is this an allowed node? * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. If - * __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. If it's not a - * __GFP_HARDWALL request and this zone's nodes is in the nearest - * hardwalled cpuset ancestor to this tasks cpuset, yes. - * If the task has been OOM killed and has access to memory reserves - * as specified by the TIF_MEMDIE flag, yes. + * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is + * set, yes, we can always allocate. If node is in our task's mems_allowed, + * yes. 
If it's not a __GFP_HARDWALL request and this node is in the nearest + * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been + * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE + * flag, yes. * Otherwise, no. * - * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() - * reduces to cpuset_zone_allowed_hardwall(). Otherwise, - * cpuset_zone_allowed_softwall() might sleep, and might allow a zone - * from an enclosing cpuset. + * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to + * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() + * might sleep, and might allow a node from an enclosing cpuset. * - * cpuset_zone_allowed_hardwall() only handles the simpler case of - * hardwall cpusets, and never sleeps. + * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall + * cpusets, and never sleeps. * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by @@ -2239,20 +2237,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. * * Rule: - * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you + * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables * the code that might scan up ancestor cpusets and sleep. */ - -int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - node = zone_to_nid(z); might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; @@ -2281,15 +2276,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) } /* - * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? + * cpuset_node_allowed_hardwall - Can we allocate on a memory node? + * @node: is this an allowed node? * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. - * If __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. If the task has been - * OOM killed and has access to memory reserves as specified by the - * TIF_MEMDIE flag, yes. Otherwise, no. + * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is + * set, yes, we can always allocate. If node is in our task's mems_allowed, + * yes. If the task has been OOM killed and has access to memory reserves as + * specified by the TIF_MEMDIE flag, yes. + * Otherwise, no. * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by @@ -2297,20 +2292,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) * any node on the zonelist except the first. By the time any such * calls get to this routine, we should just shut up and say 'yes'. 
* - * Unlike the cpuset_zone_allowed_softwall() variant, above, - * this variant requires that the zone be in the current tasks + * Unlike the cpuset_node_allowed_softwall() variant, above, + * this variant requires that the node be in the current task's * mems_allowed or that we're in interrupt. It does not scan up the * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. * It never sleeps. */ - -int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { - int node; /* node that zone z is on */ - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - node = zone_to_nid(z); if (node_isset(node, current->mems_allowed)) return 1; /* -- cgit v1.2.3-71-gd317 From 43918f2bf4806675943416d539d9d5e4d585ebff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:00 -0700 Subject: signals: remove 'handler' parameter to tracehook functions Container-init must behave like global-init to processes within the container and hence it must be immune to unhandled fatal signals from within the container (i.e SIG_DFL signals that terminate the process). But the same container-init must behave like a normal process to processes in ancestor namespaces and so if it receives the same fatal signal from a process in ancestor namespace, the signal must be processed. Implementing these semantics requires that send_signal() determine pid namespace of the sender but since signals can originate from workqueues/ interrupt-handlers, determining pid namespace of sender may not always be possible or safe. This patchset implements the design/simplified semantics suggested by Oleg Nesterov. The simplified semantics for container-init are: - container-init must never be terminated by a signal from a descendant process. - container-init must never be immune to SIGKILL from an ancestor namespace (so a process in parent namespace must always be able to terminate a descendant container). - container-init may be immune to unhandled fatal signals (like SIGUSR1) even if they are from ancestor namespace. SIGKILL/SIGSTOP are the only reliable signals to a container-init from ancestor namespace. This patch: Based on an earlier patch submitted by Oleg Nesterov and comments from Roland McGrath (http://lkml.org/lkml/2008/11/19/258). The handler parameter is currently unused in the tracehook functions. Besides, the tracehook functions are called with siglock held, so the functions can check the handler if they later need to. Removing the parameter simiplifies changes to sig_ignored() in a follow-on patch. Signed-off-by: Sukadev Bhattiprolu Acked-by: Roland McGrath Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Daniel Lezcano Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 2 +- include/linux/tracehook.h | 13 ++++--------- kernel/signal.c | 6 +++--- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 19378715f415..b7cc21bc6ae0 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1455,6 +1455,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) * system call instruction. 
*/ if (test_thread_flag(TIF_SINGLESTEP) && - tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) + tracehook_consider_fatal_signal(current, SIGTRAP)) send_sigtrap(current, regs, 0, TRAP_BRKPT); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 6186a789d6c7..eb4c6545b384 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -388,17 +388,14 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal * @task: task receiving the signal * @sig: signal number being sent - * @handler: %SIG_IGN or %SIG_DFL * * Return zero iff tracing doesn't care to examine this ignored signal, * so it can short-circuit normal delivery and never even get queued. - * Either @handler is %SIG_DFL and @sig's default is ignore, or it's %SIG_IGN. * * Called with @task->sighand->siglock held. */ static inline int tracehook_consider_ignored_signal(struct task_struct *task, - int sig, - void __user *handler) + int sig) { return (task_ptrace(task) & PT_PTRACED) != 0; } @@ -407,19 +404,17 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, * tracehook_consider_fatal_signal - suppress special handling of fatal signal * @task: task receiving the signal * @sig: signal number being sent - * @handler: %SIG_DFL or %SIG_IGN * * Return nonzero to prevent special handling of this termination signal. - * Normally @handler is %SIG_DFL. It can be %SIG_IGN if @sig is ignored, - * in which case force_sig() is about to reset it to %SIG_DFL. + * Normally handler for signal is %SIG_DFL. It can be %SIG_IGN if @sig is + * ignored, in which case force_sig() is about to reset it to %SIG_DFL. * When this returns zero, this signal might cause a quick termination * that does not give the debugger a chance to intercept the signal. * * Called with or without @task->sighand->siglock held. */ static inline int tracehook_consider_fatal_signal(struct task_struct *task, - int sig, - void __user *handler) + int sig) { return (task_ptrace(task) & PT_PTRACED) != 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 1c8814481a11..92a1ab004498 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -74,7 +74,7 @@ static int sig_ignored(struct task_struct *t, int sig) /* * Tracers may want to know about even ignored signals. */ - return !tracehook_consider_ignored_signal(t, sig, handler); + return !tracehook_consider_ignored_signal(t, sig); } /* @@ -318,7 +318,7 @@ int unhandled_signal(struct task_struct *tsk, int sig) return 1; if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return !tracehook_consider_fatal_signal(tsk, sig, handler); + return !tracehook_consider_fatal_signal(tsk, sig); } @@ -777,7 +777,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || - !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { + !tracehook_consider_fatal_signal(t, sig))) { /* * This signal will be fatal to the whole group. */ -- cgit v1.2.3-71-gd317 From 4576145c1ecdaaea9ef8976a48335206aa1ebf91 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:14 -0700 Subject: ptrace: fix possible zombie leak on PTRACE_DETACH When ptrace_detach() takes tasklist, the tracee can be SIGKILL'ed. 
If it has already passed exit_notify() we can leak a zombie, because a) ptracing disables the auto-reaping logic, and b) ->real_parent was not notified about the child's death. ptrace_detach() should follow the ptrace_exit's logic, change the code accordingly. Signed-off-by: Oleg Nesterov Cc: Jerome Marchand Cc: Roland McGrath Tested-by: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 1 + kernel/ptrace.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 98b93ca4db06..1a2b0cb55535 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p); extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f62a568e84ec..ee553b6ad125 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -237,6 +237,8 @@ out: int ptrace_detach(struct task_struct *child, unsigned int data) { + int dead = 0; + if (!valid_signal(data)) return -EIO; @@ -244,18 +246,21 @@ int ptrace_detach(struct task_struct *child, unsigned int data) ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - /* protect against de_thread()->release_task() */ write_lock_irq(&tasklist_lock); + /* protect against de_thread()->release_task() */ if (child->ptrace) { child->exit_code = data; - __ptrace_unlink(child); + dead = __ptrace_detach(current, child); if (!child->exit_state) wake_up_process(child); } write_unlock_irq(&tasklist_lock); + if (unlikely(dead)) + release_task(child); + return 0; } -- cgit v1.2.3-71-gd317 From 39c626ae47c469abdfd30c6e42eff884931380d6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:18 -0700 Subject: forget_original_parent: split out the un-ptrace part By discussion with Roland. - Rename ptrace_exit() to exit_ptrace(), and change it to do all the necessary work with ->ptraced list by its own. - Move this code from exit.c to ptrace.c - Update the comment in ptrace_detach() to explain the rechecking of the child->ptrace. Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: "Metzger, Markus T" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 2 +- include/linux/sched.h | 5 +++ kernel/exit.c | 95 ++++---------------------------------------------- kernel/ptrace.c | 78 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 88 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 1a2b0cb55535..67c15653fc23 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,7 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); -extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p); +extern void exit_ptrace(struct task_struct *tracer); extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 diff --git a/include/linux/sched.h b/include/linux/sched.h index 9186f8c5d5f2..b47c94e7560b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2061,6 +2061,11 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also diff --git a/kernel/exit.c b/kernel/exit.c index 3e09b7cb3b20..506693dfdd4e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -61,11 +61,6 @@ DEFINE_TRACE(sched_process_wait); static void exit_mm(struct task_struct * tsk); -static inline int task_detached(struct task_struct *p) -{ - return p->exit_signal == -1; -} - static void __unhash_process(struct task_struct *p) { nr_threads--; @@ -731,85 +726,6 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -/* - * Called with irqs disabled, returns true if childs should reap themselves. - */ -static int ignoring_children(struct sighand_struct *sigh) -{ - int ret; - spin_lock(&sigh->siglock); - ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || - (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); - spin_unlock(&sigh->siglock); - return ret; -} - -/* Returns nonzero if the tracee should be released. */ -int __ptrace_detach(struct task_struct *tracer, struct task_struct *p) -{ - __ptrace_unlink(p); - - if (p->exit_state != EXIT_ZOMBIE) - return 0; - /* - * If it's a zombie, our attachedness prevented normal - * parent notification or self-reaping. Do notification - * now if it would have happened earlier. If it should - * reap itself we return true. - * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. - */ - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, tracer)) - do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) - p->exit_signal = -1; - } - - if (!task_detached(p)) - return 0; - - /* Mark it as in the process of being reaped. */ - p->exit_state = EXIT_DEAD; - return 1; -} - -/* - * Detach all tasks we were using ptrace on. - * Any that need to be release_task'd are put on the @dead list. - * - * Called with write_lock(&tasklist_lock) held. 
- */ -static void ptrace_exit(struct task_struct *parent, struct list_head *dead) -{ - struct task_struct *p, *n; - - list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { - if (__ptrace_detach(parent, p)) - list_add(&p->ptrace_entry, dead); - } -} - -/* - * Finish up exit-time ptrace cleanup. - * - * Called without locks. - */ -static void ptrace_exit_finish(struct task_struct *parent, - struct list_head *dead) -{ - struct task_struct *p, *n; - - BUG_ON(!list_empty(&parent->ptraced)); - - list_for_each_entry_safe(p, n, dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } -} - /* Returns nonzero if the child should be released. */ static int reparent_thread(struct task_struct *p, struct task_struct *father) { @@ -894,12 +810,10 @@ static void forget_original_parent(struct task_struct *father) struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); + exit_ptrace(father); + write_lock_irq(&tasklist_lock); reaper = find_new_reaper(father); - /* - * First clean up ptrace if we were using it. - */ - ptrace_exit(father, &ptrace_dead); list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; @@ -914,7 +828,10 @@ static void forget_original_parent(struct task_struct *father) write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&father->children)); - ptrace_exit_finish(father, &ptrace_dead); + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee553b6ad125..f5a9fa5aafa1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -235,9 +235,57 @@ out: return retval; } +/* + * Called with irqs disabled, returns true if childs should reap themselves. + */ +static int ignoring_children(struct sighand_struct *sigh) +{ + int ret; + spin_lock(&sigh->siglock); + ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || + (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); + spin_unlock(&sigh->siglock); + return ret; +} + +/* + * Called with tasklist_lock held for writing. + * Unlink a traced task, and clean it up if it was a traced zombie. + * Return true if it needs to be reaped with release_task(). + * (We can't call release_task() here because we already hold tasklist_lock.) + * + * If it's a zombie, our attachedness prevented normal parent notification + * or self-reaping. Do notification now if it would have happened earlier. + * If it should reap itself, return true. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. + */ +static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) +{ + __ptrace_unlink(p); + + if (p->exit_state == EXIT_ZOMBIE) { + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) + p->exit_signal = -1; + } + if (task_detached(p)) { + /* Mark it as in the process of being reaped. 
*/ + p->exit_state = EXIT_DEAD; + return true; + } + } + + return false; +} + int ptrace_detach(struct task_struct *child, unsigned int data) { - int dead = 0; + bool dead = false; if (!valid_signal(data)) return -EIO; @@ -247,7 +295,10 @@ int ptrace_detach(struct task_struct *child, unsigned int data) clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); - /* protect against de_thread()->release_task() */ + /* + * This child can be already killed. Make sure de_thread() or + * our sub-thread doing do_wait() didn't do release_task() yet. + */ if (child->ptrace) { child->exit_code = data; @@ -264,6 +315,29 @@ int ptrace_detach(struct task_struct *child, unsigned int data) return 0; } +/* + * Detach all tasks we were using ptrace on. + */ +void exit_ptrace(struct task_struct *tracer) +{ + struct task_struct *p, *n; + LIST_HEAD(ptrace_dead); + + write_lock_irq(&tasklist_lock); + list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (__ptrace_detach(tracer, p)) + list_add(&p->ptrace_entry, &ptrace_dead); + } + write_unlock_irq(&tasklist_lock); + + BUG_ON(!list_empty(&tracer->ptraced)); + + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { int copied = 0; -- cgit v1.2.3-71-gd317 From bb24c679a51b1a9b726b901330649e3861814ac0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:20 -0700 Subject: tracehook_notify_death: use task_detached() helper Now that task_detached() is exported, change tracehook_notify_death() to use this helper, nobody else checks ->exit_signal == -1 by hand. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: "Metzger, Markus T" Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index eb4c6545b384..c7aa154f4bfc 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -502,7 +502,7 @@ static inline int tracehook_notify_jctl(int notify, int why) static inline int tracehook_notify_death(struct task_struct *task, void **death_cookie, int group_dead) { - if (task->exit_signal == -1) + if (task_detached(task)) return task->ptrace ? SIGCHLD : DEATH_REAP; /* -- cgit v1.2.3-71-gd317 From 40e8a10de2c9f87e892dcd5a6f9d1b208329ffea Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 2 Apr 2009 16:58:25 -0700 Subject: cpu hotplug: remove unused cpuhotplug_mutex_lock() cpuhotplug_mutex_lock() is not used, remove it. Signed-off-by: Lai Jiangshan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index c2747ac2ae43..2643d848df90 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -23,7 +23,6 @@ #include #include #include -#include struct cpu { int node_id; /* The node which contains the CPU */ @@ -103,16 +102,6 @@ extern struct sysdev_class cpu_sysdev_class; #ifdef CONFIG_HOTPLUG_CPU /* Stop CPUs going up and down. 
*/ -static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) -{ - mutex_lock(cpu_hp_mutex); -} - -static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) -{ - mutex_unlock(cpu_hp_mutex); -} - extern void get_online_cpus(void); extern void put_online_cpus(void); #define hotcpu_notifier(fn, pri) { \ @@ -126,11 +115,6 @@ int cpu_down(unsigned int cpu); #else /* CONFIG_HOTPLUG_CPU */ -static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) -{ } -static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) -{ } - #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) -- cgit v1.2.3-71-gd317 From a50b0aa4bd9a7d42112442a385f3dc0e775284dd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 2 Apr 2009 16:58:29 -0700 Subject: struct linux_binprm: drop unused fields Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/binfmts.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 77b4a9e46004..6638b8148de7 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -35,8 +35,7 @@ struct linux_binprm{ #endif struct mm_struct *mm; unsigned long p; /* current top of mem */ - unsigned int sh_bang:1, - misc_bang:1, + unsigned int cred_prepared:1,/* true if creds already prepared (multiple * preps happen for interpreters) */ cap_effective:1;/* true if has elevated effective capabilities, -- cgit v1.2.3-71-gd317 From 1f80769ffd36e74357fe896dc43dddf1af1510f3 Mon Sep 17 00:00:00 2001 From: Paul Fulghum Date: Thu, 2 Apr 2009 16:58:30 -0700 Subject: synclink_gt: add clock options Add support for x8 asynchronous sample rate and ability to specify base clock frequency. 
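A small user-space model of the sampling choice made in async_mode() below: with the default 14745600 Hz base clock, standard rates divide cleanly and keep x16 sampling, while rates the base clock cannot serve at x16 (too fast, or not an exact divisor) fall back to the new x8 mode. The helper name here is made up and the JCR bit 8 capability check is ignored.

#include <stdio.h>

/* decide x16 versus x8 sampling, mirroring the condition added to async_mode() */
static unsigned int sampling_factor(unsigned int base_clock,
				    unsigned int data_rate)
{
	if (base_clock < data_rate * 16 || base_clock % (data_rate * 16))
		return 8;	/* clean x16 not possible, use x8 */
	return 16;
}

int main(void)
{
	const unsigned int base_clock = 14745600;	/* driver default */
	const unsigned int rates[] = { 115200, 576000, 1000000 };
	unsigned int i;

	for (i = 0; i < sizeof(rates) / sizeof(rates[0]); i++)
		printf("%7u baud -> x%u sampling\n",
		       rates[i], sampling_factor(base_clock, rates[i]));
	return 0;
}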
Signed-off-by: Paul Fulghum Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/synclink_gt.c | 58 ++++++++++++++++++++++++++++++---------------- include/linux/synclink.h | 1 + 2 files changed, 39 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c index 6ec6e13d47d7..5e256494686a 100644 --- a/drivers/char/synclink_gt.c +++ b/drivers/char/synclink_gt.c @@ -298,6 +298,7 @@ struct slgt_info { unsigned int rbuf_fill_level; unsigned int if_mode; + unsigned int base_clock; /* device status */ @@ -1156,22 +1157,26 @@ static long set_params32(struct slgt_info *info, struct MGSL_PARAMS32 __user *ne return -EFAULT; spin_lock(&info->lock); - info->params.mode = tmp_params.mode; - info->params.loopback = tmp_params.loopback; - info->params.flags = tmp_params.flags; - info->params.encoding = tmp_params.encoding; - info->params.clock_speed = tmp_params.clock_speed; - info->params.addr_filter = tmp_params.addr_filter; - info->params.crc_type = tmp_params.crc_type; - info->params.preamble_length = tmp_params.preamble_length; - info->params.preamble = tmp_params.preamble; - info->params.data_rate = tmp_params.data_rate; - info->params.data_bits = tmp_params.data_bits; - info->params.stop_bits = tmp_params.stop_bits; - info->params.parity = tmp_params.parity; + if (tmp_params.mode == MGSL_MODE_BASE_CLOCK) { + info->base_clock = tmp_params.clock_speed; + } else { + info->params.mode = tmp_params.mode; + info->params.loopback = tmp_params.loopback; + info->params.flags = tmp_params.flags; + info->params.encoding = tmp_params.encoding; + info->params.clock_speed = tmp_params.clock_speed; + info->params.addr_filter = tmp_params.addr_filter; + info->params.crc_type = tmp_params.crc_type; + info->params.preamble_length = tmp_params.preamble_length; + info->params.preamble = tmp_params.preamble; + info->params.data_rate = tmp_params.data_rate; + info->params.data_bits = tmp_params.data_bits; + info->params.stop_bits = tmp_params.stop_bits; + info->params.parity = tmp_params.parity; + } spin_unlock(&info->lock); - change_params(info); + program_hw(info); return 0; } @@ -2559,10 +2564,13 @@ static int set_params(struct slgt_info *info, MGSL_PARAMS __user *new_params) return -EFAULT; spin_lock_irqsave(&info->lock, flags); - memcpy(&info->params, &tmp_params, sizeof(MGSL_PARAMS)); + if (tmp_params.mode == MGSL_MODE_BASE_CLOCK) + info->base_clock = tmp_params.clock_speed; + else + memcpy(&info->params, &tmp_params, sizeof(MGSL_PARAMS)); spin_unlock_irqrestore(&info->lock, flags); - change_params(info); + program_hw(info); return 0; } @@ -3432,6 +3440,7 @@ static struct slgt_info *alloc_dev(int adapter_num, int port_num, struct pci_dev info->magic = MGSL_MAGIC; INIT_WORK(&info->task, bh_handler); info->max_frame_size = 4096; + info->base_clock = 14745600; info->rbuf_fill_level = DMABUFSIZE; info->port.close_delay = 5*HZ/10; info->port.closing_wait = 30*HZ; @@ -3779,7 +3788,7 @@ static void enable_loopback(struct slgt_info *info) static void set_rate(struct slgt_info *info, u32 rate) { unsigned int div; - static unsigned int osc = 14745600; + unsigned int osc = info->base_clock; /* div = osc/rate - 1 * @@ -4083,18 +4092,27 @@ static void async_mode(struct slgt_info *info) * 06 CTS IRQ enable * 05 DCD IRQ enable * 04 RI IRQ enable - * 03 reserved, must be zero + * 03 0=16x sampling, 1=8x sampling * 02 1=txd->rxd internal loopback enable * 01 reserved, must be zero * 00 1=master IRQ enable */ val = BIT15 
+ BIT14 + BIT0; + /* JCR[8] : 1 = x8 async mode feature available */ + if ((rd_reg32(info, JCR) & BIT8) && info->params.data_rate && + ((info->base_clock < (info->params.data_rate * 16)) || + (info->base_clock % (info->params.data_rate * 16)))) { + /* use 8x sampling */ + val |= BIT3; + set_rate(info, info->params.data_rate * 8); + } else { + /* use 16x sampling */ + set_rate(info, info->params.data_rate * 16); + } wr_reg16(info, SCR, val); slgt_irq_on(info, IRQ_RXBREAK | IRQ_RXOVER); - set_rate(info, info->params.data_rate * 16); - if (info->params.loopback) enable_loopback(info); } diff --git a/include/linux/synclink.h b/include/linux/synclink.h index 99b8bdb17b2b..0ff2779c44d0 100644 --- a/include/linux/synclink.h +++ b/include/linux/synclink.h @@ -125,6 +125,7 @@ #define MGSL_MODE_MONOSYNC 3 #define MGSL_MODE_BISYNC 4 #define MGSL_MODE_RAW 6 +#define MGSL_MODE_BASE_CLOCK 7 #define MGSL_BUS_TYPE_ISA 1 #define MGSL_BUS_TYPE_EISA 2 -- cgit v1.2.3-71-gd317 From 6dda81f4384b94930826eded254d8c16f89a9248 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:35 -0700 Subject: pids: document task_pgrp/task_session is not safe without tasklist/rcu Even if task == current, it is not safe to dereference the result of task_pgrp/task_session. We can race with another thread which changes the special pid via setpgid/setsid. Document this. The next 2 patches give an example of the unsafe usage, we have more bad users. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b47c94e7560b..722dd313bf8a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1489,6 +1489,11 @@ static inline struct pid *task_tgid(struct task_struct *task) return task->group_leader->pids[PIDTYPE_PID].pid; } +/* + * Without tasklist or rcu lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->group_leader->pids[PIDTYPE_PGID].pid; -- cgit v1.2.3-71-gd317 From 52ee2dfdd4f51cf422ea6a96a0846dc94244aa37 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:38 -0700 Subject: pids: refactor vnr/nr_ns helpers to make them safe Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy. task_pid_nr_ns(task) needs rcu/tasklist depending on task == current. As for "special" pids, vnr/nr_ns helpers always need rcu. However, if task != current, they are unsafe even under rcu lock, we can't trust task->group_leader without the special checks. And almost every helper has a callsite which needs a fix. Also, it is a bit annoying that the implementations of, say, task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical". This patch introduces the new helper, __task_pid_nr_ns(), which is always safe to use, and turns all other helpers into the trivial wrappers. After this I'll send another patch which converts task_tgid_xxx() as well, they're are a bit special. Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. 
Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 27 ++++++++++++++++++++------- kernel/pid.c | 31 ++++++++++++++++--------------- 2 files changed, 36 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 722dd313bf8a..49df878a0cad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1519,17 +1519,23 @@ struct pid_namespace; * * see also pid_nr() etc in include/linux/pid.h */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} static inline pid_t task_pid_vnr(struct task_struct *tsk) { - return pid_vnr(task_pid(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } @@ -1551,11 +1557,15 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return tsk->signal->__pgrp; } -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { - return pid_vnr(task_pgrp(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } @@ -1564,14 +1574,17 @@ static inline pid_t task_session_nr(struct task_struct *tsk) return tsk->signal->__session; } -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_session_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} static inline pid_t task_session_vnr(struct task_struct *tsk) { - return pid_vnr(task_session(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. 
diff --git a/kernel/pid.c b/kernel/pid.c index 6628abcc520e..b2e5f78fd281 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -452,11 +452,24 @@ pid_t pid_vnr(struct pid *pid) } EXPORT_SYMBOL_GPL(pid_vnr); -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns) { - return pid_nr_ns(task_pid(tsk), ns); + pid_t nr = 0; + + rcu_read_lock(); + if (!ns) + ns = current->nsproxy->pid_ns; + if (likely(pid_alive(task))) { + if (type != PIDTYPE_PID) + task = task->group_leader; + nr = pid_nr_ns(task->pids[type].pid, ns); + } + rcu_read_unlock(); + + return nr; } -EXPORT_SYMBOL(task_pid_nr_ns); +EXPORT_SYMBOL(__task_pid_nr_ns); pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -464,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) } EXPORT_SYMBOL(task_tgid_nr_ns); -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_pgrp(tsk), ns); -} -EXPORT_SYMBOL(task_pgrp_nr_ns); - -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_session(tsk), ns); -} -EXPORT_SYMBOL(task_session_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- cgit v1.2.3-71-gd317 From 1b0f7ffd0ea27cd3a0b9ca04e3df9522048c32a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:39 -0700 Subject: pids: kill signal_struct-> __pgrp/__session and friends We are wasting 2 words in signal_struct without any reason to implement task_pgrp_nr() and task_session_nr(). task_session_nr() has no callers since 2e2ba22ea4fd4bb85f0fa37c521066db6775cbef, we can remove it. task_pgrp_nr() is still (I believe wrongly) used in fs/autofsX and fs/coda. This patch reimplements task_pgrp_nr() via task_pgrp_nr_ns(), and kills __pgrp/__session and the related helpers. The change in drivers/char/tty_io.c is cosmetic, but hopefully makes sense anyway. 
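To illustrate the locking rule these pid patches revolve around, here is a minimal sketch (not taken from any of the patches; the function name is made up) of how a caller can obtain a task's process group number after the refactoring. The first variant relies on __task_pid_nr_ns() taking rcu_read_lock() and checking pid_alive() internally; the second shows the explicit RCU protection that is still needed when the struct pid itself is dereferenced.

#include <linux/pid.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static pid_t example_task_pgrp_nr(struct task_struct *task,
				  struct pid_namespace *ns)
{
	pid_t nr;

	/*
	 * Safe without any locking by the caller: the wrapper ends up in
	 * __task_pid_nr_ns(), which takes rcu_read_lock() and checks
	 * pid_alive() before touching task->group_leader.
	 */
	nr = task_pgrp_nr_ns(task, ns);

	/*
	 * If the struct pid itself is needed, the caller must hold RCU
	 * (or tasklist_lock) across both the lookup and the use, even for
	 * task == current, because setpgid()/setsid() can change the
	 * special pid underneath us.
	 */
	rcu_read_lock();
	nr = pid_nr_ns(task_pgrp(task), ns);
	rcu_read_unlock();

	return nr;
}
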
Signed-off-by: Oleg Nesterov Acked-by: Alan Cox [tty parts] Cc: Cedric Le Goater Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 4 ++-- include/linux/sched.h | 43 ++++++------------------------------------- kernel/exit.c | 10 +++------- kernel/fork.c | 2 -- kernel/sys.c | 4 +--- 5 files changed, 12 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index a44b701c5bba..66b99a2049e3 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2681,7 +2681,7 @@ void __do_SAK(struct tty_struct *tty) /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); } while_each_pid_task(session, PIDTYPE_SID, p); @@ -2691,7 +2691,7 @@ void __do_SAK(struct tty_struct *tty) do_each_thread(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); continue; diff --git a/include/linux/sched.h b/include/linux/sched.h index 49df878a0cad..206ac003e8c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,25 +547,8 @@ struct signal_struct { struct list_head cpu_timers[3]; - /* job control IDs */ - - /* - * pgrp and session fields are deprecated. - * use the task_session_Xnr and task_pgrp_Xnr routines below - */ - - union { - pid_t pgrp __deprecated; - pid_t __pgrp; - }; - struct pid *tty_old_pgrp; - union { - pid_t session __deprecated; - pid_t __session; - }; - /* boolean value for session group leader */ int leader; @@ -1469,16 +1452,6 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } -static inline void set_task_session(struct task_struct *tsk, pid_t session) -{ - tsk->signal->__session = session; -} - -static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) -{ - tsk->signal->__pgrp = pgrp; -} - static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; @@ -1552,11 +1525,6 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk) } -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return tsk->signal->__pgrp; -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1569,11 +1537,6 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk) } -static inline pid_t task_session_nr(struct task_struct *tsk) -{ - return tsk->signal->__session; -} - static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1585,6 +1548,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +/* obsolete, do not use */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. 
diff --git a/kernel/exit.c b/kernel/exit.c index 384f09caf2ef..3bec141c82f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -357,16 +357,12 @@ static void reparent_to_kthreadd(void) void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - pid_t nr = pid_nr(pid); - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - set_task_session(curr, nr); - } - if (task_pgrp(curr) != pid) { + + if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); - set_task_pgrp(curr, nr); - } } static void set_special_pids(struct pid *pid) diff --git a/kernel/fork.c b/kernel/fork.c index adbea16ec649..f74458231449 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1265,8 +1265,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->leader_pid = pid; tty_kref_put(p->signal->tty); p->signal->tty = tty_kref_get(current->signal->tty); - set_task_pgrp(p, task_pgrp_nr(current)); - set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882a..742cefa527e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1013,10 +1013,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) if (err) goto out; - if (task_pgrp(p) != pgrp) { + if (task_pgrp(p) != pgrp) change_pid(p, PIDTYPE_PGID, pgrp); - set_task_pgrp(p, pid_nr(pgrp)); - } err = 0; out: -- cgit v1.2.3-71-gd317 From 7c5ff4f92e2b47c56d777a5adbadd9a52841b635 Mon Sep 17 00:00:00 2001 From: Harry Ciao Date: Thu, 2 Apr 2009 16:58:48 -0700 Subject: pci: Add AMD8111 PCI Bridge PCI Device ID Add the PCI Device ID of the PCI Bridge Controller on AMD8111 chip. Signed-off-by: Harry Ciao Cc: Doug Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index cb14fd260837..170f8b1f22db 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -526,6 +526,7 @@ #define PCI_DEVICE_ID_AMD_OPUS_7443 0x7443 #define PCI_DEVICE_ID_AMD_VIPER_7443 0x7443 #define PCI_DEVICE_ID_AMD_OPUS_7445 0x7445 +#define PCI_DEVICE_ID_AMD_8111_PCI 0x7460 #define PCI_DEVICE_ID_AMD_8111_LPC 0x7468 #define PCI_DEVICE_ID_AMD_8111_IDE 0x7469 #define PCI_DEVICE_ID_AMD_8111_SMBUS2 0x746a -- cgit v1.2.3-71-gd317 From 04d491ab2a53008a1aa98ac09561768c7f3adda3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 2 Apr 2009 16:58:57 -0700 Subject: kexec: add dmesg log symbols to /proc/vmcoreinfo lists It would be nice to be able to extract the dmesg log from a vmcore file without needing to keep the debug symbols for the running kernel handy all the time. We have a facility to do this in /proc/vmcore. This patch adds the log_buf and log_end symbols to the vmcoreinfo area so that tools (like makedumpfile) can easily extract the dmesg logs from a vmcore image. 
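As a rough illustration of how the exported data gets consumed: the VMCOREINFO_SYMBOL() calls above emit plain "SYMBOL(name)=<hex address>" lines into the vmcoreinfo note. A dump tool that has already pulled that note text out of the vmcore's ELF notes could then locate the log buffer with something like the following userspace sketch (the helper is hypothetical and not part of this patch or of makedumpfile):

#include <stdio.h>
#include <string.h>

/* Return the address recorded for "SYMBOL(<name>)=" in the vmcoreinfo note
 * text, or 0 if the symbol is not listed.
 */
static unsigned long long vmcoreinfo_symbol(const char *note_text,
					    const char *name)
{
	char key[64];
	const char *p;
	unsigned long long addr = 0;

	snprintf(key, sizeof(key), "SYMBOL(%s)=", name);
	p = strstr(note_text, key);
	if (p)
		sscanf(p + strlen(key), "%llx", &addr);
	return addr;
}

/*
 * vmcoreinfo_symbol(note, "log_buf") then gives the dump-side address of the
 * dmesg ring buffer; log_buf_len bounds its size, and log_end/logged_chars
 * tell the tool how much of it holds valid text.
 */
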
[akpm@linux-foundation.org: several fixes and cleanups] [akpm@linux-foundation.org: fix unused log_buf_kexec_setup()] [akpm@linux-foundation.org: build fix] Signed-off-by: Neil Horman Cc: Simon Horman Acked-by: Vivek Goyal Cc: Neil Horman Cc: Simon Horman Cc: Vivek Goyal Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 4 ++++ kernel/kexec.c | 1 + kernel/printk.c | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e720b0da7751..556d781e69fe 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -242,6 +242,7 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); +void log_buf_kexec_setup(void); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -253,6 +254,9 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } +static inline void log_buf_kexec_setup(void) +{ +} #endif extern int printk_needs_cpu(int cpu); diff --git a/kernel/kexec.c b/kernel/kexec.c index 93eed85fe017..589832aac41f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_OFFSET(vm_struct, addr); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_kexec_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); VMCOREINFO_NUMBER(PG_lru); diff --git a/kernel/printk.c b/kernel/printk.c index e3602d0755b0..a5f61a9acedb 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -135,6 +136,24 @@ static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ +#ifdef CONFIG_KEXEC +/* + * This appends the listed symbols to /proc/vmcoreinfo + * + * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to + * obtain access to symbols that are otherwise very difficult to locate. These + * symbols are specifically used so that utilities can access and extract the + * dmesg log from a vmcore file after a crash. + */ +void log_buf_kexec_setup(void) +{ + VMCOREINFO_SYMBOL(log_buf); + VMCOREINFO_SYMBOL(log_end); + VMCOREINFO_SYMBOL(log_buf_len); + VMCOREINFO_SYMBOL(logged_chars); +} +#endif + static int __init log_buf_len_setup(char *str) { unsigned size = memparse(str, &str); -- cgit v1.2.3-71-gd317 From f3554f4bc69803ac2baaf7cf2aa4339e1f4b693e Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Thu, 2 Apr 2009 16:59:23 -0700 Subject: preadv/pwritev: Add preadv and pwritev system calls. This patch adds preadv and pwritev system calls. These syscalls are a pretty straightforward combination of pread and readv (same for write). They are quite useful for doing vectored I/O in threaded applications. Using lseek+readv instead opens race windows you'll have to plug with locking. 
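For instance, a thread that needs to read a header plus a payload from an arbitrary offset of a shared descriptor can do it in one race-free call. This is a hypothetical userspace sketch, assuming a libc that already exposes the wrapper with the application-visible prototype given below; the raw syscall takes the offset split into two 32-bit halves, as described further down.

#include <sys/uio.h>

/* Read a fixed-size header and payload at "offset" without touching the
 * shared file position, so concurrent readers on the same fd need no lock.
 * Assumes the libc declares preadv() with the *BSD-compatible prototype.
 */
static ssize_t read_record(int fd, off_t offset,
			   void *hdr, size_t hdrlen,
			   void *body, size_t bodylen)
{
	struct iovec iov[2] = {
		{ .iov_base = hdr,  .iov_len = hdrlen  },
		{ .iov_base = body, .iov_len = bodylen },
	};

	return preadv(fd, iov, 2, offset);
}
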
Other systems have such system calls too, for example NetBSD, check here: http://www.daemon-systems.org/man/preadv.2.html The application-visible interface provided by glibc should look like this to be compatible to the existing implementations in the *BSD family: ssize_t preadv(int d, const struct iovec *iov, int iovcnt, off_t offset); ssize_t pwritev(int d, const struct iovec *iov, int iovcnt, off_t offset); This prototype has one problem though: On 32bit archs is the (64bit) offset argument unaligned, which the syscall ABI of several archs doesn't allow to do. At least s390 needs a wrapper in glibc to handle this. As we'll need a wrappers in glibc anyway I've decided to push problem to glibc entriely and use a syscall prototype which works without arch-specific wrappers inside the kernel: The offset argument is explicitly splitted into two 32bit values. The patch sports the actual system call implementation and the windup in the x86 system call tables. Other archs follow as separate patches. Signed-off-by: Gerd Hoffmann Cc: Arnd Bergmann Cc: Al Viro Cc: Cc: Cc: Ralf Baechle Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 ++ arch/x86/include/asm/unistd_32.h | 2 ++ arch/x86/include/asm/unistd_64.h | 4 +++ arch/x86/kernel/syscall_table_32.S | 2 ++ fs/compat.c | 36 +++++++++++++++++++++++++++ fs/read_write.c | 50 ++++++++++++++++++++++++++++++++++++++ include/linux/compat.h | 6 +++++ include/linux/syscalls.h | 4 +++ 8 files changed, 106 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index db0c803170ab..a505202086e8 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -828,4 +828,6 @@ ia32_sys_call_table: .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 + .quad compat_sys_preadv + .quad compat_sys_pwritev ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index f2bba78430a4..6e72d74cf8dc 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -338,6 +338,8 @@ #define __NR_dup3 330 #define __NR_pipe2 331 #define __NR_inotify_init1 332 +#define __NR_preadv 333 +#define __NR_pwritev 334 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d2e415e6666f..f81829462325 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -653,6 +653,10 @@ __SYSCALL(__NR_dup3, sys_dup3) __SYSCALL(__NR_pipe2, sys_pipe2) #define __NR_inotify_init1 294 __SYSCALL(__NR_inotify_init1, sys_inotify_init1) +#define __NR_preadv 295 +__SYSCALL(__NR_preadv, sys_preadv) +#define __NR_pwritev 296 +__SYSCALL(__NR_pwritev, sys_pwritev) #ifndef __NO_STUBS diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 3bdb64829b82..ff5c8736b491 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,5 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_preadv + .long sys_pwritev diff --git a/fs/compat.c b/fs/compat.c index e04b4660db84..7c1615183d1e 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1232,6 +1232,24 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, return ret; } +asmlinkage ssize_t +compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low) +{ + 
loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget(fd); + if (!file) + return -EBADF; + ret = compat_readv(file, vec, vlen, &pos); + fput(file); + return ret; +} + static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, unsigned long vlen, loff_t *pos) @@ -1269,6 +1287,24 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, return ret; } +asmlinkage ssize_t +compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget(fd); + if (!file) + return -EBADF; + ret = compat_writev(file, vec, vlen, &pos); + fput(file); + return ret; +} + asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, unsigned int nr_segs, unsigned int flags) diff --git a/fs/read_write.c b/fs/read_write.c index 400fe81c973e..6d5d8ff238aa 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -731,6 +731,56 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, u32, pos_high, u32, pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = vfs_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, u32, pos_high, u32, pos_low) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = vfs_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { diff --git a/include/linux/compat.h b/include/linux/compat.h index b880864672de..9723edd6455c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -191,6 +191,12 @@ asmlinkage ssize_t compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen); asmlinkage ssize_t compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen); +asmlinkage ssize_t compat_sys_preadv(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); +asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); int compat_do_execve(char * filename, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs * regs); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f9f900cfd066..b299a82a05e7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -461,6 +461,10 @@ asmlinkage long sys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos); 
asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos); +asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); +asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, int mode); asmlinkage long sys_chdir(const char __user *filename); -- cgit v1.2.3-71-gd317 From e8c158bb313c1df421eab7dc4299cd39cbbf5895 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 2 Apr 2009 16:59:45 -0700 Subject: Factor out #ifdefs from kernel/spinlock.c to LOCK_CONTENDED_FLAGS SGI has observed that on large systems, interrupts are not serviced for a long period of time when waiting for a rwlock. The following patch series re-enables irqs while waiting for the lock, resembling the code which is already there for spinlocks. I only made the ia64 version, because the patch adds some overhead to the fast path. I assume there is currently no demand to have this for other architectures, because the systems are not so large. Of course, the possibility to implement raw_{read|write}_lock_flags for any architecture is still there. This patch: The new macro LOCK_CONTENDED_FLAGS expands to the correct implementation depending on the config options, so that IRQ's are re-enabled when possible, but they remain disabled if CONFIG_LOCKDEP is set. Signed-off-by: Petr Tesarik Signed-off-by: Robin Holt Cc: Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 17 +++++++++++++++++ kernel/spinlock.c | 12 ++---------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 5a58ea3e91e9..da5a5a1f4cd2 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -364,6 +364,23 @@ do { \ #endif /* CONFIG_LOCK_STAT */ +#ifdef CONFIG_LOCKDEP + +/* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_*_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ + LOCK_CONTENDED((_lock), (try), (lock)) + +#else /* CONFIG_LOCKDEP */ + +#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ + lockfl((_lock), (flags)) + +#endif /* CONFIG_LOCKDEP */ + #ifdef CONFIG_GENERIC_HARDIRQS extern void early_init_irq_lock_class(void); #else diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 29ab20749dd3..7283c6dc2d59 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -299,16 +299,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - /* - * On lockdep we dont want the hand-coded irq-enable of - * _raw_spin_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#ifdef CONFIG_LOCKDEP - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -#else - _raw_spin_lock_flags(lock, &flags); -#endif + LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, + _raw_spin_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_spin_lock_irqsave_nested); -- cgit v1.2.3-71-gd317 From f5f7eac41db827a47b2163330eecd7bb55ae9f12 Mon Sep 17 00:00:00 2001 From: 
Robin Holt Date: Thu, 2 Apr 2009 16:59:46 -0700 Subject: Allow rwlocks to re-enable interrupts Pass the original flags to rwlock arch-code, so that it can re-enable interrupts if implemented for that architecture. Initially, make __raw_read_lock_flags and __raw_write_lock_flags stubs which just do the same thing as non-flags variants. Signed-off-by: Petr Tesarik Signed-off-by: Robin Holt Acked-by: Peter Zijlstra Cc: Acked-by: Ingo Molnar Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/spinlock.h | 3 +++ arch/arm/include/asm/spinlock.h | 3 +++ arch/cris/include/arch-v32/arch/spinlock.h | 2 ++ arch/ia64/include/asm/spinlock.h | 3 +++ arch/mips/include/asm/spinlock.h | 2 ++ arch/parisc/include/asm/spinlock.h | 3 +++ arch/powerpc/include/asm/spinlock.h | 3 +++ arch/s390/include/asm/spinlock.h | 3 +++ arch/sh/include/asm/spinlock.h | 3 +++ arch/sparc/include/asm/spinlock_32.h | 2 ++ arch/sparc/include/asm/spinlock_64.h | 2 ++ arch/x86/include/asm/spinlock.h | 3 +++ include/asm-m32r/spinlock.h | 3 +++ include/linux/spinlock.h | 6 ++++++ kernel/spinlock.c | 6 ++++-- 15 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index aeeb125f6851..e38fb95cb335 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -166,6 +166,9 @@ static inline void __raw_write_unlock(raw_rwlock_t * lock) lock->lock = 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 2b41ebbfa7ff..c13681ac1ede 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -217,6 +217,9 @@ static inline int __raw_read_trylock(raw_rwlock_t *rw) /* read_can_lock - would read_trylock() succeed? 
*/ #define __raw_read_can_lock(x) ((x)->lock < 0x80000000) +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/cris/include/arch-v32/arch/spinlock.h b/arch/cris/include/arch-v32/arch/spinlock.h index 0d5709b983a1..129756b96661 100644 --- a/arch/cris/include/arch-v32/arch/spinlock.h +++ b/arch/cris/include/arch-v32/arch/spinlock.h @@ -121,6 +121,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return 1; } +#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) +#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 0229fb95fb38..0a619618b2fa 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -213,6 +213,9 @@ static inline int __raw_read_trylock(raw_rwlock_t *x) return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index 10e82441b496..5b60a09a0f08 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -480,6 +480,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return ret; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index f3d2090a18dc..fae03e136fa8 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -187,6 +187,9 @@ static __inline__ int __raw_write_can_lock(raw_rwlock_t *rw) return !rw->counter; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 36864364e601..c3b193121f81 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -287,6 +287,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) rw->lock = 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) __spin_yield(lock) #define _raw_read_relax(lock) __rw_yield(lock) #define _raw_write_relax(lock) __rw_yield(lock) diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index df84ae96915f..f3861b09ebb0 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -172,6 +172,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return _raw_write_trylock_retry(rw); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define 
__raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/sh/include/asm/spinlock.h b/arch/sh/include/asm/spinlock.h index e793181d64da..60283565f89b 100644 --- a/arch/sh/include/asm/spinlock.h +++ b/arch/sh/include/asm/spinlock.h @@ -216,6 +216,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return (oldval > (RW_LOCK_BIAS - 1)); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index bf2d532593e3..46f91ab66a50 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -177,6 +177,8 @@ static inline int __read_trylock(raw_rwlock_t *rw) #define __raw_write_unlock(rw) do { (rw)->lock = 0; } while(0) #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) +#define __raw_read_lock_flags(rw, flags) __raw_read_lock(rw) +#define __raw_write_lock_flags(rw, flags) __raw_write_lock(rw) #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h index c4d274d330e9..f6b2b92ad8d2 100644 --- a/arch/sparc/include/asm/spinlock_64.h +++ b/arch/sparc/include/asm/spinlock_64.h @@ -211,9 +211,11 @@ static int inline __write_trylock(raw_rwlock_t *lock) } #define __raw_read_lock(p) __read_lock(p) +#define __raw_read_lock_flags(p, f) __read_lock(p) #define __raw_read_trylock(p) __read_trylock(p) #define __raw_read_unlock(p) __read_unlock(p) #define __raw_write_lock(p) __write_lock(p) +#define __raw_write_lock_flags(p, f) __write_lock(p) #define __raw_write_unlock(p) __write_unlock(p) #define __raw_write_trylock(p) __write_trylock(p) diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 3a5696656680..e5e6caffec87 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -295,6 +295,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/include/asm-m32r/spinlock.h b/include/asm-m32r/spinlock.h index f5cfba81ee10..dded923883b2 100644 --- a/include/asm-m32r/spinlock.h +++ b/include/asm-m32r/spinlock.h @@ -316,6 +316,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock) return 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index a0c66a2e00ad..252b245cfcf4 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -153,9 +153,11 @@ do { \ extern int _raw_spin_trylock(spinlock_t *lock); extern void _raw_spin_unlock(spinlock_t *lock); extern void _raw_read_lock(rwlock_t *lock); +#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) extern int 
_raw_read_trylock(rwlock_t *lock); extern void _raw_read_unlock(rwlock_t *lock); extern void _raw_write_lock(rwlock_t *lock); +#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) extern int _raw_write_trylock(rwlock_t *lock); extern void _raw_write_unlock(rwlock_t *lock); #else @@ -165,9 +167,13 @@ do { \ # define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) # define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) # define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) +# define _raw_read_lock_flags(lock, flags) \ + __raw_read_lock_flags(&(lock)->raw_lock, *(flags)) # define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) # define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) # define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) +# define _raw_write_lock_flags(lock, flags) \ + __raw_write_lock_flags(&(lock)->raw_lock, *(flags)) # define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock) # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) #endif diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 7283c6dc2d59..7932653c4ebd 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock, + _raw_read_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_read_lock_irqsave); @@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock, + _raw_write_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_write_lock_irqsave); -- cgit v1.2.3-71-gd317 From f7ab34ea723ed304b19698efca85d6f40cecd99b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 3 Apr 2009 01:34:35 -0400 Subject: ext3: Add replace-on-truncate hueristics for data=writeback mode In data=writeback mode, start an asynchronous flush when closing a file which had been previously truncated down to zero. This lowers the probability of data loss in the case of applications that attempt to replace a file using truncate. 
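The pattern being targeted looks roughly like the sketch below (hypothetical application code, not part of the patch): opening with O_TRUNC truncates the file to zero, which in data=writeback mode now marks the inode for flush-on-close, and the final close() kicks off the asynchronous flush, shrinking the window in which a crash would leave an empty file.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* "Replace" a file by truncating it and rewriting its contents in place. */
static int rewrite_file(const char *path, const char *contents)
{
	size_t len = strlen(contents);
	int fd = open(path, O_WRONLY | O_TRUNC);

	if (fd < 0)
		return -1;
	if (write(fd, contents, len) != (ssize_t)len) {
		close(fd);
		return -1;
	}
	return close(fd);	/* async flush is started here */
}
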
Signed-off-by: "Theodore Ts'o" --- fs/ext3/file.c | 4 ++++ fs/ext3/inode.c | 3 +++ include/linux/ext3_fs.h | 1 + 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 3be1e0689c9a..4a04cbb1c231 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -33,6 +33,10 @@ */ static int ext3_release_file (struct inode * inode, struct file * filp) { + if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { + filemap_flush(inode->i_mapping); + EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; + } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5fa453b49a64..0f5bca0d82fc 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2346,6 +2346,9 @@ void ext3_truncate(struct inode *inode) if (!ext3_can_truncate(inode)) return; + if (inode->i_size == 0 && ext3_should_writeback_data(inode)) + ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; + /* * We have to lock the EOF page here, because lock_page() nests * outside journal_start(). diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index dd495b8c3091..d2630c56cb34 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -208,6 +208,7 @@ static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ #define EXT3_STATE_XATTR 0x00000004 /* has in-inode xattrs */ +#define EXT3_STATE_FLUSH_ON_CLOSE 0x00000008 /* Used to pass group descriptor data when online resize is done */ struct ext3_new_group_input { -- cgit v1.2.3-71-gd317 From 07fe7cb7c7c179f473fd9c823348fd3eb5dad369 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Create a dynamically sized pool of threads for doing very slow work items Create a dynamically sized pool of threads for doing very slow work items, such as invoking mkdir() or rmdir() - things that may take a long time and may sleep, holding mutexes/semaphores and hogging a thread, and are thus unsuitable for workqueues. The number of threads is always at least a settable minimum, but more are started when there's more work to do, up to a limit. Because of the nature of the load, it's not suitable for a 1-thread-per-CPU type pool. A system with one CPU may well want several threads. This is used by FS-Cache to do slow caching operations in the background, such as looking up, creating or deleting cache objects. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/slow-work.h | 88 +++++++++++ init/Kconfig | 12 ++ kernel/Makefile | 1 + kernel/slow-work.c | 388 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 489 insertions(+) create mode 100644 include/linux/slow-work.h create mode 100644 kernel/slow-work.c (limited to 'include/linux') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h new file mode 100644 index 000000000000..4dd754af393e --- /dev/null +++ b/include/linux/slow-work.h @@ -0,0 +1,88 @@ +/* Worker thread pool for slow items, such as filesystem lookups or mkdirs + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_SLOW_WORK_H +#define _LINUX_SLOW_WORK_H + +#ifdef CONFIG_SLOW_WORK + +struct slow_work; + +/* + * The operations used to support slow work items + */ +struct slow_work_ops { + /* get a ref on a work item + * - return 0 if successful, -ve if not + */ + int (*get_ref)(struct slow_work *work); + + /* discard a ref to a work item */ + void (*put_ref)(struct slow_work *work); + + /* execute a work item */ + void (*execute)(struct slow_work *work); +}; + +/* + * A slow work item + * - A reference is held on the parent object by the thread pool when it is + * queued + */ +struct slow_work { + unsigned long flags; +#define SLOW_WORK_PENDING 0 /* item pending (further) execution */ +#define SLOW_WORK_EXECUTING 1 /* item currently executing */ +#define SLOW_WORK_ENQ_DEFERRED 2 /* item enqueue deferred */ +#define SLOW_WORK_VERY_SLOW 3 /* item is very slow */ + const struct slow_work_ops *ops; /* operations table for this item */ + struct list_head link; /* link in queue */ +}; + +/** + * slow_work_init - Initialise a slow work item + * @work: The work item to initialise + * @ops: The operations to use to handle the slow work item + * + * Initialise a slow work item. + */ +static inline void slow_work_init(struct slow_work *work, + const struct slow_work_ops *ops) +{ + work->flags = 0; + work->ops = ops; + INIT_LIST_HEAD(&work->link); +} + +/** + * slow_work_init - Initialise a very slow work item + * @work: The work item to initialise + * @ops: The operations to use to handle the slow work item + * + * Initialise a very slow work item. This item will be restricted such that + * only a certain number of the pool threads will be able to execute items of + * this type. + */ +static inline void vslow_work_init(struct slow_work *work, + const struct slow_work_ops *ops) +{ + work->flags = 1 << SLOW_WORK_VERY_SLOW; + work->ops = ops; + INIT_LIST_HEAD(&work->link); +} + +extern int slow_work_enqueue(struct slow_work *work); +extern int slow_work_register_user(void); +extern void slow_work_unregister_user(void); + + +#endif /* CONFIG_SLOW_WORK */ +#endif /* _LINUX_SLOW_WORK_H */ diff --git a/init/Kconfig b/init/Kconfig index 1398a14b0191..236a79377b8e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1014,6 +1014,18 @@ config MARKERS source "arch/Kconfig" +config SLOW_WORK + default n + bool "Enable slow work thread pool" + help + The slow work thread pool provides a number of dynamically allocated + threads that can be used by the kernel to perform operations that + take a relatively long time. + + An example of this would be CacheFiles doing a path lookup followed + by a series of mkdirs and a create call, all of which have to touch + disk. 
+ endmenu # General setup config HAVE_GENERIC_DMA_COHERENT diff --git a/kernel/Makefile b/kernel/Makefile index e4791b3ba55d..bab1dffe37e9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_SLOW_WORK) += slow-work.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/slow-work.c b/kernel/slow-work.c new file mode 100644 index 000000000000..5a7392734c82 --- /dev/null +++ b/kernel/slow-work.c @@ -0,0 +1,388 @@ +/* Worker thread pool for slow items, such as filesystem lookups or mkdirs + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +/* + * The pool of threads has at least min threads in it as long as someone is + * using the facility, and may have as many as max. + * + * A portion of the pool may be processing very slow operations. + */ +static unsigned slow_work_min_threads = 2; +static unsigned slow_work_max_threads = 4; +static unsigned vslow_work_proportion = 50; /* % of threads that may process + * very slow work */ +static atomic_t slow_work_thread_count; +static atomic_t vslow_work_executing_count; + +/* + * The queues of work items and the lock governing access to them. These are + * shared between all the CPUs. It doesn't make sense to have per-CPU queues + * as the number of threads bears no relation to the number of CPUs. + * + * There are two queues of work items: one for slow work items, and one for + * very slow work items. + */ +static LIST_HEAD(slow_work_queue); +static LIST_HEAD(vslow_work_queue); +static DEFINE_SPINLOCK(slow_work_queue_lock); + +/* + * The thread controls. A variable used to signal to the threads that they + * should exit when the queue is empty, a waitqueue used by the threads to wait + * for signals, and a completion set by the last thread to exit. + */ +static bool slow_work_threads_should_exit; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); +static DECLARE_COMPLETION(slow_work_last_thread_exited); + +/* + * The number of users of the thread pool and its lock. Whilst this is zero we + * have no threads hanging around, and when this reaches zero, we wait for all + * active or queued work items to complete and kill all the threads we do have. + */ +static int slow_work_user_count; +static DEFINE_MUTEX(slow_work_user_lock); + +/* + * Calculate the maximum number of active threads in the pool that are + * permitted to process very slow work items. + * + * The answer is rounded up to at least 1, but may not equal or exceed the + * maximum number of the threads in the pool. This means we always have at + * least one thread that can process slow work items, and we always have at + * least one thread that won't get tied up doing so. 
+ */ +static unsigned slow_work_calc_vsmax(void) +{ + unsigned vsmax; + + vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; + vsmax /= 100; + vsmax = max(vsmax, 1U); + return min(vsmax, slow_work_max_threads - 1); +} + +/* + * Attempt to execute stuff queued on a slow thread. Return true if we managed + * it, false if there was nothing to do. + */ +static bool slow_work_execute(void) +{ + struct slow_work *work = NULL; + unsigned vsmax; + bool very_slow; + + vsmax = slow_work_calc_vsmax(); + + /* find something to execute */ + spin_lock_irq(&slow_work_queue_lock); + if (!list_empty(&vslow_work_queue) && + atomic_read(&vslow_work_executing_count) < vsmax) { + work = list_entry(vslow_work_queue.next, + struct slow_work, link); + if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) + BUG(); + list_del_init(&work->link); + atomic_inc(&vslow_work_executing_count); + very_slow = true; + } else if (!list_empty(&slow_work_queue)) { + work = list_entry(slow_work_queue.next, + struct slow_work, link); + if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) + BUG(); + list_del_init(&work->link); + very_slow = false; + } else { + very_slow = false; /* avoid the compiler warning */ + } + spin_unlock_irq(&slow_work_queue_lock); + + if (!work) + return false; + + if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) + BUG(); + + work->ops->execute(work); + + if (very_slow) + atomic_dec(&vslow_work_executing_count); + clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + + /* if someone tried to enqueue the item whilst we were executing it, + * then it'll be left unenqueued to avoid multiple threads trying to + * execute it simultaneously + * + * there is, however, a race between us testing the pending flag and + * getting the spinlock, and between the enqueuer setting the pending + * flag and getting the spinlock, so we use a deferral bit to tell us + * if the enqueuer got there first + */ + if (test_bit(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irq(&slow_work_queue_lock); + + if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && + test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) + goto auto_requeue; + + spin_unlock_irq(&slow_work_queue_lock); + } + + work->ops->put_ref(work); + return true; + +auto_requeue: + /* we must complete the enqueue operation + * - we transfer our ref on the item back to the appropriate queue + * - don't wake another thread up as we're awake already + */ + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) + list_add_tail(&work->link, &vslow_work_queue); + else + list_add_tail(&work->link, &slow_work_queue); + spin_unlock_irq(&slow_work_queue_lock); + return true; +} + +/** + * slow_work_enqueue - Schedule a slow work item for processing + * @work: The work item to queue + * + * Schedule a slow work item for processing. If the item is already undergoing + * execution, this guarantees not to re-enter the execution routine until the + * first execution finishes. + * + * The item is pinned by this function as it retains a reference to it, managed + * through the item operations. The item is unpinned once it has been + * executed. + * + * An item may hog the thread that is running it for a relatively large amount + * of time, sufficient, for example, to perform several lookup, mkdir, create + * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. + * + * Conversely, if a number of items are awaiting processing, it may take some + * time before any given item is given attention. 
The number of threads in the + * pool may be increased to deal with demand, but only up to a limit. + * + * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in + * the very slow queue, from which only a portion of the threads will be + * allowed to pick items to execute. This ensures that very slow items won't + * overly block ones that are just ordinarily slow. + * + * Returns 0 if successful, -EAGAIN if not. + */ +int slow_work_enqueue(struct slow_work *work) +{ + unsigned long flags; + + BUG_ON(slow_work_user_count <= 0); + BUG_ON(!work); + BUG_ON(!work->ops); + BUG_ON(!work->ops->get_ref); + + /* when honouring an enqueue request, we only promise that we will run + * the work function in the future; we do not promise to run it once + * per enqueue request + * + * we use the PENDING bit to merge together repeat requests without + * having to disable IRQs and take the spinlock, whilst still + * maintaining our promise + */ + if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irqsave(&slow_work_queue_lock, flags); + + /* we promise that we will not attempt to execute the work + * function in more than one thread simultaneously + * + * this, however, leaves us with a problem if we're asked to + * enqueue the work whilst someone is executing the work + * function as simply queueing the work immediately means that + * another thread may try executing it whilst it is already + * under execution + * + * to deal with this, we set the ENQ_DEFERRED bit instead of + * enqueueing, and the thread currently executing the work + * function will enqueue the work item when the work function + * returns and it has cleared the EXECUTING bit + */ + if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { + set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); + } else { + if (work->ops->get_ref(work) < 0) + goto cant_get_ref; + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) + list_add_tail(&work->link, &vslow_work_queue); + else + list_add_tail(&work->link, &slow_work_queue); + wake_up(&slow_work_thread_wq); + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + } + return 0; + +cant_get_ref: + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return -EAGAIN; +} +EXPORT_SYMBOL(slow_work_enqueue); + +/* + * Determine if there is slow work available for dispatch + */ +static inline bool slow_work_available(int vsmax) +{ + return !list_empty(&slow_work_queue) || + (!list_empty(&vslow_work_queue) && + atomic_read(&vslow_work_executing_count) < vsmax); +} + +/* + * Worker thread dispatcher + */ +static int slow_work_thread(void *_data) +{ + int vsmax; + + DEFINE_WAIT(wait); + + set_freezable(); + set_user_nice(current, -5); + + for (;;) { + vsmax = vslow_work_proportion; + vsmax *= atomic_read(&slow_work_thread_count); + vsmax /= 100; + + prepare_to_wait(&slow_work_thread_wq, &wait, + TASK_INTERRUPTIBLE); + if (!freezing(current) && + !slow_work_threads_should_exit && + !slow_work_available(vsmax)) + schedule(); + finish_wait(&slow_work_thread_wq, &wait); + + try_to_freeze(); + + vsmax = vslow_work_proportion; + vsmax *= atomic_read(&slow_work_thread_count); + vsmax /= 100; + + if (slow_work_available(vsmax) && slow_work_execute()) { + cond_resched(); + continue; + } + + if (slow_work_threads_should_exit) + break; + } + + if (atomic_dec_and_test(&slow_work_thread_count)) + complete_and_exit(&slow_work_last_thread_exited, 0); + return 0; +} + +/** + * slow_work_register_user - Register a user of the facility + * + * Register a user of the facility, starting up 
the initial threads if there + * aren't any other users at this point. This will return 0 if successful, or + * an error if not. + */ +int slow_work_register_user(void) +{ + struct task_struct *p; + int loop; + + mutex_lock(&slow_work_user_lock); + + if (slow_work_user_count == 0) { + printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); + init_completion(&slow_work_last_thread_exited); + + slow_work_threads_should_exit = false; + + /* start the minimum number of threads */ + for (loop = 0; loop < slow_work_min_threads; loop++) { + atomic_inc(&slow_work_thread_count); + p = kthread_run(slow_work_thread, NULL, "kslowd"); + if (IS_ERR(p)) + goto error; + } + printk(KERN_NOTICE "Slow work thread pool: Ready\n"); + } + + slow_work_user_count++; + mutex_unlock(&slow_work_user_lock); + return 0; + +error: + if (atomic_dec_and_test(&slow_work_thread_count)) + complete(&slow_work_last_thread_exited); + if (loop > 0) { + printk(KERN_ERR "Slow work thread pool:" + " Aborting startup on ENOMEM\n"); + slow_work_threads_should_exit = true; + wake_up_all(&slow_work_thread_wq); + wait_for_completion(&slow_work_last_thread_exited); + printk(KERN_ERR "Slow work thread pool: Aborted\n"); + } + mutex_unlock(&slow_work_user_lock); + return PTR_ERR(p); +} +EXPORT_SYMBOL(slow_work_register_user); + +/** + * slow_work_unregister_user - Unregister a user of the facility + * + * Unregister a user of the facility, killing all the threads if this was the + * last one. + */ +void slow_work_unregister_user(void) +{ + mutex_lock(&slow_work_user_lock); + + BUG_ON(slow_work_user_count <= 0); + + slow_work_user_count--; + if (slow_work_user_count == 0) { + printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); + slow_work_threads_should_exit = true; + wake_up_all(&slow_work_thread_wq); + wait_for_completion(&slow_work_last_thread_exited); + printk(KERN_NOTICE "Slow work thread pool:" + " Shut down complete\n"); + } + + mutex_unlock(&slow_work_user_lock); +} +EXPORT_SYMBOL(slow_work_unregister_user); + +/* + * Initialise the slow work facility + */ +static int __init init_slow_work(void) +{ + unsigned nr_cpus = num_possible_cpus(); + + if (nr_cpus > slow_work_max_threads) + slow_work_max_threads = nr_cpus; + return 0; +} + +subsys_initcall(init_slow_work); -- cgit v1.2.3-71-gd317 From 12e22c5e4bc08ab4b05ac079fe40d9891c5e81a0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Make the slow work pool configurable Make the slow work pool configurable through /proc/sys/kernel/slow-work. (*) /proc/sys/kernel/slow-work/min-threads The minimum number of threads that should be in the pool as long as it is in use. This may be anywhere between 2 and max-threads. (*) /proc/sys/kernel/slow-work/max-threads The maximum number of threads that should in the pool. This may be anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater. (*) /proc/sys/kernel/slow-work/vslow-percentage The percentage of active threads in the pool that may be used to execute very slow work items. This may be between 1 and 99. The resultant number is bounded to between 1 and one fewer than the number of active threads. This ensures there is always at least one thread that can process very slow work items, and always at least one thread that won't. 
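Putting the previous patch and this one together, a minimal in-kernel user might look like the sketch below (the object, its callbacks and the trivial reference handling are hypothetical, not taken from FS-Cache or from these patches):

#include <linux/kernel.h>
#include <linux/slow-work.h>

struct my_object {
	struct slow_work work;
	/* ... caller's data ... */
};

/* The object in this sketch has static lifetime, so the reference
 * callbacks only have to say "yes".
 */
static int my_get_ref(struct slow_work *work)
{
	return 0;
}

static void my_put_ref(struct slow_work *work)
{
}

static void my_execute(struct slow_work *work)
{
	struct my_object *obj = container_of(work, struct my_object, work);

	/* long-running, sleepable work (lookups, mkdir, create, ...) */
	(void)obj;
}

static const struct slow_work_ops my_ops = {
	.get_ref = my_get_ref,
	.put_ref = my_put_ref,
	.execute = my_execute,
};

static struct my_object my_obj;

static int my_start(void)
{
	int ret = slow_work_register_user();

	if (ret < 0)
		return ret;
	slow_work_init(&my_obj.work, &my_ops);	/* ordinarily slow class */
	return slow_work_enqueue(&my_obj.work);
}

At runtime the pool this feeds can then be resized through the files added above, for example by writing to /proc/sys/kernel/slow-work/max-threads or vslow-percentage, within the bounds enforced by the handlers in kernel/slow-work.c.
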
Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/slow-work.h | 5 ++ kernel/slow-work.c | 118 +++++++++++++++++++++++++++++++++++++++++++++- kernel/sysctl.c | 9 ++++ 3 files changed, 130 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index 4dd754af393e..8262809dfa8b 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -14,6 +14,8 @@ #ifdef CONFIG_SLOW_WORK +#include + struct slow_work; /* @@ -83,6 +85,9 @@ extern int slow_work_enqueue(struct slow_work *work); extern int slow_work_register_user(void); extern void slow_work_unregister_user(void); +#ifdef CONFIG_SYSCTL +extern ctl_table slow_work_sysctls[]; +#endif #endif /* CONFIG_SLOW_WORK */ #endif /* _LINUX_SLOW_WORK_H */ diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 454abb21c8bd..3f65900aa3cb 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -14,7 +14,6 @@ #include #include #include -#include #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of * things to do */ @@ -24,6 +23,14 @@ static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); +#ifdef CONFIG_SYSCTL +static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); + +static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, + void __user *, size_t *, loff_t *); +#endif + /* * The pool of threads has at least min threads in it as long as someone is * using the facility, and may have as many as max. @@ -34,6 +41,51 @@ static unsigned slow_work_min_threads = 2; static unsigned slow_work_max_threads = 4; static unsigned vslow_work_proportion = 50; /* % of threads that may process * very slow work */ + +#ifdef CONFIG_SYSCTL +static const int slow_work_min_min_threads = 2; +static int slow_work_max_max_threads = 255; +static const int slow_work_min_vslow = 1; +static const int slow_work_max_vslow = 99; + +ctl_table slow_work_sysctls[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "min-threads", + .data = &slow_work_min_threads, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = slow_work_min_threads_sysctl, + .extra1 = (void *) &slow_work_min_min_threads, + .extra2 = &slow_work_max_threads, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max-threads", + .data = &slow_work_max_threads, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = slow_work_max_threads_sysctl, + .extra1 = &slow_work_min_threads, + .extra2 = (void *) &slow_work_max_max_threads, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "vslow-percentage", + .data = &vslow_work_proportion, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *) &slow_work_min_vslow, + .extra2 = (void *) &slow_work_max_vslow, + }, + { .ctl_name = 0 } +}; +#endif + +/* + * The active state of the thread pool + */ static atomic_t slow_work_thread_count; static atomic_t vslow_work_executing_count; @@ -427,6 +479,64 @@ static void slow_work_oom_timeout(unsigned long data) slow_work_may_not_start_new_thread = false; } +#ifdef CONFIG_SYSCTL +/* + * Handle adjustment of the minimum number of threads + */ +static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = 
proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int n; + + if (ret == 0) { + mutex_lock(&slow_work_user_lock); + if (slow_work_user_count > 0) { + /* see if we need to start or stop threads */ + n = atomic_read(&slow_work_thread_count) - + slow_work_min_threads; + + if (n < 0 && !slow_work_may_not_start_new_thread) + slow_work_enqueue(&slow_work_new_thread); + else if (n > 0) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + } + mutex_unlock(&slow_work_user_lock); + } + + return ret; +} + +/* + * Handle adjustment of the maximum number of threads + */ +static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int n; + + if (ret == 0) { + mutex_lock(&slow_work_user_lock); + if (slow_work_user_count > 0) { + /* see if we need to stop threads */ + n = slow_work_max_threads - + atomic_read(&slow_work_thread_count); + + if (n < 0) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + } + mutex_unlock(&slow_work_user_lock); + } + + return ret; +} +#endif /* CONFIG_SYSCTL */ + /** * slow_work_register_user - Register a user of the facility * @@ -516,8 +626,12 @@ static int __init init_slow_work(void) { unsigned nr_cpus = num_possible_cpus(); - if (nr_cpus > slow_work_max_threads) + if (slow_work_max_threads < nr_cpus) slow_work_max_threads = nr_cpus; +#ifdef CONFIG_SYSCTL + if (slow_work_max_max_threads < nr_cpus * 2) + slow_work_max_max_threads = nr_cpus * 2; +#endif return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ec4543dfc06..82350f8f04f6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -897,6 +898,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &scan_unevictable_handler, }, #endif +#ifdef CONFIG_SLOW_WORK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "slow-work", + .mode = 0555, + .child = slow_work_sysctls, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3-71-gd317 From 8f0aa2f25b31ba27db84259141e52ee6ec0d2820 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Document the slow work thread pool Document the slow work thread pool. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- Documentation/slow-work.txt | 174 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/slow-work.h | 2 + kernel/slow-work.c | 2 + 3 files changed, 178 insertions(+) create mode 100644 Documentation/slow-work.txt (limited to 'include/linux') diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt new file mode 100644 index 000000000000..ebc50f808ea4 --- /dev/null +++ b/Documentation/slow-work.txt @@ -0,0 +1,174 @@ + ==================================== + SLOW WORK ITEM EXECUTION THREAD POOL + ==================================== + +By: David Howells + +The slow work item execution thread pool is a pool of threads for performing +things that take a relatively long time, such as making mkdir calls. +Typically, when processing something, these items will spend a lot of time +blocking a thread on I/O, thus making that thread unavailable for doing other +work. 
+ +The standard workqueue model is unsuitable for this class of work item as that +limits the owner to a single thread or a single thread per CPU. For some +tasks, however, more threads - or fewer - are required. + +There is just one pool per system. It contains no threads unless something +wants to use it - and that something must register its interest first. When +the pool is active, the number of threads it contains is dynamic, varying +between a maximum and minimum setting, depending on the load. + + +==================== +CLASSES OF WORK ITEM +==================== + +This pool support two classes of work items: + + (*) Slow work items. + + (*) Very slow work items. + +The former are expected to finish much quicker than the latter. + +An operation of the very slow class may do a batch combination of several +lookups, mkdirs, and a create for instance. + +An operation of the ordinarily slow class may, for example, write stuff or +expand files, provided the time taken to do so isn't too long. + +Operations of both types may sleep during execution, thus tying up the thread +loaned to it. + + +THREAD-TO-CLASS ALLOCATION +-------------------------- + +Not all the threads in the pool are available to work on very slow work items. +The number will be between one and one fewer than the number of active threads. +This is configurable (see the "Pool Configuration" section). + +All the threads are available to work on ordinarily slow work items, but a +percentage of the threads will prefer to work on very slow work items. + +The configuration ensures that at least one thread will be available to work on +very slow work items, and at least one thread will be available that won't work +on very slow work items at all. + + +===================== +USING SLOW WORK ITEMS +===================== + +Firstly, a module or subsystem wanting to make use of slow work items must +register its interest: + + int ret = slow_work_register_user(); + +This will return 0 if successful, or a -ve error upon failure. + + +Slow work items may then be set up by: + + (1) Declaring a slow_work struct type variable: + + #include + + struct slow_work myitem; + + (2) Declaring the operations to be used for this item: + + struct slow_work_ops myitem_ops = { + .get_ref = myitem_get_ref, + .put_ref = myitem_put_ref, + .execute = myitem_execute, + }; + + [*] For a description of the ops, see section "Item Operations". + + (3) Initialising the item: + + slow_work_init(&myitem, &myitem_ops); + + or: + + vslow_work_init(&myitem, &myitem_ops); + + depending on its class. + +A suitably set up work item can then be enqueued for processing: + + int ret = slow_work_enqueue(&myitem); + +This will return a -ve error if the thread pool is unable to gain a reference +on the item, 0 otherwise. + + +The items are reference counted, so there ought to be no need for a flush +operation. When all a module's slow work items have been processed, and the +module has no further interest in the facility, it should unregister its +interest: + + slow_work_unregister_user(); + + +=============== +ITEM OPERATIONS +=============== + +Each work item requires a table of operations of type struct slow_work_ops. +All members are required: + + (*) Get a reference on an item: + + int (*get_ref)(struct slow_work *work); + + This allows the thread pool to attempt to pin an item by getting a + reference on it. This function should return 0 if the reference was + granted, or a -ve error otherwise. If an error is returned, + slow_work_enqueue() will fail. 
+ + The reference is held whilst the item is queued and whilst it is being + executed. The item may then be requeued with the same reference held, or + the reference will be released. + + (*) Release a reference on an item: + + void (*put_ref)(struct slow_work *work); + + This allows the thread pool to unpin an item by releasing the reference on + it. The thread pool will not touch the item again once this has been + called. + + (*) Execute an item: + + void (*execute)(struct slow_work *work); + + This should perform the work required of the item. It may sleep, it may + perform disk I/O and it may wait for locks. + + +================== +POOL CONFIGURATION +================== + +The slow-work thread pool has a number of configurables: + + (*) /proc/sys/kernel/slow-work/min-threads + + The minimum number of threads that should be in the pool whilst it is in + use. This may be anywhere between 2 and max-threads. + + (*) /proc/sys/kernel/slow-work/max-threads + + The maximum number of threads that should in the pool. This may be + anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater. + + (*) /proc/sys/kernel/slow-work/vslow-percentage + + The percentage of active threads in the pool that may be used to execute + very slow work items. This may be between 1 and 99. The resultant number + is bounded to between 1 and one fewer than the number of active threads. + This ensures there is always at least one thread that can process very + slow work items, and always at least one thread that won't. diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index 8262809dfa8b..85958277f83d 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -7,6 +7,8 @@ * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. + * + * See Documentation/slow-work.txt */ #ifndef _LINUX_SLOW_WORK_H diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 3f65900aa3cb..cf2bc01186ef 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -7,6 +7,8 @@ * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. + * + * See Documentation/slow-work.txt */ #include -- cgit v1.2.3-71-gd317 From 03fb3d2af96c2783c3a5bc03f3d984cf422f0e69 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: FS-Cache: Release page->private after failed readahead The attached patch causes read_cache_pages() to release page-private data on a page for which add_to_page_cache() fails. If the filler function fails, then the problematic page is left attached to the pagecache (with appropriate flags set, one presumes) and the remaining to-be-attached pages are invalidated and discarded. This permits pages with caching references associated with them to be cleaned up. The invalidatepage() address space op is called (indirectly) to do the honours. 
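For context, a netfs normally reaches read_cache_pages() from its readpages()
address space operation.  A minimal sketch of that call pattern follows; the
mynetfs_* names are hypothetical, while the read_cache_pages() and readpages()
signatures are as in this kernel:

	/*
	 * Hypothetical readpages() implementation.  With this patch, if
	 * attaching a page or running the filler fails part-way through,
	 * read_cache_pages() invalidates the remaining pages - giving the fs
	 * a chance to release page->private data - instead of just dropping
	 * them.
	 */
	static int mynetfs_fill_page(void *data, struct page *page)
	{
		struct file *file = data;

		return mynetfs_readpage(file, page);	/* hypothetical reader */
	}

	static int mynetfs_readpages(struct file *file,
				     struct address_space *mapping,
				     struct list_head *pages,
				     unsigned nr_pages)
	{
		return read_cache_pages(mapping, pages, mynetfs_fill_page, file);
	}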
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Rik van Riel Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/page-flags.h | 2 +- mm/readahead.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 61df1779b2a5..9d99e7471ade 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -182,7 +182,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } struct page; /* forward declaration */ -TESTPAGEFLAG(Locked, locked) +TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) PAGEFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) diff --git a/mm/readahead.c b/mm/readahead.c index 9ce303d4b810..6be927569cf6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -31,6 +31,41 @@ EXPORT_SYMBOL_GPL(file_ra_state_init); #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) +/* + * see if a page needs releasing upon read_cache_pages() failure + * - the caller of read_cache_pages() may have set PG_private before calling, + * such as the NFS fs marking pages that are cached locally on disk, thus we + * need to give the fs a chance to clean up in the event of an error + */ +static void read_cache_pages_invalidate_page(struct address_space *mapping, + struct page *page) +{ + if (PagePrivate(page)) { + if (!trylock_page(page)) + BUG(); + page->mapping = mapping; + do_invalidatepage(page, 0); + page->mapping = NULL; + unlock_page(page); + } + page_cache_release(page); +} + +/* + * release a list of pages, invalidating them first if need be + */ +static void read_cache_pages_invalidate_pages(struct address_space *mapping, + struct list_head *pages) +{ + struct page *victim; + + while (!list_empty(pages)) { + victim = list_to_page(pages); + list_del(&victim->lru); + read_cache_pages_invalidate_page(mapping, victim); + } +} + /** * read_cache_pages - populate an address space with some pages & start reads against them * @mapping: the address_space @@ -52,14 +87,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) { - page_cache_release(page); + read_cache_pages_invalidate_page(mapping, page); continue; } page_cache_release(page); ret = filler(data, page); if (unlikely(ret)) { - put_pages_list(pages); + read_cache_pages_invalidate_pages(mapping, pages); break; } task_io_account_read(PAGE_CACHE_SIZE); -- cgit v1.2.3-71-gd317 From 266cf658efcf6ac33541a46740f74f50c79d2b6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:36 +0100 Subject: FS-Cache: Recruit a page flags for cache management Recruit a page flag to aid in cache management. The following extra flag is defined: (1) PG_fscache (PG_private_2) The marked page is backed by a local cache and is pinning resources in the cache driver. If PG_fscache is set, then things that checked for PG_private will now also check for that. This includes things like truncation and page invalidation. The function page_has_private() had been added to make the checks for both PG_private and PG_private_2 at the same time. 
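As a short sketch of how the new bit is intended to be used (the mynetfs_*
wrappers are hypothetical; the accessors come from the page-flags.h changes in
this patch):

	/*
	 * Illustrative only: mark and check pages that are backed by a local
	 * cache using PG_private_2 (PG_fscache) and page_has_private().
	 */
	static inline void mynetfs_mark_page_cached(struct page *page)
	{
		SetPagePrivate2(page);		/* aka PG_fscache */
	}

	static inline void mynetfs_page_done_with_cache(struct page *page)
	{
		ClearPagePrivate2(page);
	}

	static inline int mynetfs_page_needs_release(struct page *page)
	{
		/* true if either PG_private or PG_private_2 is set */
		return page_has_private(page);
	}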
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Rik van Riel Acked-by: Al Viro Tested-by: Daire Byrne --- fs/splice.c | 3 ++- include/linux/page-flags.h | 38 +++++++++++++++++++++++++++++++++----- mm/filemap.c | 3 +++ mm/migrate.c | 10 +++++----- mm/readahead.c | 9 +++++---- mm/swap.c | 4 ++-- mm/truncate.c | 10 +++++----- mm/vmscan.c | 6 +++--- 8 files changed, 58 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index 4ed0ba44a966..dd727d43e5b7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ wait_on_page_writeback(page); - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) goto out_unlock; /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9d99e7471ade..62214c7d2d93 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -82,6 +82,7 @@ enum pageflags { PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ + PG_private_2, /* If pagecache, has fs aux data */ PG_writeback, /* Page is under writeback */ #ifdef CONFIG_PAGEFLAGS_EXTENDED PG_head, /* A head page */ @@ -108,6 +109,12 @@ enum pageflags { /* Filesystems */ PG_checked = PG_owner_priv_1, + /* Two page bits are conscripted by FS-Cache to maintain local caching + * state. These bits are set on pages belonging to the netfs's inodes + * when those inodes are being locally cached. + */ + PG_fscache = PG_private_2, /* page backed by cache */ + /* XEN */ PG_pinned = PG_owner_priv_1, PG_savepinned = PG_dirty, @@ -194,8 +201,6 @@ PAGEFLAG(Checked, checked) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ PAGEFLAG(SavePinned, savepinned); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) -PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private) - __SETPAGEFLAG(Private, private) PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobPage, slob_page) @@ -204,6 +209,16 @@ __PAGEFLAG(SlobFree, slob_free) __PAGEFLAG(SlubFrozen, slub_frozen) __PAGEFLAG(SlubDebug, slub_debug) +/* + * Private page markings that may be used by the filesystem that owns the page + * for its own purposes. + * - PG_private and PG_private_2 cause releasepage() and co to be invoked + */ +PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) + __CLEARPAGEFLAG(Private, private) +PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2) +PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) + /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. @@ -384,9 +399,10 @@ static inline void __ClearPageTail(struct page *page) * these flags set. It they are, there is a problem. 
*/ #define PAGE_FLAGS_CHECK_AT_FREE \ - (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ - 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ - 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ + (1 << PG_lru | 1 << PG_locked | \ + 1 << PG_private | 1 << PG_private_2 | \ + 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ __PG_UNEVICTABLE | __PG_MLOCKED) /* @@ -397,4 +413,16 @@ static inline void __ClearPageTail(struct page *page) #define PAGE_FLAGS_CHECK_AT_PREP ((1 << NR_PAGEFLAGS) - 1) #endif /* !__GENERATING_BOUNDS_H */ + +/** + * page_has_private - Determine if page has private stuff + * @page: The page to be checked + * + * Determine if a page has private stuff, indicating that release routines + * should be invoked upon it. + */ +#define page_has_private(page) \ + ((page)->flags & ((1 << PG_private) | \ + (1 << PG_private_2))) + #endif /* PAGE_FLAGS_H */ diff --git a/mm/filemap.c b/mm/filemap.c index 126d3973b3d1..cbc5772e7171 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2463,6 +2463,9 @@ EXPORT_SYMBOL(generic_file_aio_write); * (presumably at page->private). If the release was successful, return `1'. * Otherwise return zero. * + * This may also be called if PG_fscache is set on a page, indicating that the + * page is known to the local caching routines. + * * The @gfp_mask argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). * diff --git a/mm/migrate.c b/mm/migrate.c index a9eff3f092f6..068655d8f883 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -250,7 +250,7 @@ out: * The number of remaining references must be: * 1 for anonymous pages without a mapping * 2 for pages with a mapping - * 3 for pages with a mapping and PagePrivate set. + * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) @@ -270,7 +270,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + !!PagePrivate(page); + expected_count = 2 + !!page_has_private(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -386,7 +386,7 @@ EXPORT_SYMBOL(fail_migrate_page); /* * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. + * pages that do not use PagePrivate/PagePrivate2. * * Pages are locked upon entry and exit. */ @@ -522,7 +522,7 @@ static int fallback_migrate_page(struct address_space *mapping, * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (PagePrivate(page) && + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; @@ -655,7 +655,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. 
*/ if (!page->mapping) { - if (!PageAnon(page) && PagePrivate(page)) { + if (!PageAnon(page) && page_has_private(page)) { /* * Go direct to try_to_free_buffers() here because * a) that's what try_to_release_page() would do anyway diff --git a/mm/readahead.c b/mm/readahead.c index 6be927569cf6..133b6d525513 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -33,14 +33,15 @@ EXPORT_SYMBOL_GPL(file_ra_state_init); /* * see if a page needs releasing upon read_cache_pages() failure - * - the caller of read_cache_pages() may have set PG_private before calling, - * such as the NFS fs marking pages that are cached locally on disk, thus we - * need to give the fs a chance to clean up in the event of an error + * - the caller of read_cache_pages() may have set PG_private or PG_fscache + * before calling, such as the NFS fs marking pages that are cached locally + * on disk, thus we need to give the fs a chance to clean up in the event of + * an error */ static void read_cache_pages_invalidate_page(struct address_space *mapping, struct page *page) { - if (PagePrivate(page)) { + if (page_has_private(page)) { if (!trylock_page(page)) BUG(); page->mapping = mapping; diff --git a/mm/swap.c b/mm/swap.c index 6e83084c1f6c..bede23ce64ea 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec) for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && trylock_page(page)) { - if (PagePrivate(page)) + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) try_to_release_page(page, 0); unlock_page(page); } diff --git a/mm/truncate.c b/mm/truncate.c index 1229211104f8..55206fab7b99 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); - if (PagePrivate(page)) + if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -99,7 +99,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return; - if (PagePrivate(page)) + if (page_has_private(page)) do_invalidatepage(page, 0); cancel_dirty_page(page, PAGE_CACHE_SIZE); @@ -126,7 +126,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return 0; - if (PagePrivate(page) && !try_to_release_page(page, 0)) + if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; clear_page_mlock(page); @@ -348,7 +348,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return 0; - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; spin_lock_irq(&mapping->tree_lock); @@ -356,7 +356,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) goto failed; clear_page_mlock(page); - BUG_ON(PagePrivate(page)); + BUG_ON(page_has_private(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); page_cache_release(page); /* pagecache ref */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 06e72693b458..425244988bb2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -283,7 +283,7 @@ static inline int page_mapping_inuse(struct page *page) static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!PagePrivate(page) == 2; + return 
page_count(page) - !!page_has_private(page) == 2; } static int may_write_to_queue(struct backing_dev_info *bdi) @@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Some data journaling orphaned pages can have * page->mapping == NULL while being dirty with clean buffers. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); printk("%s: orphaned page\n", __func__); @@ -727,7 +727,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; if (!mapping && page_count(page) == 1) { -- cgit v1.2.3-71-gd317 From 2d6fff637037395cc946ef910a880b5fa67b5370 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:36 +0100 Subject: FS-Cache: Add the FS-Cache netfs API and documentation Add the API for a generic facility (FS-Cache) by which filesystems (such as AFS or NFS) may call on local caching capabilities without having to know anything about how the cache works, or even if there is a cache: +---------+ | | +--------------+ | NFS |--+ | | | | | +-->| CacheFS | +---------+ | +----------+ | | /dev/hda5 | | | | | +--------------+ +---------+ +-->| | | | | | |--+ | AFS |----->| FS-Cache | | | | |--+ +---------+ +-->| | | | | | | +--------------+ +---------+ | +----------+ | | | | | | +-->| CacheFiles | | ISOFS |--+ | /var/cache | | | +--------------+ +---------+ General documentation and documentation of the netfs specific API are provided in addition to the header files. As this patch stands, it is possible to build a filesystem against the facility and attempt to use it. All that will happen is that all requests will be immediately denied as if no cache is present. Further patches will implement the core of the facility. The facility will transfer requests from networking filesystems to appropriate caches if possible, or else gracefully deny them. If this facility is disabled in the kernel configuration, then all its operations will trivially reduce to nothing during compilation. WHY NOT I_MAPPING? ================== I have added my own API to implement caching rather than using i_mapping to do this for a number of reasons. These have been discussed a lot on the LKML and CacheFS mailing lists, but to summarise the basics: (1) Most filesystems don't do hole reportage. Holes in files are treated as blocks of zeros and can't be distinguished otherwise, making it difficult to distinguish blocks that have been read from the network and cached from those that haven't. (2) The backing inode must be fully populated before being exposed to userspace through the main inode because the VM/VFS goes directly to the backing inode and does not interrogate the front inode's VM ops. Therefore: (a) The backing inode must fit entirely within the cache. (b) All backed files currently open must fit entirely within the cache at the same time. (c) A working set of files in total larger than the cache may not be cached. (d) A file may not grow larger than the available space in the cache. (e) A file that's open and cached, and remotely grows larger than the cache is potentially stuffed. (3) Writes go to the backing filesystem, and can only be transferred to the network when the file is closed. 
(4) There's no record of what changes have been made, so the whole file must be written back. (5) The pages belong to the backing filesystem, and all metadata associated with that page are relevant only to the backing filesystem, and not anything stacked atop it. OVERVIEW ======== FS-Cache provides (or will provide) the following facilities: (1) Caches can be added / removed at any time, even whilst in use. (2) Adds a facility by which tags can be used to refer to caches, even if they're not available yet. (3) More than one cache can be used at once. Caches can be selected explicitly by use of tags. (4) The netfs is provided with an interface that allows either party to withdraw caching facilities from a file (required for (1)). (5) A netfs may annotate cache objects that belongs to it. This permits the storage of coherency maintenance data. (6) Cache objects will be pinnable and space reservations will be possible. (7) The interface to the netfs returns as few errors as possible, preferring rather to let the netfs remain oblivious. (8) Cookies are used to represent indices, files and other objects to the netfs. The simplest cookie is just a NULL pointer - indicating nothing cached there. (9) The netfs is allowed to propose - dynamically - any index hierarchy it desires, though it must be aware that the index search function is recursive, stack space is limited, and indices can only be children of indices. (10) Indices can be used to group files together to reduce key size and to make group invalidation easier. The use of indices may make lookup quicker, but that's cache dependent. (11) Data I/O is effectively done directly to and from the netfs's pages. The netfs indicates that page A is at index B of the data-file represented by cookie C, and that it should be read or written. The cache backend may or may not start I/O on that page, but if it does, a netfs callback will be invoked to indicate completion. The I/O may be either synchronous or asynchronous. (12) Cookies can be "retired" upon release. At this point FS-Cache will mark them as obsolete and the index hierarchy rooted at that point will get recycled. (13) The netfs provides a "match" function for index searches. In addition to saying whether a match was made or not, this can also specify that an entry should be updated or deleted. FS-Cache maintains a virtual index tree in which all indices, files, objects and pages are kept. Bits of this tree may actually reside in one or more caches. FSDEF | +------------------------------------+ | | NFS AFS | | +--------------------------+ +-----------+ | | | | homedir mirror afs.org redhat.com | | | +------------+ +---------------+ +----------+ | | | | | | 00001 00002 00007 00125 vol00001 vol00002 | | | | | +---+---+ +-----+ +---+ +------+------+ +-----+----+ | | | | | | | | | | | | | PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak | | PG0 +-------+ | | 00001 00003 | +---+---+ | | | PG0 PG1 PG2 In the example above, two netfs's can be seen to be backed: NFS and AFS. These have different index hierarchies: (*) The NFS primary index will probably contain per-server indices. Each server index is indexed by NFS file handles to get data file objects. Each data file objects can have an array of pages, but may also have further child objects, such as extended attributes and directory entries. Extended attribute objects themselves have page-array contents. (*) The AFS primary index contains per-cell indices. Each cell index contains per-logical-volume indices. 
Each of volume index contains up to three indices for the read-write, read-only and backup mirrors of those volumes. Each of these contains vnode data file objects, each of which contains an array of pages. The very top index is the FS-Cache master index in which individual netfs's have entries. Any index object may reside in more than one cache, provided it only has index children. Any index with non-index object children will be assumed to only reside in one cache. The FS-Cache overview can be found in: Documentation/filesystems/caching/fscache.txt The netfs API to FS-Cache can be found in: Documentation/filesystems/caching/netfs-api.txt Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- Documentation/filesystems/caching/fscache.txt | 330 ++++++++++ Documentation/filesystems/caching/netfs-api.txt | 778 ++++++++++++++++++++++++ include/linux/fscache.h | 548 +++++++++++++++++ 3 files changed, 1656 insertions(+) create mode 100644 Documentation/filesystems/caching/fscache.txt create mode 100644 Documentation/filesystems/caching/netfs-api.txt create mode 100644 include/linux/fscache.h (limited to 'include/linux') diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt new file mode 100644 index 000000000000..a759d916273e --- /dev/null +++ b/Documentation/filesystems/caching/fscache.txt @@ -0,0 +1,330 @@ + ========================== + General Filesystem Caching + ========================== + +======== +OVERVIEW +======== + +This facility is a general purpose cache for network filesystems, though it +could be used for caching other things such as ISO9660 filesystems too. + +FS-Cache mediates between cache backends (such as CacheFS) and network +filesystems: + + +---------+ + | | +--------------+ + | NFS |--+ | | + | | | +-->| CacheFS | + +---------+ | +----------+ | | /dev/hda5 | + | | | | +--------------+ + +---------+ +-->| | | + | | | |--+ + | AFS |----->| FS-Cache | + | | | |--+ + +---------+ +-->| | | + | | | | +--------------+ + +---------+ | +----------+ | | | + | | | +-->| CacheFiles | + | ISOFS |--+ | /var/cache | + | | +--------------+ + +---------+ + +Or to look at it another way, FS-Cache is a module that provides a caching +facility to a network filesystem such that the cache is transparent to the +user: + + +---------+ + | | + | Server | + | | + +---------+ + | NETWORK + ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + | + | +----------+ + V | | + +---------+ | | + | | | | + | NFS |----->| FS-Cache | + | | | |--+ + +---------+ | | | +--------------+ +--------------+ + | | | | | | | | + V +----------+ +-->| CacheFiles |-->| Ext3 | + +---------+ | /var/cache | | /dev/sda6 | + | | +--------------+ +--------------+ + | VFS | ^ ^ + | | | | + +---------+ +--------------+ | + | KERNEL SPACE | | + ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~ + | USER SPACE | | + V | | + +---------+ +--------------+ + | | | | + | Process | | cachefilesd | + | | | | + +---------+ +--------------+ + + +FS-Cache does not follow the idea of completely loading every netfs file +opened in its entirety into a cache before permitting it to be accessed and +then serving the pages out of that cache rather than the netfs inode because: + + (1) It must be practical to operate without a cache. + + (2) The size of any accessible file must not be limited to the size of the + cache. 
+ + (3) The combined size of all opened files (this includes mapped libraries) + must not be limited to the size of the cache. + + (4) The user should not be forced to download an entire file just to do a + one-off access of a small portion of it (such as might be done with the + "file" program). + +It instead serves the cache out in PAGE_SIZE chunks as and when requested by +the netfs('s) using it. + + +FS-Cache provides the following facilities: + + (1) More than one cache can be used at once. Caches can be selected + explicitly by use of tags. + + (2) Caches can be added / removed at any time. + + (3) The netfs is provided with an interface that allows either party to + withdraw caching facilities from a file (required for (2)). + + (4) The interface to the netfs returns as few errors as possible, preferring + rather to let the netfs remain oblivious. + + (5) Cookies are used to represent indices, files and other objects to the + netfs. The simplest cookie is just a NULL pointer - indicating nothing + cached there. + + (6) The netfs is allowed to propose - dynamically - any index hierarchy it + desires, though it must be aware that the index search function is + recursive, stack space is limited, and indices can only be children of + indices. + + (7) Data I/O is done direct to and from the netfs's pages. The netfs + indicates that page A is at index B of the data-file represented by cookie + C, and that it should be read or written. The cache backend may or may + not start I/O on that page, but if it does, a netfs callback will be + invoked to indicate completion. The I/O may be either synchronous or + asynchronous. + + (8) Cookies can be "retired" upon release. At this point FS-Cache will mark + them as obsolete and the index hierarchy rooted at that point will get + recycled. + + (9) The netfs provides a "match" function for index searches. In addition to + saying whether a match was made or not, this can also specify that an + entry should be updated or deleted. + +(10) As much as possible is done asynchronously. + + +FS-Cache maintains a virtual indexing tree in which all indices, files, objects +and pages are kept. Bits of this tree may actually reside in one or more +caches. + + FSDEF + | + +------------------------------------+ + | | + NFS AFS + | | + +--------------------------+ +-----------+ + | | | | + homedir mirror afs.org redhat.com + | | | + +------------+ +---------------+ +----------+ + | | | | | | + 00001 00002 00007 00125 vol00001 vol00002 + | | | | | + +---+---+ +-----+ +---+ +------+------+ +-----+----+ + | | | | | | | | | | | | | +PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak + | | + PG0 +-------+ + | | + 00001 00003 + | + +---+---+ + | | | + PG0 PG1 PG2 + +In the example above, you can see two netfs's being backed: NFS and AFS. These +have different index hierarchies: + + (*) The NFS primary index contains per-server indices. Each server index is + indexed by NFS file handles to get data file objects. Each data file + objects can have an array of pages, but may also have further child + objects, such as extended attributes and directory entries. Extended + attribute objects themselves have page-array contents. + + (*) The AFS primary index contains per-cell indices. Each cell index contains + per-logical-volume indices. Each of volume index contains up to three + indices for the read-write, read-only and backup mirrors of those volumes. + Each of these contains vnode data file objects, each of which contains an + array of pages. 
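A hierarchy such as the NFS one above is proposed simply by chaining cookie
acquisitions, one per level, using the fscache_acquire_cookie() call described
in the netfs API document.  The nfs_* names below are purely hypothetical and
only sketch the shape of the calls:

	/*
	 * Illustrative only: each level of the index tree is obtained from its
	 * parent's cookie.  The *_def structures and the objects they hang off
	 * are hypothetical.
	 */
	server->fscache = fscache_acquire_cookie(nfs_cache_netfs.primary_index,
						 &nfs_server_index_def,
						 server);	/* per-server index */

	inode->fscache = fscache_acquire_cookie(server->fscache,
						&nfs_inode_object_def,
						inode);		/* data file, keyed by file handle */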
+ +The very top index is the FS-Cache master index in which individual netfs's +have entries. + +Any index object may reside in more than one cache, provided it only has index +children. Any index with non-index object children will be assumed to only +reside in one cache. + + +The netfs API to FS-Cache can be found in: + + Documentation/filesystems/caching/netfs-api.txt + +The cache backend API to FS-Cache can be found in: + + Documentation/filesystems/caching/backend-api.txt + + +======================= +STATISTICAL INFORMATION +======================= + +If FS-Cache is compiled with the following options enabled: + + CONFIG_FSCACHE_PROC=y (implied by the following two) + CONFIG_FSCACHE_STATS=y + CONFIG_FSCACHE_HISTOGRAM=y + +then it will gather certain statistics and display them through a number of +proc files. + + (*) /proc/fs/fscache/stats + + This shows counts of a number of events that can happen in FS-Cache: + + CLASS EVENT MEANING + ======= ======= ======================================================= + Cookies idx=N Number of index cookies allocated + dat=N Number of data storage cookies allocated + spc=N Number of special cookies allocated + Objects alc=N Number of objects allocated + nal=N Number of object allocation failures + avl=N Number of objects that reached the available state + ded=N Number of objects that reached the dead state + ChkAux non=N Number of objects that didn't have a coherency check + ok=N Number of objects that passed a coherency check + upd=N Number of objects that needed a coherency data update + obs=N Number of objects that were declared obsolete + Pages mrk=N Number of pages marked as being cached + unc=N Number of uncache page requests seen + Acquire n=N Number of acquire cookie requests seen + nul=N Number of acq reqs given a NULL parent + noc=N Number of acq reqs rejected due to no cache available + ok=N Number of acq reqs succeeded + nbf=N Number of acq reqs rejected due to error + oom=N Number of acq reqs failed on ENOMEM + Lookups n=N Number of lookup calls made on cache backends + neg=N Number of negative lookups made + pos=N Number of positive lookups made + crt=N Number of objects created by lookup + Updates n=N Number of update cookie requests seen + nul=N Number of upd reqs given a NULL parent + run=N Number of upd reqs granted CPU time + Relinqs n=N Number of relinquish cookie requests seen + nul=N Number of rlq reqs given a NULL parent + wcr=N Number of rlq reqs waited on completion of creation + AttrChg n=N Number of attribute changed requests seen + ok=N Number of attr changed requests queued + nbf=N Number of attr changed rejected -ENOBUFS + oom=N Number of attr changed failed -ENOMEM + run=N Number of attr changed ops given CPU time + Allocs n=N Number of allocation requests seen + ok=N Number of successful alloc reqs + wt=N Number of alloc reqs that waited on lookup completion + nbf=N Number of alloc reqs rejected -ENOBUFS + ops=N Number of alloc reqs submitted + owt=N Number of alloc reqs waited for CPU time + Retrvls n=N Number of retrieval (read) requests seen + ok=N Number of successful retr reqs + wt=N Number of retr reqs that waited on lookup completion + nod=N Number of retr reqs returned -ENODATA + nbf=N Number of retr reqs rejected -ENOBUFS + int=N Number of retr reqs aborted -ERESTARTSYS + oom=N Number of retr reqs failed -ENOMEM + ops=N Number of retr reqs submitted + owt=N Number of retr reqs waited for CPU time + Stores n=N Number of storage (write) requests seen + ok=N Number of successful store reqs + agn=N Number of 
store reqs on a page already pending storage + nbf=N Number of store reqs rejected -ENOBUFS + oom=N Number of store reqs failed -ENOMEM + ops=N Number of store reqs submitted + run=N Number of store reqs granted CPU time + Ops pend=N Number of times async ops added to pending queues + run=N Number of times async ops given CPU time + enq=N Number of times async ops queued for processing + dfr=N Number of async ops queued for deferred release + rel=N Number of async ops released + gc=N Number of deferred-release async ops garbage collected + + + (*) /proc/fs/fscache/histogram + + cat /proc/fs/fscache/histogram + +HZ +TIME OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS + ===== ===== ========= ========= ========= ========= ========= + + This shows the breakdown of the number of times each amount of time + between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The + columns are as follows: + + COLUMN TIME MEASUREMENT + ======= ======================================================= + OBJ INST Length of time to instantiate an object + OP RUNS Length of time a call to process an operation took + OBJ RUNS Length of time a call to process an object event took + RETRV DLY Time between an requesting a read and lookup completing + RETRIEVLS Time between beginning and end of a retrieval + + Each row shows the number of events that took a particular range of times. + Each step is 1 jiffy in size. The +HZ column indicates the particular + jiffy range covered, and the +TIME field the equivalent number of seconds. + + +========= +DEBUGGING +========= + +The FS-Cache facility can have runtime debugging enabled by adjusting the value +in: + + /sys/module/fscache/parameters/debug + +This is a bitmask of debugging streams to enable: + + BIT VALUE STREAM POINT + ======= ======= =============================== ======================= + 0 1 Cache management Function entry trace + 1 2 Function exit trace + 2 4 General + 3 8 Cookie management Function entry trace + 4 16 Function exit trace + 5 32 General + 6 64 Page handling Function entry trace + 7 128 Function exit trace + 8 256 General + 9 512 Operation management Function entry trace + 10 1024 Function exit trace + 11 2048 General + +The appropriate set of values should be OR'd together and the result written to +the control file. For example: + + echo $((1|8|64)) >/sys/module/fscache/parameters/debug + +will turn on all function entry debugging. + diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt new file mode 100644 index 000000000000..4db125b3a5c6 --- /dev/null +++ b/Documentation/filesystems/caching/netfs-api.txt @@ -0,0 +1,778 @@ + =============================== + FS-CACHE NETWORK FILESYSTEM API + =============================== + +There's an API by which a network filesystem can make use of the FS-Cache +facilities. This is based around a number of principles: + + (1) Caches can store a number of different object types. There are two main + object types: indices and files. The first is a special type used by + FS-Cache to make finding objects faster and to make retiring of groups of + objects easier. + + (2) Every index, file or other object is represented by a cookie. This cookie + may or may not have anything associated with it, but the netfs doesn't + need to care. + + (3) Barring the top-level index (one entry per cached netfs), the index + hierarchy for each netfs is structured according the whim of the netfs. + +This API is declared in . 
+ +This document contains the following sections: + + (1) Network filesystem definition + (2) Index definition + (3) Object definition + (4) Network filesystem (un)registration + (5) Cache tag lookup + (6) Index registration + (7) Data file registration + (8) Miscellaneous object registration + (9) Setting the data file size + (10) Page alloc/read/write + (11) Page uncaching + (12) Index and data file update + (13) Miscellaneous cookie operations + (14) Cookie unregistration + (15) Index and data file invalidation + (16) FS-Cache specific page flags. + + +============================= +NETWORK FILESYSTEM DEFINITION +============================= + +FS-Cache needs a description of the network filesystem. This is specified +using a record of the following structure: + + struct fscache_netfs { + uint32_t version; + const char *name; + struct fscache_cookie *primary_index; + ... + }; + +This first two fields should be filled in before registration, and the third +will be filled in by the registration function; any other fields should just be +ignored and are for internal use only. + +The fields are: + + (1) The name of the netfs (used as the key in the toplevel index). + + (2) The version of the netfs (if the name matches but the version doesn't, the + entire in-cache hierarchy for this netfs will be scrapped and begun + afresh). + + (3) The cookie representing the primary index will be allocated according to + another parameter passed into the registration function. + +For example, kAFS (linux/fs/afs/) uses the following definitions to describe +itself: + + struct fscache_netfs afs_cache_netfs = { + .version = 0, + .name = "afs", + }; + + +================ +INDEX DEFINITION +================ + +Indices are used for two purposes: + + (1) To aid the finding of a file based on a series of keys (such as AFS's + "cell", "volume ID", "vnode ID"). + + (2) To make it easier to discard a subset of all the files cached based around + a particular key - for instance to mirror the removal of an AFS volume. + +However, since it's unlikely that any two netfs's are going to want to define +their index hierarchies in quite the same way, FS-Cache tries to impose as few +restraints as possible on how an index is structured and where it is placed in +the tree. The netfs can even mix indices and data files at the same level, but +it's not recommended. + +Each index entry consists of a key of indeterminate length plus some auxilliary +data, also of indeterminate length. + +There are some limits on indices: + + (1) Any index containing non-index objects should be restricted to a single + cache. Any such objects created within an index will be created in the + first cache only. The cache in which an index is created can be + controlled by cache tags (see below). + + (2) The entry data must be atomically journallable, so it is limited to about + 400 bytes at present. At least 400 bytes will be available. + + (3) The depth of the index tree should be judged with care as the search + function is recursive. Too many layers will run the kernel out of stack. 
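To make the key idea concrete, here is a hypothetical get_key() implementation
for a per-server index (the operation itself is defined in the next section);
it simply serialises the server's address, staying well within the roughly 400
byte limit noted above.  All mynetfs_* names are invented:

	/* uses struct in_addr from <linux/in.h> and memcpy() from <linux/string.h> */
	struct mynetfs_server {
		struct in_addr	addr;		/* identity used as the index key */
		/* ... */
	};

	static uint16_t mynetfs_server_get_key(const void *cookie_netfs_data,
					       void *buffer, uint16_t bufmax)
	{
		const struct mynetfs_server *server = cookie_netfs_data;
		uint16_t len = sizeof(server->addr);

		if (len > bufmax)
			return 0;		/* key would not fit; write nothing */
		memcpy(buffer, &server->addr, len);
		return len;			/* number of key bytes written */
	}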
+ + +================= +OBJECT DEFINITION +================= + +To define an object, a structure of the following type should be filled out: + + struct fscache_cookie_def + { + uint8_t name[16]; + uint8_t type; + + struct fscache_cache_tag *(*select_cache)( + const void *parent_netfs_data, + const void *cookie_netfs_data); + + uint16_t (*get_key)(const void *cookie_netfs_data, + void *buffer, + uint16_t bufmax); + + void (*get_attr)(const void *cookie_netfs_data, + uint64_t *size); + + uint16_t (*get_aux)(const void *cookie_netfs_data, + void *buffer, + uint16_t bufmax); + + enum fscache_checkaux (*check_aux)(void *cookie_netfs_data, + const void *data, + uint16_t datalen); + + void (*get_context)(void *cookie_netfs_data, void *context); + + void (*put_context)(void *cookie_netfs_data, void *context); + + void (*mark_pages_cached)(void *cookie_netfs_data, + struct address_space *mapping, + struct pagevec *cached_pvec); + + void (*now_uncached)(void *cookie_netfs_data); + }; + +This has the following fields: + + (1) The type of the object [mandatory]. + + This is one of the following values: + + (*) FSCACHE_COOKIE_TYPE_INDEX + + This defines an index, which is a special FS-Cache type. + + (*) FSCACHE_COOKIE_TYPE_DATAFILE + + This defines an ordinary data file. + + (*) Any other value between 2 and 255 + + This defines an extraordinary object such as an XATTR. + + (2) The name of the object type (NUL terminated unless all 16 chars are used) + [optional]. + + (3) A function to select the cache in which to store an index [optional]. + + This function is invoked when an index needs to be instantiated in a cache + during the instantiation of a non-index object. Only the immediate index + parent for the non-index object will be queried. Any indices above that + in the hierarchy may be stored in multiple caches. This function does not + need to be supplied for any non-index object or any index that will only + have index children. + + If this function is not supplied or if it returns NULL then the first + cache in the parent's list will be chosed, or failing that, the first + cache in the master list. + + (4) A function to retrieve an object's key from the netfs [mandatory]. + + This function will be called with the netfs data that was passed to the + cookie acquisition function and the maximum length of key data that it may + provide. It should write the required key data into the given buffer and + return the quantity it wrote. + + (5) A function to retrieve attribute data from the netfs [optional]. + + This function will be called with the netfs data that was passed to the + cookie acquisition function. It should return the size of the file if + this is a data file. The size may be used to govern how much cache must + be reserved for this file in the cache. + + If the function is absent, a file size of 0 is assumed. + + (6) A function to retrieve auxilliary data from the netfs [optional]. + + This function will be called with the netfs data that was passed to the + cookie acquisition function and the maximum length of auxilliary data that + it may provide. It should write the auxilliary data into the given buffer + and return the quantity it wrote. + + If this function is absent, the auxilliary data length will be set to 0. + + The length of the auxilliary data buffer may be dependent on the key + length. A netfs mustn't rely on being able to provide more than 400 bytes + for both. + + (7) A function to check the auxilliary data [optional]. 
+ + This function will be called to check that a match found in the cache for + this object is valid. For instance with AFS it could check the auxilliary + data against the data version number returned by the server to determine + whether the index entry in a cache is still valid. + + If this function is absent, it will be assumed that matching objects in a + cache are always valid. + + If present, the function should return one of the following values: + + (*) FSCACHE_CHECKAUX_OKAY - the entry is okay as is + (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update + (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted + + This function can also be used to extract data from the auxilliary data in + the cache and copy it into the netfs's structures. + + (8) A pair of functions to manage contexts for the completion callback + [optional]. + + The cache read/write functions are passed a context which is then passed + to the I/O completion callback function. To ensure this context remains + valid until after the I/O completion is called, two functions may be + provided: one to get an extra reference on the context, and one to drop a + reference to it. + + If the context is not used or is a type of object that won't go out of + scope, then these functions are not required. These functions are not + required for indices as indices may not contain data. These functions may + be called in interrupt context and so may not sleep. + + (9) A function to mark a page as retaining cache metadata [optional]. + + This is called by the cache to indicate that it is retaining in-memory + information for this page and that the netfs should uncache the page when + it has finished. This does not indicate whether there's data on the disk + or not. Note that several pages at once may be presented for marking. + + The PG_fscache bit is set on the pages before this function would be + called, so the function need not be provided if this is sufficient. + + This function is not required for indices as they're not permitted data. + +(10) A function to unmark all the pages retaining cache metadata [mandatory]. + + This is called by FS-Cache to indicate that a backing store is being + unbound from a cookie and that all the marks on the pages should be + cleared to prevent confusion. Note that the cache will have torn down all + its tracking information so that the pages don't need to be explicitly + uncached. + + This function is not required for indices as they're not permitted data. + + +=================================== +NETWORK FILESYSTEM (UN)REGISTRATION +=================================== + +The first step is to declare the network filesystem to the cache. This also +involves specifying the layout of the primary index (for AFS, this would be the +"cell" level). + +The registration function is: + + int fscache_register_netfs(struct fscache_netfs *netfs); + +It just takes a pointer to the netfs definition. It returns 0 or an error as +appropriate. + +For kAFS, registration is done as follows: + + ret = fscache_register_netfs(&afs_cache_netfs); + +The last step is, of course, unregistration: + + void fscache_unregister_netfs(struct fscache_netfs *netfs); + + +================ +CACHE TAG LOOKUP +================ + +FS-Cache permits the use of more than one cache. To permit particular index +subtrees to be bound to particular caches, the second step is to look up cache +representation tags. This step is optional; it can be left entirely up to +FS-Cache as to which cache should be used. 
The problem with doing that is that +FS-Cache will always pick the first cache that was registered. + +To get the representation for a named tag: + + struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name); + +This takes a text string as the name and returns a representation of a tag. It +will never return an error. It may return a dummy tag, however, if it runs out +of memory; this will inhibit caching with this tag. + +Any representation so obtained must be released by passing it to this function: + + void fscache_release_cache_tag(struct fscache_cache_tag *tag); + +The tag will be retrieved by FS-Cache when it calls the object definition +operation select_cache(). + + +================== +INDEX REGISTRATION +================== + +The third step is to inform FS-Cache about part of an index hierarchy that can +be used to locate files. This is done by requesting a cookie for each index in +the path to the file: + + struct fscache_cookie * + fscache_acquire_cookie(struct fscache_cookie *parent, + const struct fscache_object_def *def, + void *netfs_data); + +This function creates an index entry in the index represented by parent, +filling in the index entry by calling the operations pointed to by def. + +Note that this function never returns an error - all errors are handled +internally. It may, however, return NULL to indicate no cookie. It is quite +acceptable to pass this token back to this function as the parent to another +acquisition (or even to the relinquish cookie, read page and write page +functions - see below). + +Note also that no indices are actually created in a cache until a non-index +object needs to be created somewhere down the hierarchy. Furthermore, an index +may be created in several different caches independently at different times. +This is all handled transparently, and the netfs doesn't see any of it. + +For example, with AFS, a cell would be added to the primary index. This index +entry would have a dependent inode containing a volume location index for the +volume mappings within this cell: + + cell->cache = + fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell); + +Then when a volume location was accessed, it would be entered into the cell's +index and an inode would be allocated that acts as a volume type and hash chain +combination: + + vlocation->cache = + fscache_acquire_cookie(cell->cache, + &afs_vlocation_cache_index_def, + vlocation); + +And then a particular flavour of volume (R/O for example) could be added to +that index, creating another index for vnodes (AFS inode equivalents): + + volume->cache = + fscache_acquire_cookie(vlocation->cache, + &afs_volume_cache_index_def, + volume); + + +====================== +DATA FILE REGISTRATION +====================== + +The fourth step is to request a data file be created in the cache. This is +identical to index cookie acquisition. The only difference is that the type in +the object definition should be something other than index type. + + vnode->cache = + fscache_acquire_cookie(volume->cache, + &afs_vnode_cache_object_def, + vnode); + + +================================= +MISCELLANEOUS OBJECT REGISTRATION +================================= + +An optional step is to request an object of miscellaneous type be created in +the cache. This is almost identical to index cookie acquisition. The only +difference is that the type in the object definition should be something other +than index type. 
Whilst the parent object could be an index, it's more likely +it would be some other type of object such as a data file. + + xattr->cache = + fscache_acquire_cookie(vnode->cache, + &afs_xattr_cache_object_def, + xattr); + +Miscellaneous objects might be used to store extended attributes or directory +entries for example. + + +========================== +SETTING THE DATA FILE SIZE +========================== + +The fifth step is to set the physical attributes of the file, such as its size. +This doesn't automatically reserve any space in the cache, but permits the +cache to adjust its metadata for data tracking appropriately: + + int fscache_attr_changed(struct fscache_cookie *cookie); + +The cache will return -ENOBUFS if there is no backing cache or if there is no +space to allocate any extra metadata required in the cache. The attributes +will be accessed with the get_attr() cookie definition operation. + +Note that attempts to read or write data pages in the cache over this size may +be rebuffed with -ENOBUFS. + +This operation schedules an attribute adjustment to happen asynchronously at +some point in the future, and as such, it may happen after the function returns +to the caller. The attribute adjustment excludes read and write operations. + + +===================== +PAGE READ/ALLOC/WRITE +===================== + +And the sixth step is to store and retrieve pages in the cache. There are +three functions that are used to do this. + +Note: + + (1) A page should not be re-read or re-allocated without uncaching it first. + + (2) A read or allocated page must be uncached when the netfs page is released + from the pagecache. + + (3) A page should only be written to the cache if previous read or allocated. + +This permits the cache to maintain its page tracking in proper order. + + +PAGE READ +--------- + +Firstly, the netfs should ask FS-Cache to examine the caches and read the +contents cached for a particular page of a particular file if present, or else +allocate space to store the contents if not: + + typedef + void (*fscache_rw_complete_t)(struct page *page, + void *context, + int error); + + int fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp); + +The cookie argument must specify a cookie for an object that isn't an index, +the page specified will have the data loaded into it (and is also used to +specify the page number), and the gfp argument is used to control how any +memory allocations made are satisfied. + +If the cookie indicates the inode is not cached: + + (1) The function will return -ENOBUFS. + +Else if there's a copy of the page resident in the cache: + + (1) The mark_pages_cached() cookie operation will be called on that page. + + (2) The function will submit a request to read the data from the cache's + backing device directly into the page specified. + + (3) The function will return 0. + + (4) When the read is complete, end_io_func() will be invoked with: + + (*) The netfs data supplied when the cookie was created. + + (*) The page descriptor. + + (*) The context argument passed to the above function. This will be + maintained with the get_context/put_context functions mentioned above. + + (*) An argument that's 0 on success or negative for an error code. + + If an error occurs, it should be assumed that the page contains no usable + data. 
+ + end_io_func() will be called in process context if the read is results in + an error, but it might be called in interrupt context if the read is + successful. + +Otherwise, if there's not a copy available in cache, but the cache may be able +to store the page: + + (1) The mark_pages_cached() cookie operation will be called on that page. + + (2) A block may be reserved in the cache and attached to the object at the + appropriate place. + + (3) The function will return -ENODATA. + +This function may also return -ENOMEM or -EINTR, in which case it won't have +read any data from the cache. + + +PAGE ALLOCATE +------------- + +Alternatively, if there's not expected to be any data in the cache for a page +because the file has been extended, a block can simply be allocated instead: + + int fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp); + +This is similar to the fscache_read_or_alloc_page() function, except that it +never reads from the cache. It will return 0 if a block has been allocated, +rather than -ENODATA as the other would. One or the other must be performed +before writing to the cache. + +The mark_pages_cached() cookie operation will be called on the page if +successful. + + +PAGE WRITE +---------- + +Secondly, if the netfs changes the contents of the page (either due to an +initial download or if a user performs a write), then the page should be +written back to the cache: + + int fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp); + +The cookie argument must specify a data file cookie, the page specified should +contain the data to be written (and is also used to specify the page number), +and the gfp argument is used to control how any memory allocations made are +satisfied. + +The page must have first been read or allocated successfully and must not have +been uncached before writing is performed. + +If the cookie indicates the inode is not cached then: + + (1) The function will return -ENOBUFS. + +Else if space can be allocated in the cache to hold this page: + + (1) PG_fscache_write will be set on the page. + + (2) The function will submit a request to write the data to cache's backing + device directly from the page specified. + + (3) The function will return 0. + + (4) When the write is complete PG_fscache_write is cleared on the page and + anyone waiting for that bit will be woken up. + +Else if there's no space available in the cache, -ENOBUFS will be returned. It +is also possible for the PG_fscache_write bit to be cleared when no write took +place if unforeseen circumstances arose (such as a disk error). + +Writing takes place asynchronously. + + +MULTIPLE PAGE READ +------------------ + +A facility is provided to read several pages at once, as requested by the +readpages() address space operation: + + int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + int *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp); + +This works in a similar way to fscache_read_or_alloc_page(), except: + + (1) Any page it can retrieve data for is removed from pages and nr_pages and + dispatched for reading to the disk. Reads of adjacent pages on disk may + be merged for greater efficiency. + + (2) The mark_pages_cached() cookie operation will be called on several pages + at once if they're being read or allocated. + + (3) If there was an general error, then that error will be returned. 
+ + Else if some pages couldn't be allocated or read, then -ENOBUFS will be + returned. + + Else if some pages couldn't be read but were allocated, then -ENODATA will + be returned. + + Otherwise, if all pages had reads dispatched, then 0 will be returned, the + list will be empty and *nr_pages will be 0. + + (4) end_io_func will be called once for each page being read as the reads + complete. It will be called in process context if error != 0, but it may + be called in interrupt context if there is no error. + +Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude +some of the pages being read and some being allocated. Those pages will have +been marked appropriately and will need uncaching. + + +============== +PAGE UNCACHING +============== + +To uncache a page, this function should be called: + + void fscache_uncache_page(struct fscache_cookie *cookie, + struct page *page); + +This function permits the cache to release any in-memory representation it +might be holding for this netfs page. This function must be called once for +each page on which the read or write page functions above have been called to +make sure the cache's in-memory tracking information gets torn down. + +Note that pages can't be explicitly deleted from the a data file. The whole +data file must be retired (see the relinquish cookie function below). + +Furthermore, note that this does not cancel the asynchronous read or write +operation started by the read/alloc and write functions, so the page +invalidation and release functions must use: + + bool fscache_check_page_write(struct fscache_cookie *cookie, + struct page *page); + +to see if a page is being written to the cache, and: + + void fscache_wait_on_page_write(struct fscache_cookie *cookie, + struct page *page); + +to wait for it to finish if it is. + + +========================== +INDEX AND DATA FILE UPDATE +========================== + +To request an update of the index data for an index or other object, the +following function should be called: + + void fscache_update_cookie(struct fscache_cookie *cookie); + +This function will refer back to the netfs_data pointer stored in the cookie by +the acquisition function to obtain the data to write into each revised index +entry. The update method in the parent index definition will be called to +transfer the data. + +Note that partial updates may happen automatically at other times, such as when +data blocks are added to a data file object. + + +=============================== +MISCELLANEOUS COOKIE OPERATIONS +=============================== + +There are a number of operations that can be used to control cookies: + + (*) Cookie pinning: + + int fscache_pin_cookie(struct fscache_cookie *cookie); + void fscache_unpin_cookie(struct fscache_cookie *cookie); + + These operations permit data cookies to be pinned into the cache and to + have the pinning removed. They are not permitted on index cookies. + + The pinning function will return 0 if successful, -ENOBUFS in the cookie + isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning, + -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or + -EIO if there's any other problem. + + (*) Data space reservation: + + int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size); + + This permits a netfs to request cache space be reserved to store up to the + given amount of a file. It is permitted to ask for more than the current + size of the file to allow for future file expansion. 
+ + If size is given as zero then the reservation will be cancelled. + + The function will return 0 if successful, -ENOBUFS in the cookie isn't + backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations, + -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or + -EIO if there's any other problem. + + Note that this doesn't pin an object in a cache; it can still be culled to + make space if it's not in use. + + +===================== +COOKIE UNREGISTRATION +===================== + +To get rid of a cookie, this function should be called. + + void fscache_relinquish_cookie(struct fscache_cookie *cookie, + int retire); + +If retire is non-zero, then the object will be marked for recycling, and all +copies of it will be removed from all active caches in which it is present. +Not only that but all child objects will also be retired. + +If retire is zero, then the object may be available again when next the +acquisition function is called. Retirement here will overrule the pinning on a +cookie. + +One very important note - relinquish must NOT be called for a cookie unless all +the cookies for "child" indices, objects and pages have been relinquished +first. + + +================================ +INDEX AND DATA FILE INVALIDATION +================================ + +There is no direct way to invalidate an index subtree or a data file. To do +this, the caller should relinquish and retire the cookie they have, and then +acquire a new one. + + +=========================== +FS-CACHE SPECIFIC PAGE FLAG +=========================== + +FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is +given the alternative name PG_fscache. + +PG_fscache is used to indicate that the page is known by the cache, and that +the cache must be informed if the page is going to go away. It's an indication +to the netfs that the cache has an interest in this page, where an interest may +be a pointer to it, resources allocated or reserved for it, or I/O in progress +upon it. + +The netfs can use this information in methods such as releasepage() to +determine whether it needs to uncache a page or update it. + +Furthermore, if this bit is set, releasepage() and invalidatepage() operations +will be called on a page to get rid of it, even if PG_private is not set. This +allows caching to attempted on a page before read_cache_pages() to be called +after fscache_read_or_alloc_pages() as the former will try and release pages it +was given under certain circumstances. + +This bit does not overlap with such as PG_private. This means that FS-Cache +can be used with a filesystem that uses the block buffering code. + +There are a number of operations defined on this flag: + + int PageFsCache(struct page *page); + void SetPageFsCache(struct page *page) + void ClearPageFsCache(struct page *page) + int TestSetPageFsCache(struct page *page) + int TestClearPageFsCache(struct page *page) + +These functions are bit test, bit set, bit clear, bit test and set and bit +test and clear operations on PG_fscache. diff --git a/include/linux/fscache.h b/include/linux/fscache.h new file mode 100644 index 000000000000..feb3b0e0af4d --- /dev/null +++ b/include/linux/fscache.h @@ -0,0 +1,548 @@ +/* General filesystem caching interface + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * NOTE!!! See: + * + * Documentation/filesystems/caching/netfs-api.txt + * + * for a description of the network filesystem interface declared here. + */ + +#ifndef _LINUX_FSCACHE_H +#define _LINUX_FSCACHE_H + +#include +#include +#include +#include + +#if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE) +#define fscache_available() (1) +#define fscache_cookie_valid(cookie) (cookie) +#else +#define fscache_available() (0) +#define fscache_cookie_valid(cookie) (0) +#endif + + +/* + * overload PG_private_2 to give us PG_fscache - this is used to indicate that + * a page is currently backed by a local disk cache + */ +#define PageFsCache(page) PagePrivate2((page)) +#define SetPageFsCache(page) SetPagePrivate2((page)) +#define ClearPageFsCache(page) ClearPagePrivate2((page)) +#define TestSetPageFsCache(page) TestSetPagePrivate2((page)) +#define TestClearPageFsCache(page) TestClearPagePrivate2((page)) + +/* pattern used to fill dead space in an index entry */ +#define FSCACHE_INDEX_DEADFILL_PATTERN 0x79 + +struct pagevec; +struct fscache_cache_tag; +struct fscache_cookie; +struct fscache_netfs; + +typedef void (*fscache_rw_complete_t)(struct page *page, + void *context, + int error); + +/* result of index entry consultation */ +enum fscache_checkaux { + FSCACHE_CHECKAUX_OKAY, /* entry okay as is */ + FSCACHE_CHECKAUX_NEEDS_UPDATE, /* entry requires update */ + FSCACHE_CHECKAUX_OBSOLETE, /* entry requires deletion */ +}; + +/* + * fscache cookie definition + */ +struct fscache_cookie_def { + /* name of cookie type */ + char name[16]; + + /* cookie type */ + uint8_t type; +#define FSCACHE_COOKIE_TYPE_INDEX 0 +#define FSCACHE_COOKIE_TYPE_DATAFILE 1 + + /* select the cache into which to insert an entry in this index + * - optional + * - should return a cache identifier or NULL to cause the cache to be + * inherited from the parent if possible or the first cache picked + * for a non-index file if not + */ + struct fscache_cache_tag *(*select_cache)( + const void *parent_netfs_data, + const void *cookie_netfs_data); + + /* get an index key + * - should store the key data in the buffer + * - should return the amount of amount stored + * - not permitted to return an error + * - the netfs data from the cookie being used as the source is + * presented + */ + uint16_t (*get_key)(const void *cookie_netfs_data, + void *buffer, + uint16_t bufmax); + + /* get certain file attributes from the netfs data + * - this function can be absent for an index + * - not permitted to return an error + * - the netfs data from the cookie being used as the source is + * presented + */ + void (*get_attr)(const void *cookie_netfs_data, uint64_t *size); + + /* get the auxilliary data from netfs data + * - this function can be absent if the index carries no state data + * - should store the auxilliary data in the buffer + * - should return the amount of amount stored + * - not permitted to return an error + * - the netfs data from the cookie being used as the source is + * presented + */ + uint16_t (*get_aux)(const void *cookie_netfs_data, + void *buffer, + uint16_t bufmax); + + /* consult the netfs about the state of an object + * - this function can be absent if the index carries no state data + * - the 
netfs data from the cookie being used as the target is + * presented, as is the auxilliary data + */ + enum fscache_checkaux (*check_aux)(void *cookie_netfs_data, + const void *data, + uint16_t datalen); + + /* get an extra reference on a read context + * - this function can be absent if the completion function doesn't + * require a context + */ + void (*get_context)(void *cookie_netfs_data, void *context); + + /* release an extra reference on a read context + * - this function can be absent if the completion function doesn't + * require a context + */ + void (*put_context)(void *cookie_netfs_data, void *context); + + /* indicate pages that now have cache metadata retained + * - this function should mark the specified pages as now being cached + * - the pages will have been marked with PG_fscache before this is + * called, so this is optional + */ + void (*mark_pages_cached)(void *cookie_netfs_data, + struct address_space *mapping, + struct pagevec *cached_pvec); + + /* indicate the cookie is no longer cached + * - this function is called when the backing store currently caching + * a cookie is removed + * - the netfs should use this to clean up any markers indicating + * cached pages + * - this is mandatory for any object that may have data + */ + void (*now_uncached)(void *cookie_netfs_data); +}; + +/* + * fscache cached network filesystem type + * - name, version and ops must be filled in before registration + * - all other fields will be set during registration + */ +struct fscache_netfs { + uint32_t version; /* indexing version */ + const char *name; /* filesystem name */ + struct fscache_cookie *primary_index; + struct list_head link; /* internal link */ +}; + +/* + * slow-path functions for when there is actually caching available, and the + * netfs does actually have a valid token + * - these are not to be called directly + * - these are undefined symbols when FS-Cache is not configured and the + * optimiser takes care of not using them + */ + +/** + * fscache_register_netfs - Register a filesystem as desiring caching services + * @netfs: The description of the filesystem + * + * Register a filesystem as desiring caching services if they're available. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_register_netfs(struct fscache_netfs *netfs) +{ + return 0; +} + +/** + * fscache_unregister_netfs - Indicate that a filesystem no longer desires + * caching services + * @netfs: The description of the filesystem + * + * Indicate that a filesystem no longer desires caching services for the + * moment. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_unregister_netfs(struct fscache_netfs *netfs) +{ +} + +/** + * fscache_lookup_cache_tag - Look up a cache tag + * @name: The name of the tag to search for + * + * Acquire a specific cache referral tag that can be used to select a specific + * cache in which to cache an index. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) +{ + return NULL; +} + +/** + * fscache_release_cache_tag - Release a cache tag + * @tag: The tag to release + * + * Release a reference to a cache referral tag previously looked up. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. 
+ */ +static inline +void fscache_release_cache_tag(struct fscache_cache_tag *tag) +{ +} + +/** + * fscache_acquire_cookie - Acquire a cookie to represent a cache object + * @parent: The cookie that's to be the parent of this one + * @def: A description of the cache object, including callback operations + * @netfs_data: An arbitrary piece of data to be kept in the cookie to + * represent the cache object to the netfs + * + * This function is used to inform FS-Cache about part of an index hierarchy + * that can be used to locate files. This is done by requesting a cookie for + * each index in the path to the file. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +struct fscache_cookie *fscache_acquire_cookie( + struct fscache_cookie *parent, + const struct fscache_cookie_def *def, + void *netfs_data) +{ + return NULL; +} + +/** + * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding + * it + * @cookie: The cookie being returned + * @retire: True if the cache object the cookie represents is to be discarded + * + * This function returns a cookie to the cache, forcibly discarding the + * associated cache object if retire is set to true. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +{ +} + +/** + * fscache_update_cookie - Request that a cache object be updated + * @cookie: The cookie representing the cache object + * + * Request an update of the index data for the cache object associated with the + * cookie. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_update_cookie(struct fscache_cookie *cookie) +{ +} + +/** + * fscache_pin_cookie - Pin a data-storage cache object in its cache + * @cookie: The cookie representing the cache object + * + * Permit data-storage cache objects to be pinned in the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_pin_cookie(struct fscache_cookie *cookie) +{ + return -ENOBUFS; +} + +/** + * fscache_pin_cookie - Unpin a data-storage cache object in its cache + * @cookie: The cookie representing the cache object + * + * Permit data-storage cache objects to be unpinned from the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_unpin_cookie(struct fscache_cookie *cookie) +{ +} + +/** + * fscache_attr_changed - Notify cache that an object's attributes changed + * @cookie: The cookie representing the cache object + * + * Send a notification to the cache indicating that an object's attributes have + * changed. This includes the data size. These attributes will be obtained + * through the get_attr() cookie definition op. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_attr_changed(struct fscache_cookie *cookie) +{ + return -ENOBUFS; +} + +/** + * fscache_reserve_space - Reserve data space for a cached object + * @cookie: The cookie representing the cache object + * @i_size: The amount of space to be reserved + * + * Reserve an amount of space in the cache for the cache object attached to a + * cookie so that a write to that object within the space can always be + * honoured. 
+ * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size) +{ + return -ENOBUFS; +} + +/** + * fscache_read_or_alloc_page - Read a page from the cache or allocate a block + * in which to store it + * @cookie: The cookie representing the cache object + * @page: The netfs page to fill if possible + * @end_io_func: The callback to invoke when and if the page is filled + * @context: An arbitrary piece of data to pass on to end_io_func() + * @gfp: The conditions under which memory allocation should be made + * + * Read a page from the cache, or if that's not possible make a potential + * one-block reservation in the cache into which the page may be stored once + * fetched from the server. + * + * If the page is not backed by the cache object, or if it there's some reason + * it can't be, -ENOBUFS will be returned and nothing more will be done for + * that page. + * + * Else, if that page is backed by the cache, a read will be initiated directly + * to the netfs's page and 0 will be returned by this function. The + * end_io_func() callback will be invoked when the operation terminates on a + * completion or failure. Note that the callback may be invoked before the + * return. + * + * Else, if the page is unbacked, -ENODATA is returned and a block may have + * been allocated in the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + return -ENOBUFS; +} + +/** + * fscache_read_or_alloc_pages - Read pages from the cache and/or allocate + * blocks in which to store them + * @cookie: The cookie representing the cache object + * @mapping: The netfs inode mapping to which the pages will be attached + * @pages: A list of potential netfs pages to be filled + * @end_io_func: The callback to invoke when and if each page is filled + * @context: An arbitrary piece of data to pass on to end_io_func() + * @gfp: The conditions under which memory allocation should be made + * + * Read a set of pages from the cache, or if that's not possible, attempt to + * make a potential one-block reservation for each page in the cache into which + * that page may be stored once fetched from the server. + * + * If some pages are not backed by the cache object, or if it there's some + * reason they can't be, -ENOBUFS will be returned and nothing more will be + * done for that pages. + * + * Else, if some of the pages are backed by the cache, a read will be initiated + * directly to the netfs's page and 0 will be returned by this function. The + * end_io_func() callback will be invoked when the operation terminates on a + * completion or failure. Note that the callback may be invoked before the + * return. + * + * Else, if a page is unbacked, -ENODATA is returned and a block may have + * been allocated in the cache. + * + * Because the function may want to return all of -ENOBUFS, -ENODATA and 0 in + * regard to different pages, the return values are prioritised in that order. + * Any pages submitted for reading are removed from the pages list. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. 
+ */ +static inline +int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + return -ENOBUFS; +} + +/** + * fscache_alloc_page - Allocate a block in which to store a page + * @cookie: The cookie representing the cache object + * @page: The netfs page to allocate a page for + * @gfp: The conditions under which memory allocation should be made + * + * Request Allocation a block in the cache in which to store a netfs page + * without retrieving any contents from the cache. + * + * If the page is not backed by a file then -ENOBUFS will be returned and + * nothing more will be done, and no reservation will be made. + * + * Else, a block will be allocated if one wasn't already, and 0 will be + * returned + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + return -ENOBUFS; +} + +/** + * fscache_write_page - Request storage of a page in the cache + * @cookie: The cookie representing the cache object + * @page: The netfs page to store + * @gfp: The conditions under which memory allocation should be made + * + * Request the contents of the netfs page be written into the cache. This + * request may be ignored if no cache block is currently allocated, in which + * case it will return -ENOBUFS. + * + * If a cache block was already allocated, a write will be initiated and 0 will + * be returned. The PG_fscache_write page bit is set immediately and will then + * be cleared at the completion of the write to indicate the success or failure + * of the operation. Note that the completion may happen before the return. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +int fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + return -ENOBUFS; +} + +/** + * fscache_uncache_page - Indicate that caching is no longer required on a page + * @cookie: The cookie representing the cache object + * @page: The netfs page that was being cached. + * + * Tell the cache that we no longer want a page to be cached and that it should + * remove any knowledge of the netfs page it may have. + * + * Note that this cannot cancel any outstanding I/O operations between this + * page and the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +void fscache_uncache_page(struct fscache_cookie *cookie, + struct page *page) +{ +} + +/** + * fscache_check_page_write - Ask if a page is being writing to the cache + * @cookie: The cookie representing the cache object + * @page: The netfs page that is being cached. + * + * Ask the cache if a page is being written to the cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. + */ +static inline +bool fscache_check_page_write(struct fscache_cookie *cookie, + struct page *page) +{ + return false; +} + +/** + * fscache_wait_on_page_write - Wait for a page to complete writing to the cache + * @cookie: The cookie representing the cache object + * @page: The netfs page that is being cached. + * + * Ask the cache to wake us up when a page is no longer being written to the + * cache. + * + * See Documentation/filesystems/caching/netfs-api.txt for a complete + * description. 
+ */ +static inline +void fscache_wait_on_page_write(struct fscache_cookie *cookie, + struct page *page) +{ +} + +#endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3-71-gd317 From 0dfc41d1efcc4180abfd32f68f0ade540e636ff6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:36 +0100 Subject: FS-Cache: Add the FS-Cache cache backend API and documentation Add the API for a generic facility (FS-Cache) by which caches may declare them selves open for business, and may obtain work to be done from network filesystems. The header file is included by: #include Documentation for the API is also added to: Documentation/filesystems/caching/backend-api.txt This API is not usable without the implementation of the utility functions which will be added in further patches. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- Documentation/filesystems/caching/backend-api.txt | 664 ++++++++++++++++++++++ include/linux/fscache-cache.h | 509 +++++++++++++++++ 2 files changed, 1173 insertions(+) create mode 100644 Documentation/filesystems/caching/backend-api.txt create mode 100644 include/linux/fscache-cache.h (limited to 'include/linux') diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt new file mode 100644 index 000000000000..17723053aa91 --- /dev/null +++ b/Documentation/filesystems/caching/backend-api.txt @@ -0,0 +1,664 @@ + ========================== + FS-CACHE CACHE BACKEND API + ========================== + +The FS-Cache system provides an API by which actual caches can be supplied to +FS-Cache for it to then serve out to network filesystems and other interested +parties. + +This API is declared in . + + +==================================== +INITIALISING AND REGISTERING A CACHE +==================================== + +To start off, a cache definition must be initialised and registered for each +cache the backend wants to make available. For instance, CacheFS does this in +the fill_super() operation on mounting. + +The cache definition (struct fscache_cache) should be initialised by calling: + + void fscache_init_cache(struct fscache_cache *cache, + struct fscache_cache_ops *ops, + const char *idfmt, + ...); + +Where: + + (*) "cache" is a pointer to the cache definition; + + (*) "ops" is a pointer to the table of operations that the backend supports on + this cache; and + + (*) "idfmt" is a format and printf-style arguments for constructing a label + for the cache. + + +The cache should then be registered with FS-Cache by passing a pointer to the +previously initialised cache definition to: + + int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *fsdef, + const char *tagname); + +Two extra arguments should also be supplied: + + (*) "fsdef" which should point to the object representation for the FS-Cache + master index in this cache. Netfs primary index entries will be created + here. FS-Cache keeps the caller's reference to the index object if + successful and will release it upon withdrawal of the cache. + + (*) "tagname" which, if given, should be a text string naming this cache. If + this is NULL, the identifier will be used instead. For CacheFS, the + identifier is set to name the underlying block device and the tag can be + supplied by mount. + +This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag +is already in use. 0 will be returned on success. 
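
As a concrete illustration of the two calls just described, a backend's mount path might be shaped roughly as follows. This is an editorial sketch rather than code from the patch: my_cache_ops, my_alloc_fsdef_object() and my_free_fsdef_object() are hypothetical backend-side names standing in for whatever the cache driver actually provides.

	#include <linux/fscache-cache.h>

	static struct fscache_cache my_cache;
	static struct fscache_cache_ops my_cache_ops = {
		.name = "mycache",
		/* ... the operations described below ... */
	};

	/* sketch: register the cache at mount time, e.g. from fill_super() */
	static int my_register_cache(const char *bdev_name, const char *tag)
	{
		struct fscache_object *fsdef;
		int ret;

		/* label the cache after the device backing it */
		fscache_init_cache(&my_cache, &my_cache_ops,
				   "mycache(%s)", bdev_name);

		/* backend-private allocation of the object backing the
		 * FS-Cache master index; hypothetical helper */
		fsdef = my_alloc_fsdef_object(&my_cache);
		if (!fsdef)
			return -ENOMEM;
		fscache_object_init(fsdef);

		/* 0 on success, -ENOMEM or -EEXIST otherwise; on success
		 * FS-Cache keeps the reference to fsdef until withdrawal */
		ret = fscache_add_cache(&my_cache, fsdef, tag);
		if (ret < 0)
			my_free_fsdef_object(fsdef);	/* hypothetical cleanup */
		return ret;
	}
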
+ + +===================== +UNREGISTERING A CACHE +===================== + +A cache can be withdrawn from the system by calling this function with a +pointer to the cache definition: + + void fscache_withdraw_cache(struct fscache_cache *cache); + +In CacheFS's case, this is called by put_super(). + + +======== +SECURITY +======== + +The cache methods are executed one of two contexts: + + (1) that of the userspace process that issued the netfs operation that caused + the cache method to be invoked, or + + (2) that of one of the processes in the FS-Cache thread pool. + +In either case, this may not be an appropriate context in which to access the +cache. + +The calling process's fsuid, fsgid and SELinux security identities may need to +be masqueraded for the duration of the cache driver's access to the cache. +This is left to the cache to handle; FS-Cache makes no effort in this regard. + + +=================================== +CONTROL AND STATISTICS PRESENTATION +=================================== + +The cache may present data to the outside world through FS-Cache's interfaces +in sysfs and procfs - the former for control and the latter for statistics. + +A sysfs directory called /sys/fs/fscache// is created if CONFIG_SYSFS +is enabled. This is accessible through the kobject struct fscache_cache::kobj +and is for use by the cache as it sees fit. + +The cache driver may create itself a directory named for the cache type in the +/proc/fs/fscache/ directory. This is available if CONFIG_FSCACHE_PROC is +enabled and is accessible through: + + struct proc_dir_entry *proc_fscache; + + +======================== +RELEVANT DATA STRUCTURES +======================== + + (*) Index/Data file FS-Cache representation cookie: + + struct fscache_cookie { + struct fscache_object_def *def; + struct fscache_netfs *netfs; + void *netfs_data; + ... + }; + + The fields that might be of use to the backend describe the object + definition, the netfs definition and the netfs's data for this cookie. + The object definition contain functions supplied by the netfs for loading + and matching index entries; these are required to provide some of the + cache operations. + + + (*) In-cache object representation: + + struct fscache_object { + int debug_id; + enum { + FSCACHE_OBJECT_RECYCLING, + ... + } state; + spinlock_t lock + struct fscache_cache *cache; + struct fscache_cookie *cookie; + ... + }; + + Structures of this type should be allocated by the cache backend and + passed to FS-Cache when requested by the appropriate cache operation. In + the case of CacheFS, they're embedded in CacheFS's internal object + structures. + + The debug_id is a simple integer that can be used in debugging messages + that refer to a particular object. In such a case it should be printed + using "OBJ%x" to be consistent with FS-Cache. + + Each object contains a pointer to the cookie that represents the object it + is backing. An object should retired when put_object() is called if it is + in state FSCACHE_OBJECT_RECYCLING. The fscache_object struct should be + initialised by calling fscache_object_init(object). + + + (*) FS-Cache operation record: + + struct fscache_operation { + atomic_t usage; + struct fscache_object *object; + unsigned long flags; + #define FSCACHE_OP_EXCLUSIVE + void (*processor)(struct fscache_operation *op); + void (*release)(struct fscache_operation *op); + ... 
+ }; + + FS-Cache has a pool of threads that it uses to give CPU time to the + various asynchronous operations that need to be done as part of driving + the cache. These are represented by the above structure. The processor + method is called to give the op CPU time, and the release method to get + rid of it when its usage count reaches 0. + + An operation can be made exclusive upon an object by setting the + appropriate flag before enqueuing it with fscache_enqueue_operation(). If + an operation needs more processing time, it should be enqueued again. + + + (*) FS-Cache retrieval operation record: + + struct fscache_retrieval { + struct fscache_operation op; + struct address_space *mapping; + struct list_head *to_do; + ... + }; + + A structure of this type is allocated by FS-Cache to record retrieval and + allocation requests made by the netfs. This struct is then passed to the + backend to do the operation. The backend may get extra refs to it by + calling fscache_get_retrieval() and refs may be discarded by calling + fscache_put_retrieval(). + + A retrieval operation can be used by the backend to do retrieval work. To + do this, the retrieval->op.processor method pointer should be set + appropriately by the backend and fscache_enqueue_retrieval() called to + submit it to the thread pool. CacheFiles, for example, uses this to queue + page examination when it detects PG_lock being cleared. + + The to_do field is an empty list available for the cache backend to use as + it sees fit. + + + (*) FS-Cache storage operation record: + + struct fscache_storage { + struct fscache_operation op; + pgoff_t store_limit; + ... + }; + + A structure of this type is allocated by FS-Cache to record outstanding + writes to be made. FS-Cache itself enqueues this operation and invokes + the write_page() method on the object at appropriate times to effect + storage. + + +================ +CACHE OPERATIONS +================ + +The cache backend provides FS-Cache with a table of operations that can be +performed on the denizens of the cache. These are held in a structure of type: + + struct fscache_cache_ops + + (*) Name of cache provider [mandatory]: + + const char *name + + This isn't strictly an operation, but should be pointed at a string naming + the backend. + + + (*) Allocate a new object [mandatory]: + + struct fscache_object *(*alloc_object)(struct fscache_cache *cache, + struct fscache_cookie *cookie) + + This method is used to allocate a cache object representation to back a + cookie in a particular cache. fscache_object_init() should be called on + the object to initialise it prior to returning. + + This function may also be used to parse the index key to be used for + multiple lookup calls to turn it into a more convenient form. FS-Cache + will call the lookup_complete() method to allow the cache to release the + form once lookup is complete or aborted. + + + (*) Look up and create object [mandatory]: + + void (*lookup_object)(struct fscache_object *object) + + This method is used to look up an object, given that the object is already + allocated and attached to the cookie. This should instantiate that object + in the cache if it can. + + The method should call fscache_object_lookup_negative() as soon as + possible if it determines the object doesn't exist in the cache. If the + object is found to exist and the netfs indicates that it is valid then + fscache_obtained_object() should be called once the object is in a + position to have data stored in it. 
Similarly, fscache_obtained_object() + should also be called once a non-present object has been created. + + If a lookup error occurs, fscache_object_lookup_error() should be called + to abort the lookup of that object. + + + (*) Release lookup data [mandatory]: + + void (*lookup_complete)(struct fscache_object *object) + + This method is called to ask the cache to release any resources it was + using to perform a lookup. + + + (*) Increment object refcount [mandatory]: + + struct fscache_object *(*grab_object)(struct fscache_object *object) + + This method is called to increment the reference count on an object. It + may fail (for instance if the cache is being withdrawn) by returning NULL. + It should return the object pointer if successful. + + + (*) Lock/Unlock object [mandatory]: + + void (*lock_object)(struct fscache_object *object) + void (*unlock_object)(struct fscache_object *object) + + These methods are used to exclusively lock an object. It must be possible + to schedule with the lock held, so a spinlock isn't sufficient. + + + (*) Pin/Unpin object [optional]: + + int (*pin_object)(struct fscache_object *object) + void (*unpin_object)(struct fscache_object *object) + + These methods are used to pin an object into the cache. Once pinned an + object cannot be reclaimed to make space. Return -ENOSPC if there's not + enough space in the cache to permit this. + + + (*) Update object [mandatory]: + + int (*update_object)(struct fscache_object *object) + + This is called to update the index entry for the specified object. The + new information should be in object->cookie->netfs_data. This can be + obtained by calling object->cookie->def->get_aux()/get_attr(). + + + (*) Discard object [mandatory]: + + void (*drop_object)(struct fscache_object *object) + + This method is called to indicate that an object has been unbound from its + cookie, and that the cache should release the object's resources and + retire it if it's in state FSCACHE_OBJECT_RECYCLING. + + This method should not attempt to release any references held by the + caller. The caller will invoke the put_object() method as appropriate. + + + (*) Release object reference [mandatory]: + + void (*put_object)(struct fscache_object *object) + + This method is used to discard a reference to an object. The object may + be freed when all the references to it are released. + + + (*) Synchronise a cache [mandatory]: + + void (*sync)(struct fscache_cache *cache) + + This is called to ask the backend to synchronise a cache with its backing + device. + + + (*) Dissociate a cache [mandatory]: + + void (*dissociate_pages)(struct fscache_cache *cache) + + This is called to ask a cache to perform any page dissociations as part of + cache withdrawal. + + + (*) Notification that the attributes on a netfs file changed [mandatory]: + + int (*attr_changed)(struct fscache_object *object); + + This is called to indicate to the cache that certain attributes on a netfs + file have changed (for example the maximum size a file may reach). The + cache can read these from the netfs by calling the cookie's get_attr() + method. + + The cache may use the file size information to reserve space on the cache. + It should also call fscache_set_store_limit() to indicate to FS-Cache the + highest byte it's willing to store for an object. + + This method may return -ve if an error occurred or the cache object cannot + be expanded. In such a case, the object will be withdrawn from service. 
+ + This operation is run asynchronously from FS-Cache's thread pool, and + storage and retrieval operations from the netfs are excluded during the + execution of this operation. + + + (*) Reserve cache space for an object's data [optional]: + + int (*reserve_space)(struct fscache_object *object, loff_t size); + + This is called to request that cache space be reserved to hold the data + for an object and the metadata used to track it. Zero size should be + taken as request to cancel a reservation. + + This should return 0 if successful, -ENOSPC if there isn't enough space + available, or -ENOMEM or -EIO on other errors. + + The reservation may exceed the current size of the object, thus permitting + future expansion. If the amount of space consumed by an object would + exceed the reservation, it's permitted to refuse requests to allocate + pages, but not required. An object may be pruned down to its reservation + size if larger than that already. + + + (*) Request page be read from cache [mandatory]: + + int (*read_or_alloc_page)(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) + + This is called to attempt to read a netfs page from the cache, or to + reserve a backing block if not. FS-Cache will have done as much checking + as it can before calling, but most of the work belongs to the backend. + + If there's no page in the cache, then -ENODATA should be returned if the + backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it + didn't. + + If there is suitable data in the cache, then a read operation should be + queued and 0 returned. When the read finishes, fscache_end_io() should be + called. + + The fscache_mark_pages_cached() should be called for the page if any cache + metadata is retained. This will indicate to the netfs that the page needs + explicit uncaching. This operation takes a pagevec, thus allowing several + pages to be marked at once. + + The retrieval record pointed to by op should be retained for each page + queued and released when I/O on the page has been formally ended. + fscache_get/put_retrieval() are available for this purpose. + + The retrieval record may be used to get CPU time via the FS-Cache thread + pool. If this is desired, the op->op.processor should be set to point to + the appropriate processing routine, and fscache_enqueue_retrieval() should + be called at an appropriate point to request CPU time. For instance, the + retrieval routine could be enqueued upon the completion of a disk read. + The to_do field in the retrieval record is provided to aid in this. + + If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS + returned if possible or fscache_end_io() called with a suitable error + code.. + + + (*) Request pages be read from cache [mandatory]: + + int (*read_or_alloc_pages)(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) + + This is like the read_or_alloc_page() method, except it is handed a list + of pages instead of one page. Any pages on which a read operation is + started must be added to the page cache for the specified mapping and also + to the LRU. Such pages must also be removed from the pages list and + *nr_pages decremented per page. + + If there was an error such as -ENOMEM, then that should be returned; else + if one or more pages couldn't be read or allocated, then -ENOBUFS should + be returned; else if one or more pages couldn't be read, then -ENODATA + should be returned. If all the pages are dispatched then 0 should be + returned. 
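
To make the return-code contract of read_or_alloc_page() described above concrete, a backend implementation might be sketched like this. It is a hedged sketch only: my_block_present(), my_queue_read() and my_reserve_block() are hypothetical backend internals, not FS-Cache calls, and error handling is reduced to the minimum.

	#include <linux/fscache-cache.h>

	/* sketch of a backend read_or_alloc_page() method */
	static int my_read_or_alloc_page(struct fscache_retrieval *op,
					 struct page *page, gfp_t gfp)
	{
		struct fscache_object *object = op->op.object;

		if (my_block_present(object, page->index)) {
			/* data is in the cache: queue a read and return 0;
			 * my_queue_read() is expected to call
			 * fscache_end_io(op, page, error) on completion and
			 * then drop the extra ref taken here */
			fscache_get_retrieval(op);
			if (my_queue_read(object, op, page, gfp) < 0) {
				fscache_put_retrieval(op);
				fscache_io_error(object->cache);
				return -ENOBUFS;
			}
			return 0;
		}

		/* no data yet: reserve a backing block so that a later
		 * write_page() has somewhere to go */
		if (my_reserve_block(object, page->index, gfp) < 0)
			return -ENOBUFS;

		return -ENODATA;
	}
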
+ + + (*) Request page be allocated in the cache [mandatory]: + + int (*allocate_page)(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) + + This is like the read_or_alloc_page() method, except that it shouldn't + read from the cache, even if there's data there that could be retrieved. + It should, however, set up any internal metadata required such that + the write_page() method can write to the cache. + + If there's no backing block available, then -ENOBUFS should be returned + (or -ENOMEM if there were other problems). If a block is successfully + allocated, then the netfs page should be marked and 0 returned. + + + (*) Request pages be allocated in the cache [mandatory]: + + int (*allocate_pages)(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) + + This is an multiple page version of the allocate_page() method. pages and + nr_pages should be treated as for the read_or_alloc_pages() method. + + + (*) Request page be written to cache [mandatory]: + + int (*write_page)(struct fscache_storage *op, + struct page *page); + + This is called to write from a page on which there was a previously + successful read_or_alloc_page() call or similar. FS-Cache filters out + pages that don't have mappings. + + This method is called asynchronously from the FS-Cache thread pool. It is + not required to actually store anything, provided -ENODATA is then + returned to the next read of this page. + + If an error occurred, then a negative error code should be returned, + otherwise zero should be returned. FS-Cache will take appropriate action + in response to an error, such as withdrawing this object. + + If this method returns success then FS-Cache will inform the netfs + appropriately. + + + (*) Discard retained per-page metadata [mandatory]: + + void (*uncache_page)(struct fscache_object *object, struct page *page) + + This is called when a netfs page is being evicted from the pagecache. The + cache backend should tear down any internal representation or tracking it + maintains for this page. + + +================== +FS-CACHE UTILITIES +================== + +FS-Cache provides some utilities that a cache backend may make use of: + + (*) Note occurrence of an I/O error in a cache: + + void fscache_io_error(struct fscache_cache *cache) + + This tells FS-Cache that an I/O error occurred in the cache. After this + has been called, only resource dissociation operations (object and page + release) will be passed from the netfs to the cache backend for the + specified cache. + + This does not actually withdraw the cache. That must be done separately. + + + (*) Invoke the retrieval I/O completion function: + + void fscache_end_io(struct fscache_retrieval *op, struct page *page, + int error); + + This is called to note the end of an attempt to retrieve a page. The + error value should be 0 if successful and an error otherwise. + + + (*) Set highest store limit: + + void fscache_set_store_limit(struct fscache_object *object, + loff_t i_size); + + This sets the limit FS-Cache imposes on the highest byte it's willing to + try and store for a netfs. Any page over this limit is automatically + rejected by fscache_read_alloc_page() and co with -ENOBUFS. + + + (*) Mark pages as being cached: + + void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec); + + This marks a set of pages as being cached. After this has been called, + the netfs must call fscache_uncache_page() to unmark the pages. 
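
Because fscache_mark_pages_cached() takes a pagevec rather than a single page, a backend will usually batch the pages it decides to track. The fragment below is a sketch of that batching, assuming the two-argument pagevec_init() of this kernel generation; my_retain_metadata() and my_mark_tracked_pages() are hypothetical backend helpers.

	#include <linux/pagevec.h>

	/* sketch: batch netfs pages and mark them as cached */
	static void my_mark_tracked_pages(struct fscache_retrieval *op,
					  struct page **pages, unsigned nr)
	{
		struct pagevec pvec;
		unsigned i;

		pagevec_init(&pvec, 0);
		for (i = 0; i < nr; i++) {
			if (!my_retain_metadata(op->op.object, pages[i]))
				continue;
			/* pagevec_add() returns 0 once the vector is full */
			if (!pagevec_add(&pvec, pages[i])) {
				fscache_mark_pages_cached(op, &pvec);
				pagevec_reinit(&pvec);
			}
		}
		if (pagevec_count(&pvec))
			fscache_mark_pages_cached(op, &pvec);
	}
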
+ + + (*) Perform coherency check on an object: + + enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, + uint16_t datalen); + + This asks the netfs to perform a coherency check on an object that has + just been looked up. The cookie attached to the object will determine the + netfs to use. data and datalen should specify where the auxiliary data + retrieved from the cache can be found. + + One of three values will be returned: + + (*) FSCACHE_CHECKAUX_OKAY + + The coherency data indicates the object is valid as is. + + (*) FSCACHE_CHECKAUX_NEEDS_UPDATE + + The coherency data needs updating, but otherwise the object is + valid. + + (*) FSCACHE_CHECKAUX_OBSOLETE + + The coherency data indicates that the object is obsolete and should + be discarded. + + + (*) Initialise a freshly allocated object: + + void fscache_object_init(struct fscache_object *object); + + This initialises all the fields in an object representation. + + + (*) Indicate the destruction of an object: + + void fscache_object_destroyed(struct fscache_cache *cache); + + This must be called to inform FS-Cache that an object that belonged to a + cache has been destroyed and deallocated. This will allow continuation + of the cache withdrawal process when it is stopped pending destruction of + all the objects. + + + (*) Indicate negative lookup on an object: + + void fscache_object_lookup_negative(struct fscache_object *object); + + This is called to indicate to FS-Cache that a lookup process for an object + found a negative result. + + This changes the state of an object to permit reads pending on lookup + completion to go off and start fetching data from the netfs server as it's + known at this point that there can't be any data in the cache. + + This may be called multiple times on an object. Only the first call is + significant - all subsequent calls are ignored. + + + (*) Indicate an object has been obtained: + + void fscache_obtained_object(struct fscache_object *object); + + This is called to indicate to FS-Cache that a lookup process for an object + produced a positive result, or that an object was created. This should + only be called once for any particular object. + + This changes the state of an object to indicate: + + (1) if no call to fscache_object_lookup_negative() has been made on + this object, that there may be data available, and that reads can + now go and look for it; and + + (2) that writes may now proceed against this object. + + + (*) Indicate that object lookup failed: + + void fscache_object_lookup_error(struct fscache_object *object); + + This marks an object as having encountered a fatal error (usually EIO) + and causes it to move into a state whereby it will be withdrawn as soon + as possible. + + + (*) Get and release references on a retrieval record: + + void fscache_get_retrieval(struct fscache_retrieval *op); + void fscache_put_retrieval(struct fscache_retrieval *op); + + These two functions are used to retain a retrieval record whilst doing + asynchronous data retrieval and block allocation. + + + (*) Enqueue a retrieval record for processing. + + void fscache_enqueue_retrieval(struct fscache_retrieval *op); + + This enqueues a retrieval record for processing by the FS-Cache thread + pool. One of the threads in the pool will invoke the retrieval record's + op->op.processor callback function. This function may be called from + within the callback function. 
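
Putting the retrieval-record utilities together, the deferred-processing pattern described above might be sketched as follows. The trigger and the my_pop_ready_page() helper are hypothetical; a real backend (CacheFiles, for instance) would react to its own backing I/O events and could track pending work on the retrieval record's to_do list.

	#include <linux/fscache-cache.h>

	/* sketch: processor routine run by the FS-Cache thread pool */
	static void my_retrieval_work(struct fscache_operation *_op)
	{
		struct fscache_retrieval *op =
			container_of(_op, struct fscache_retrieval, op);
		struct page *page;

		/* hand back each netfs page whose backing I/O has finished */
		while ((page = my_pop_ready_page(op)) != NULL)
			fscache_end_io(op, page, 0);

		/* drop the reference taken when the op was enqueued */
		fscache_put_retrieval(op);
	}

	/* sketch: called from a backend event, e.g. a backing page unlock */
	static void my_kick_retrieval(struct fscache_retrieval *op)
	{
		op->op.processor = my_retrieval_work;
		fscache_get_retrieval(op);
		fscache_enqueue_retrieval(op);
	}
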
+ + + (*) List of object state names: + + const char *fscache_object_states[]; + + For debugging purposes, this may be used to turn the state that an object + is in into a text string for display purposes. diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h new file mode 100644 index 000000000000..b2a9a484c4cf --- /dev/null +++ b/include/linux/fscache-cache.h @@ -0,0 +1,509 @@ +/* General filesystem caching backing cache interface + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * NOTE!!! See: + * + * Documentation/filesystems/caching/backend-api.txt + * + * for a description of the cache backend interface declared here. + */ + +#ifndef _LINUX_FSCACHE_CACHE_H +#define _LINUX_FSCACHE_CACHE_H + +#include +#include +#include + +#define NR_MAXCACHES BITS_PER_LONG + +struct fscache_cache; +struct fscache_cache_ops; +struct fscache_object; +struct fscache_operation; + +#ifdef CONFIG_FSCACHE_PROC +extern struct proc_dir_entry *proc_fscache; +#endif + +/* + * cache tag definition + */ +struct fscache_cache_tag { + struct list_head link; + struct fscache_cache *cache; /* cache referred to by this tag */ + unsigned long flags; +#define FSCACHE_TAG_RESERVED 0 /* T if tag is reserved for a cache */ + atomic_t usage; + char name[0]; /* tag name */ +}; + +/* + * cache definition + */ +struct fscache_cache { + const struct fscache_cache_ops *ops; + struct fscache_cache_tag *tag; /* tag representing this cache */ + struct kobject *kobj; /* system representation of this cache */ + struct list_head link; /* link in list of caches */ + size_t max_index_size; /* maximum size of index data */ + char identifier[36]; /* cache label */ + + /* node management */ + struct work_struct op_gc; /* operation garbage collector */ + struct list_head object_list; /* list of data/index objects */ + struct list_head op_gc_list; /* list of ops to be deleted */ + spinlock_t object_list_lock; + spinlock_t op_gc_list_lock; + atomic_t object_count; /* no. 
of live objects in this cache */ + struct fscache_object *fsdef; /* object for the fsdef index */ + unsigned long flags; +#define FSCACHE_IOERROR 0 /* cache stopped on I/O error */ +#define FSCACHE_CACHE_WITHDRAWN 1 /* cache has been withdrawn */ +}; + +extern wait_queue_head_t fscache_cache_cleared_wq; + +/* + * operation to be applied to a cache object + * - retrieval initiation operations are done in the context of the process + * that issued them, and not in an async thread pool + */ +typedef void (*fscache_operation_release_t)(struct fscache_operation *op); +typedef void (*fscache_operation_processor_t)(struct fscache_operation *op); + +struct fscache_operation { + union { + struct work_struct fast_work; /* record for fast ops */ + struct slow_work slow_work; /* record for (very) slow ops */ + }; + struct list_head pend_link; /* link in object->pending_ops */ + struct fscache_object *object; /* object to be operated upon */ + + unsigned long flags; +#define FSCACHE_OP_TYPE 0x000f /* operation type */ +#define FSCACHE_OP_FAST 0x0001 /* - fast op, processor may not sleep for disk */ +#define FSCACHE_OP_SLOW 0x0002 /* - (very) slow op, processor may sleep for disk */ +#define FSCACHE_OP_MYTHREAD 0x0003 /* - processing is done be issuing thread, not pool */ +#define FSCACHE_OP_WAITING 4 /* cleared when op is woken */ +#define FSCACHE_OP_EXCLUSIVE 5 /* exclusive op, other ops must wait */ +#define FSCACHE_OP_DEAD 6 /* op is now dead */ + + atomic_t usage; + unsigned debug_id; /* debugging ID */ + + /* operation processor callback + * - can be NULL if FSCACHE_OP_WAITING is going to be used to perform + * the op in a non-pool thread */ + fscache_operation_processor_t processor; + + /* operation releaser */ + fscache_operation_release_t release; +}; + +extern atomic_t fscache_op_debug_id; +extern const struct slow_work_ops fscache_op_slow_work_ops; + +extern void fscache_enqueue_operation(struct fscache_operation *); +extern void fscache_put_operation(struct fscache_operation *); + +/** + * fscache_operation_init - Do basic initialisation of an operation + * @op: The operation to initialise + * @release: The release function to assign + * + * Do basic initialisation of an operation. The caller must still set flags, + * object, either fast_work or slow_work if necessary, and processor if needed. + */ +static inline void fscache_operation_init(struct fscache_operation *op, + fscache_operation_release_t release) +{ + atomic_set(&op->usage, 1); + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + op->release = release; + INIT_LIST_HEAD(&op->pend_link); +} + +/** + * fscache_operation_init_slow - Do additional initialisation of a slow op + * @op: The operation to initialise + * @processor: The processor function to assign + * + * Do additional initialisation of an operation as required for slow work. 
+ */ +static inline +void fscache_operation_init_slow(struct fscache_operation *op, + fscache_operation_processor_t processor) +{ + op->processor = processor; + slow_work_init(&op->slow_work, &fscache_op_slow_work_ops); +} + +/* + * data read operation + */ +struct fscache_retrieval { + struct fscache_operation op; + struct address_space *mapping; /* netfs pages */ + fscache_rw_complete_t end_io_func; /* function to call on I/O completion */ + void *context; /* netfs read context (pinned) */ + struct list_head to_do; /* list of things to be done by the backend */ + unsigned long start_time; /* time at which retrieval started */ +}; + +typedef int (*fscache_page_retrieval_func_t)(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp); + +typedef int (*fscache_pages_retrieval_func_t)(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp); + +/** + * fscache_get_retrieval - Get an extra reference on a retrieval operation + * @op: The retrieval operation to get a reference on + * + * Get an extra reference on a retrieval operation. + */ +static inline +struct fscache_retrieval *fscache_get_retrieval(struct fscache_retrieval *op) +{ + atomic_inc(&op->op.usage); + return op; +} + +/** + * fscache_enqueue_retrieval - Enqueue a retrieval operation for processing + * @op: The retrieval operation affected + * + * Enqueue a retrieval operation for processing by the FS-Cache thread pool. + */ +static inline void fscache_enqueue_retrieval(struct fscache_retrieval *op) +{ + fscache_enqueue_operation(&op->op); +} + +/** + * fscache_put_retrieval - Drop a reference to a retrieval operation + * @op: The retrieval operation affected + * + * Drop a reference to a retrieval operation. + */ +static inline void fscache_put_retrieval(struct fscache_retrieval *op) +{ + fscache_put_operation(&op->op); +} + +/* + * cached page storage work item + * - used to do three things: + * - batch writes to the cache + * - do cache writes asynchronously + * - defer writes until cache object lookup completion + */ +struct fscache_storage { + struct fscache_operation op; + pgoff_t store_limit; /* don't write more than this */ +}; + +/* + * cache operations + */ +struct fscache_cache_ops { + /* name of cache provider */ + const char *name; + + /* allocate an object record for a cookie */ + struct fscache_object *(*alloc_object)(struct fscache_cache *cache, + struct fscache_cookie *cookie); + + /* look up the object for a cookie */ + void (*lookup_object)(struct fscache_object *object); + + /* finished looking up */ + void (*lookup_complete)(struct fscache_object *object); + + /* increment the usage count on this object (may fail if unmounting) */ + struct fscache_object *(*grab_object)(struct fscache_object *object); + + /* pin an object in the cache */ + int (*pin_object)(struct fscache_object *object); + + /* unpin an object in the cache */ + void (*unpin_object)(struct fscache_object *object); + + /* store the updated auxilliary data on an object */ + void (*update_object)(struct fscache_object *object); + + /* discard the resources pinned by an object and effect retirement if + * necessary */ + void (*drop_object)(struct fscache_object *object); + + /* dispose of a reference to an object */ + void (*put_object)(struct fscache_object *object); + + /* sync a cache */ + void (*sync_cache)(struct fscache_cache *cache); + + /* notification that the attributes of a non-index object (such as + * i_size) have changed */ + int (*attr_changed)(struct fscache_object *object); + + /* reserve 
space for an object's data and associated metadata */ + int (*reserve_space)(struct fscache_object *object, loff_t i_size); + + /* request a backing block for a page be read or allocated in the + * cache */ + fscache_page_retrieval_func_t read_or_alloc_page; + + /* request backing blocks for a list of pages be read or allocated in + * the cache */ + fscache_pages_retrieval_func_t read_or_alloc_pages; + + /* request a backing block for a page be allocated in the cache so that + * it can be written directly */ + fscache_page_retrieval_func_t allocate_page; + + /* request backing blocks for pages be allocated in the cache so that + * they can be written directly */ + fscache_pages_retrieval_func_t allocate_pages; + + /* write a page to its backing block in the cache */ + int (*write_page)(struct fscache_storage *op, struct page *page); + + /* detach backing block from a page (optional) + * - must release the cookie lock before returning + * - may sleep + */ + void (*uncache_page)(struct fscache_object *object, + struct page *page); + + /* dissociate a cache from all the pages it was backing */ + void (*dissociate_pages)(struct fscache_cache *cache); +}; + +/* + * data file or index object cookie + * - a file will only appear in one cache + * - a request to cache a file may or may not be honoured, subject to + * constraints such as disk space + * - indices are created on disk just-in-time + */ +struct fscache_cookie { + atomic_t usage; /* number of users of this cookie */ + atomic_t n_children; /* number of children of this cookie */ + spinlock_t lock; + struct hlist_head backing_objects; /* object(s) backing this file/index */ + const struct fscache_cookie_def *def; /* definition */ + struct fscache_cookie *parent; /* parent of this entry */ + void *netfs_data; /* back pointer to netfs */ + struct radix_tree_root stores; /* pages to be stored on this cookie */ +#define FSCACHE_COOKIE_PENDING_TAG 0 /* pages tag: pending write to cache */ + + unsigned long flags; +#define FSCACHE_COOKIE_LOOKING_UP 0 /* T if non-index cookie being looked up still */ +#define FSCACHE_COOKIE_CREATING 1 /* T if non-index object being created still */ +#define FSCACHE_COOKIE_NO_DATA_YET 2 /* T if new object with no cached data yet */ +#define FSCACHE_COOKIE_PENDING_FILL 3 /* T if pending initial fill on object */ +#define FSCACHE_COOKIE_FILLING 4 /* T if filling object incrementally */ +#define FSCACHE_COOKIE_UNAVAILABLE 5 /* T if cookie is unavailable (error, etc) */ +}; + +extern struct fscache_cookie fscache_fsdef_index; + +/* + * on-disk cache file or index handle + */ +struct fscache_object { + enum fscache_object_state { + FSCACHE_OBJECT_INIT, /* object in initial unbound state */ + FSCACHE_OBJECT_LOOKING_UP, /* looking up object */ + FSCACHE_OBJECT_CREATING, /* creating object */ + + /* active states */ + FSCACHE_OBJECT_AVAILABLE, /* cleaning up object after creation */ + FSCACHE_OBJECT_ACTIVE, /* object is usable */ + FSCACHE_OBJECT_UPDATING, /* object is updating */ + + /* terminal states */ + FSCACHE_OBJECT_DYING, /* object waiting for accessors to finish */ + FSCACHE_OBJECT_LC_DYING, /* object cleaning up after lookup/create */ + FSCACHE_OBJECT_ABORT_INIT, /* abort the init state */ + FSCACHE_OBJECT_RELEASING, /* releasing object */ + FSCACHE_OBJECT_RECYCLING, /* retiring object */ + FSCACHE_OBJECT_WITHDRAWING, /* withdrawing object */ + FSCACHE_OBJECT_DEAD, /* object is now dead */ + } state; + + int debug_id; /* debugging ID */ + int n_children; /* number of child objects */ + int n_ops; /* number of 
ops outstanding on object */ + int n_obj_ops; /* number of object ops outstanding on object */ + int n_in_progress; /* number of ops in progress */ + int n_exclusive; /* number of exclusive ops queued */ + spinlock_t lock; /* state and operations lock */ + + unsigned long lookup_jif; /* time at which lookup started */ + unsigned long event_mask; /* events this object is interested in */ + unsigned long events; /* events to be processed by this object + * (order is important - using fls) */ +#define FSCACHE_OBJECT_EV_REQUEUE 0 /* T if object should be requeued */ +#define FSCACHE_OBJECT_EV_UPDATE 1 /* T if object should be updated */ +#define FSCACHE_OBJECT_EV_CLEARED 2 /* T if accessors all gone */ +#define FSCACHE_OBJECT_EV_ERROR 3 /* T if fatal error occurred during processing */ +#define FSCACHE_OBJECT_EV_RELEASE 4 /* T if netfs requested object release */ +#define FSCACHE_OBJECT_EV_RETIRE 5 /* T if netfs requested object retirement */ +#define FSCACHE_OBJECT_EV_WITHDRAW 6 /* T if cache requested object withdrawal */ + + unsigned long flags; +#define FSCACHE_OBJECT_LOCK 0 /* T if object is busy being processed */ +#define FSCACHE_OBJECT_PENDING_WRITE 1 /* T if object has pending write */ +#define FSCACHE_OBJECT_WAITING 2 /* T if object is waiting on its parent */ + + struct list_head cache_link; /* link in cache->object_list */ + struct hlist_node cookie_link; /* link in cookie->backing_objects */ + struct fscache_cache *cache; /* cache that supplied this object */ + struct fscache_cookie *cookie; /* netfs's file/index object */ + struct fscache_object *parent; /* parent object */ + struct slow_work work; /* attention scheduling record */ + struct list_head dependents; /* FIFO of dependent objects */ + struct list_head dep_link; /* link in parent's dependents list */ + struct list_head pending_ops; /* unstarted operations on this object */ + pgoff_t store_limit; /* current storage limit */ +}; + +extern const char *fscache_object_states[]; + +#define fscache_object_is_active(obj) \ + (!test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) && \ + (obj)->state >= FSCACHE_OBJECT_AVAILABLE && \ + (obj)->state < FSCACHE_OBJECT_DYING) + +extern const struct slow_work_ops fscache_object_slow_work_ops; + +/** + * fscache_object_init - Initialise a cache object description + * @object: Object description + * + * Initialise a cache object description to its basic values. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. 
+ */ +static inline +void fscache_object_init(struct fscache_object *object, + struct fscache_cookie *cookie, + struct fscache_cache *cache) +{ + atomic_inc(&cache->object_count); + + object->state = FSCACHE_OBJECT_INIT; + spin_lock_init(&object->lock); + INIT_LIST_HEAD(&object->cache_link); + INIT_HLIST_NODE(&object->cookie_link); + vslow_work_init(&object->work, &fscache_object_slow_work_ops); + INIT_LIST_HEAD(&object->dependents); + INIT_LIST_HEAD(&object->dep_link); + INIT_LIST_HEAD(&object->pending_ops); + object->n_children = 0; + object->n_ops = object->n_in_progress = object->n_exclusive = 0; + object->events = object->event_mask = 0; + object->flags = 0; + object->store_limit = 0; + object->cache = cache; + object->cookie = cookie; + object->parent = NULL; +} + +extern void fscache_object_lookup_negative(struct fscache_object *object); +extern void fscache_obtained_object(struct fscache_object *object); + +/** + * fscache_object_destroyed - Note destruction of an object in a cache + * @cache: The cache from which the object came + * + * Note the destruction and deallocation of an object record in a cache. + */ +static inline void fscache_object_destroyed(struct fscache_cache *cache) +{ + if (atomic_dec_and_test(&cache->object_count)) + wake_up_all(&fscache_cache_cleared_wq); +} + +/** + * fscache_object_lookup_error - Note an object encountered an error + * @object: The object on which the error was encountered + * + * Note that an object encountered a fatal error (usually an I/O error) and + * that it should be withdrawn as soon as possible. + */ +static inline void fscache_object_lookup_error(struct fscache_object *object) +{ + set_bit(FSCACHE_OBJECT_EV_ERROR, &object->events); +} + +/** + * fscache_set_store_limit - Set the maximum size to be stored in an object + * @object: The object to set the maximum on + * @i_size: The limit to set in bytes + * + * Set the maximum size an object is permitted to reach, implying the highest + * byte that may be written. Intended to be called by the attr_changed() op. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +static inline +void fscache_set_store_limit(struct fscache_object *object, loff_t i_size) +{ + object->store_limit = i_size >> PAGE_SHIFT; + if (i_size & ~PAGE_MASK) + object->store_limit++; +} + +/** + * fscache_end_io - End a retrieval operation on a page + * @op: The FS-Cache operation covering the retrieval + * @page: The page that was to be fetched + * @error: The error code (0 if successful) + * + * Note the end of an operation to retrieve a page, as covered by a particular + * operation record. + */ +static inline void fscache_end_io(struct fscache_retrieval *op, + struct page *page, int error) +{ + op->end_io_func(page, op->context, error); +} + +/* + * out-of-line cache backend functions + */ +extern void fscache_init_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + const char *idfmt, + ...) 
__attribute__ ((format (printf, 3, 4))); + +extern int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *fsdef, + const char *tagname); +extern void fscache_withdraw_cache(struct fscache_cache *cache); + +extern void fscache_io_error(struct fscache_cache *cache); + +extern void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec); + +extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, + uint16_t datalen); + +#endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit v1.2.3-71-gd317 From 7394daa8c61dfda4baa687f133748fa0b599b017 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:37 +0100 Subject: FS-Cache: Add use of /proc and presentation of statistics Make FS-Cache create its /proc interface and present various statistical information through it. Also provide the functions for updating this information. These features are enabled by: CONFIG_FSCACHE_PROC CONFIG_FSCACHE_STATS CONFIG_FSCACHE_HISTOGRAM The /proc directory for FS-Cache is also exported so that caching modules can add their own statistics there too. The FS-Cache module is loadable at this point, and the statistics files can be examined by userspace: cat /proc/fs/fscache/stats cat /proc/fs/fscache/histogram Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- Documentation/filesystems/caching/backend-api.txt | 6 - Documentation/filesystems/caching/fscache.txt | 12 +- fs/fscache/Kconfig | 34 ++++ fs/fscache/Makefile | 4 + fs/fscache/histogram.c | 109 +++++++++++ fs/fscache/internal.h | 127 +++++++++++++ fs/fscache/main.c | 7 + fs/fscache/proc.c | 68 +++++++ fs/fscache/stats.c | 212 ++++++++++++++++++++++ include/linux/fscache-cache.h | 4 - 10 files changed, 566 insertions(+), 17 deletions(-) create mode 100644 fs/fscache/histogram.c create mode 100644 fs/fscache/proc.c create mode 100644 fs/fscache/stats.c (limited to 'include/linux') diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt index 17723053aa91..382d52cdaf2d 100644 --- a/Documentation/filesystems/caching/backend-api.txt +++ b/Documentation/filesystems/caching/backend-api.txt @@ -100,12 +100,6 @@ A sysfs directory called /sys/fs/fscache// is created if CONFIG_SYSFS is enabled. This is accessible through the kobject struct fscache_cache::kobj and is for use by the cache as it sees fit. -The cache driver may create itself a directory named for the cache type in the -/proc/fs/fscache/ directory. This is available if CONFIG_FSCACHE_PROC is -enabled and is accessible through: - - struct proc_dir_entry *proc_fscache; - ======================== RELEVANT DATA STRUCTURES diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt index a759d916273e..0a751f3c2c70 100644 --- a/Documentation/filesystems/caching/fscache.txt +++ b/Documentation/filesystems/caching/fscache.txt @@ -195,7 +195,6 @@ STATISTICAL INFORMATION If FS-Cache is compiled with the following options enabled: - CONFIG_FSCACHE_PROC=y (implied by the following two) CONFIG_FSCACHE_STATS=y CONFIG_FSCACHE_HISTOGRAM=y @@ -275,7 +274,7 @@ proc files. 
(*) /proc/fs/fscache/histogram cat /proc/fs/fscache/histogram - +HZ +TIME OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS + JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS ===== ===== ========= ========= ========= ========= ========= This shows the breakdown of the number of times each amount of time @@ -291,16 +290,16 @@ proc files. RETRIEVLS Time between beginning and end of a retrieval Each row shows the number of events that took a particular range of times. - Each step is 1 jiffy in size. The +HZ column indicates the particular - jiffy range covered, and the +TIME field the equivalent number of seconds. + Each step is 1 jiffy in size. The JIFS column indicates the particular + jiffy range covered, and the SECS field the equivalent number of seconds. ========= DEBUGGING ========= -The FS-Cache facility can have runtime debugging enabled by adjusting the value -in: +If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime +debugging enabled by adjusting the value in: /sys/module/fscache/parameters/debug @@ -327,4 +326,3 @@ the control file. For example: echo $((1|8|64)) >/sys/module/fscache/parameters/debug will turn on all function entry debugging. - diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 7c7bccd5eee4..9bbb8ce7bea0 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -11,6 +11,40 @@ config FSCACHE See Documentation/filesystems/caching/fscache.txt for more information. +config FSCACHE_STATS + bool "Gather statistical information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + +config FSCACHE_HISTOGRAM + bool "Gather latency information on local caching" + depends on FSCACHE && PROC_FS + help + This option causes latency information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/fscache.txt for more information. + config FSCACHE_DEBUG bool "Debug FS-Cache" depends on FSCACHE diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index f8038b83e0ef..1384823a160c 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -5,4 +5,8 @@ fscache-y := \ main.o +fscache-$(CONFIG_PROC_FS) += proc.o +fscache-$(CONFIG_FSCACHE_STATS) += stats.o +fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o + obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c new file mode 100644 index 000000000000..bad496748a59 --- /dev/null +++ b/fs/fscache/histogram.c @@ -0,0 +1,109 @@ +/* FS-Cache latency histogram + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +atomic_t fscache_obj_instantiate_histogram[HZ]; +atomic_t fscache_objs_histogram[HZ]; +atomic_t fscache_ops_histogram[HZ]; +atomic_t fscache_retrieval_delay_histogram[HZ]; +atomic_t fscache_retrieval_histogram[HZ]; + +/* + * display the time-taken histogram + */ +static int fscache_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned n[5], t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " + " RETRV DLY RETRIEVLS\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========" + " ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]); + n[1] = atomic_read(&fscache_ops_histogram[index]); + n[2] = atomic_read(&fscache_objs_histogram[index]); + n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]); + n[4] = atomic_read(&fscache_retrieval_histogram[index]); + if (!(n[0] | n[1] | n[2] | n[3] | n[4])) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n", + index, t, n[0], n[1], n[2], n[3], n[4]); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? 
+ NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void fscache_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations fscache_histogram_ops = { + .start = fscache_histogram_start, + .stop = fscache_histogram_stop, + .next = fscache_histogram_next, + .show = fscache_histogram_show, +}; + +/* + * open "/proc/fs/fscache/histogram" to provide latency data + */ +static int fscache_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fscache_histogram_ops); +} + +const struct file_operations fscache_histogram_fops = { + .owner = THIS_MODULE, + .open = fscache_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 95dc92da7152..16f9f1f46e4d 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -27,6 +27,30 @@ #define FSCACHE_MIN_THREADS 4 #define FSCACHE_MAX_THREADS 32 +/* + * fsc-histogram.c + */ +#ifdef CONFIG_FSCACHE_HISTOGRAM +extern atomic_t fscache_obj_instantiate_histogram[HZ]; +extern atomic_t fscache_objs_histogram[HZ]; +extern atomic_t fscache_ops_histogram[HZ]; +extern atomic_t fscache_retrieval_delay_histogram[HZ]; +extern atomic_t fscache_retrieval_histogram[HZ]; + +static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +extern const struct file_operations fscache_histogram_fops; + +#else +#define fscache_hist(hist, start_jif) do {} while (0) +#endif + /* * fsc-main.c */ @@ -35,6 +59,109 @@ extern unsigned fscache_defer_create; extern unsigned fscache_debug; extern struct kobject *fscache_root; +/* + * fsc-proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * fsc-stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; +extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS]; + +extern atomic_t fscache_n_op_pend; +extern atomic_t fscache_n_op_run; +extern atomic_t fscache_n_op_enqueue; +extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_release; +extern atomic_t fscache_n_op_gc; + +extern atomic_t fscache_n_attr_changed; +extern atomic_t fscache_n_attr_changed_ok; +extern atomic_t fscache_n_attr_changed_nobufs; +extern atomic_t fscache_n_attr_changed_nomem; +extern atomic_t fscache_n_attr_changed_calls; + +extern atomic_t fscache_n_allocs; +extern atomic_t fscache_n_allocs_ok; +extern atomic_t fscache_n_allocs_wait; +extern atomic_t fscache_n_allocs_nobufs; +extern atomic_t fscache_n_alloc_ops; +extern atomic_t fscache_n_alloc_op_waits; + +extern atomic_t fscache_n_retrievals; +extern atomic_t fscache_n_retrievals_ok; +extern atomic_t fscache_n_retrievals_wait; +extern atomic_t fscache_n_retrievals_nodata; +extern atomic_t fscache_n_retrievals_nobufs; +extern atomic_t fscache_n_retrievals_intr; +extern atomic_t fscache_n_retrievals_nomem; +extern atomic_t fscache_n_retrieval_ops; +extern atomic_t fscache_n_retrieval_op_waits; + +extern atomic_t fscache_n_stores; +extern atomic_t fscache_n_stores_ok; +extern atomic_t fscache_n_stores_again; +extern atomic_t fscache_n_stores_nobufs; +extern atomic_t fscache_n_stores_oom; +extern atomic_t fscache_n_store_ops; +extern atomic_t 
fscache_n_store_calls; + +extern atomic_t fscache_n_marks; +extern atomic_t fscache_n_uncaches; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_null; +extern atomic_t fscache_n_acquires_no_cache; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_nobufs; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_updates; +extern atomic_t fscache_n_updates_null; +extern atomic_t fscache_n_updates_run; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_null; +extern atomic_t fscache_n_relinquishes_waitcrt; + +extern atomic_t fscache_n_cookie_index; +extern atomic_t fscache_n_cookie_data; +extern atomic_t fscache_n_cookie_special; + +extern atomic_t fscache_n_object_alloc; +extern atomic_t fscache_n_object_no_alloc; +extern atomic_t fscache_n_object_lookups; +extern atomic_t fscache_n_object_lookups_negative; +extern atomic_t fscache_n_object_lookups_positive; +extern atomic_t fscache_n_object_created; +extern atomic_t fscache_n_object_avail; +extern atomic_t fscache_n_object_dead; + +extern atomic_t fscache_n_checkaux_none; +extern atomic_t fscache_n_checkaux_okay; +extern atomic_t fscache_n_checkaux_update; +extern atomic_t fscache_n_checkaux_obsolete; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +extern const struct file_operations fscache_stats_fops; +#else + +#define fscache_stat(stat) do {} while (0) +#endif + /*****************************************************************************/ /* * debug tracing diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 76f7c69079c0..7c734b7fb18e 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -52,9 +52,15 @@ static int __init fscache_init(void) if (ret < 0) goto error_slow_work; + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; + printk(KERN_NOTICE "FS-Cache: Loaded\n"); return 0; +error_proc: + slow_work_unregister_user(); error_slow_work: return ret; } @@ -68,6 +74,7 @@ static void __exit fscache_exit(void) { _enter(""); + fscache_proc_cleanup(); slow_work_unregister_user(); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c new file mode 100644 index 000000000000..beeab44bc31a --- /dev/null +++ b/fs/fscache/proc.c @@ -0,0 +1,68 @@ +/* FS-Cache statistics viewing interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include +#include +#include +#include "internal.h" + +/* + * initialise the /proc/fs/fscache/ directory + */ +int __init fscache_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/fscache", NULL)) + goto error_dir; + +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, + &fscache_stats_fops)) + goto error_stats; +#endif + +#ifdef CONFIG_FSCACHE_HISTOGRAM + if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_fops)) + goto error_histogram; +#endif + + _leave(" = 0"); + return 0; + +#ifdef CONFIG_FSCACHE_HISTOGRAM +error_histogram: +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +error_stats: +#endif + remove_proc_entry("fs/fscache", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/fscache/ directory + */ +void fscache_proc_cleanup(void) +{ +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +#endif + remove_proc_entry("fs/fscache", NULL); +} diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c new file mode 100644 index 000000000000..65deb99e756b --- /dev/null +++ b/fs/fscache/stats.c @@ -0,0 +1,212 @@ +/* FS-Cache statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +/* + * operation counters + */ +atomic_t fscache_n_op_pend; +atomic_t fscache_n_op_run; +atomic_t fscache_n_op_enqueue; +atomic_t fscache_n_op_requeue; +atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_release; +atomic_t fscache_n_op_gc; + +atomic_t fscache_n_attr_changed; +atomic_t fscache_n_attr_changed_ok; +atomic_t fscache_n_attr_changed_nobufs; +atomic_t fscache_n_attr_changed_nomem; +atomic_t fscache_n_attr_changed_calls; + +atomic_t fscache_n_allocs; +atomic_t fscache_n_allocs_ok; +atomic_t fscache_n_allocs_wait; +atomic_t fscache_n_allocs_nobufs; +atomic_t fscache_n_alloc_ops; +atomic_t fscache_n_alloc_op_waits; + +atomic_t fscache_n_retrievals; +atomic_t fscache_n_retrievals_ok; +atomic_t fscache_n_retrievals_wait; +atomic_t fscache_n_retrievals_nodata; +atomic_t fscache_n_retrievals_nobufs; +atomic_t fscache_n_retrievals_intr; +atomic_t fscache_n_retrievals_nomem; +atomic_t fscache_n_retrieval_ops; +atomic_t fscache_n_retrieval_op_waits; + +atomic_t fscache_n_stores; +atomic_t fscache_n_stores_ok; +atomic_t fscache_n_stores_again; +atomic_t fscache_n_stores_nobufs; +atomic_t fscache_n_stores_oom; +atomic_t fscache_n_store_ops; +atomic_t fscache_n_store_calls; + +atomic_t fscache_n_marks; +atomic_t fscache_n_uncaches; + +atomic_t fscache_n_acquires; +atomic_t fscache_n_acquires_null; +atomic_t fscache_n_acquires_no_cache; +atomic_t fscache_n_acquires_ok; +atomic_t fscache_n_acquires_nobufs; +atomic_t fscache_n_acquires_oom; + +atomic_t fscache_n_updates; +atomic_t fscache_n_updates_null; +atomic_t fscache_n_updates_run; + +atomic_t fscache_n_relinquishes; +atomic_t fscache_n_relinquishes_null; +atomic_t fscache_n_relinquishes_waitcrt; + +atomic_t 
fscache_n_cookie_index; +atomic_t fscache_n_cookie_data; +atomic_t fscache_n_cookie_special; + +atomic_t fscache_n_object_alloc; +atomic_t fscache_n_object_no_alloc; +atomic_t fscache_n_object_lookups; +atomic_t fscache_n_object_lookups_negative; +atomic_t fscache_n_object_lookups_positive; +atomic_t fscache_n_object_created; +atomic_t fscache_n_object_avail; +atomic_t fscache_n_object_dead; + +atomic_t fscache_n_checkaux_none; +atomic_t fscache_n_checkaux_okay; +atomic_t fscache_n_checkaux_update; +atomic_t fscache_n_checkaux_obsolete; + +/* + * display the general statistics + */ +static int fscache_stats_show(struct seq_file *m, void *v) +{ + seq_puts(m, "FS-Cache statistics\n"); + + seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n", + atomic_read(&fscache_n_cookie_index), + atomic_read(&fscache_n_cookie_data), + atomic_read(&fscache_n_cookie_special)); + + seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n", + atomic_read(&fscache_n_object_alloc), + atomic_read(&fscache_n_object_no_alloc), + atomic_read(&fscache_n_object_avail), + atomic_read(&fscache_n_object_dead)); + seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n", + atomic_read(&fscache_n_checkaux_none), + atomic_read(&fscache_n_checkaux_okay), + atomic_read(&fscache_n_checkaux_update), + atomic_read(&fscache_n_checkaux_obsolete)); + + seq_printf(m, "Pages : mrk=%u unc=%u\n", + atomic_read(&fscache_n_marks), + atomic_read(&fscache_n_uncaches)); + + seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u" + " oom=%u\n", + atomic_read(&fscache_n_acquires), + atomic_read(&fscache_n_acquires_null), + atomic_read(&fscache_n_acquires_no_cache), + atomic_read(&fscache_n_acquires_ok), + atomic_read(&fscache_n_acquires_nobufs), + atomic_read(&fscache_n_acquires_oom)); + + seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n", + atomic_read(&fscache_n_object_lookups), + atomic_read(&fscache_n_object_lookups_negative), + atomic_read(&fscache_n_object_lookups_positive), + atomic_read(&fscache_n_object_created)); + + seq_printf(m, "Updates: n=%u nul=%u run=%u\n", + atomic_read(&fscache_n_updates), + atomic_read(&fscache_n_updates_null), + atomic_read(&fscache_n_updates_run)); + + seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n", + atomic_read(&fscache_n_relinquishes), + atomic_read(&fscache_n_relinquishes_null), + atomic_read(&fscache_n_relinquishes_waitcrt)); + + seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", + atomic_read(&fscache_n_attr_changed), + atomic_read(&fscache_n_attr_changed_ok), + atomic_read(&fscache_n_attr_changed_nobufs), + atomic_read(&fscache_n_attr_changed_nomem), + atomic_read(&fscache_n_attr_changed_calls)); + + seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n", + atomic_read(&fscache_n_allocs), + atomic_read(&fscache_n_allocs_ok), + atomic_read(&fscache_n_allocs_wait), + atomic_read(&fscache_n_allocs_nobufs)); + seq_printf(m, "Allocs : ops=%u owt=%u\n", + atomic_read(&fscache_n_alloc_ops), + atomic_read(&fscache_n_alloc_op_waits)); + + seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" + " int=%u oom=%u\n", + atomic_read(&fscache_n_retrievals), + atomic_read(&fscache_n_retrievals_ok), + atomic_read(&fscache_n_retrievals_wait), + atomic_read(&fscache_n_retrievals_nodata), + atomic_read(&fscache_n_retrievals_nobufs), + atomic_read(&fscache_n_retrievals_intr), + atomic_read(&fscache_n_retrievals_nomem)); + seq_printf(m, "Retrvls: ops=%u owt=%u\n", + atomic_read(&fscache_n_retrieval_ops), + atomic_read(&fscache_n_retrieval_op_waits)); + + seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", + 
atomic_read(&fscache_n_stores), + atomic_read(&fscache_n_stores_ok), + atomic_read(&fscache_n_stores_again), + atomic_read(&fscache_n_stores_nobufs), + atomic_read(&fscache_n_stores_oom)); + seq_printf(m, "Stores : ops=%u run=%u\n", + atomic_read(&fscache_n_store_ops), + atomic_read(&fscache_n_store_calls)); + + seq_printf(m, "Ops : pend=%u run=%u enq=%u\n", + atomic_read(&fscache_n_op_pend), + atomic_read(&fscache_n_op_run), + atomic_read(&fscache_n_op_enqueue)); + seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_deferred_release), + atomic_read(&fscache_n_op_release), + atomic_read(&fscache_n_op_gc)); + return 0; +} + +/* + * open "/proc/fs/fscache/stats" allowing provision of a statistical summary + */ +static int fscache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, fscache_stats_show, NULL); +} + +const struct file_operations fscache_stats_fops = { + .owner = THIS_MODULE, + .open = fscache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index b2a9a484c4cf..84d3532dd3ea 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -29,10 +29,6 @@ struct fscache_cache_ops; struct fscache_object; struct fscache_operation; -#ifdef CONFIG_FSCACHE_PROC -extern struct proc_dir_entry *proc_fscache; -#endif - /* * cache tag definition */ -- cgit v1.2.3-71-gd317 From 0e04d4cefcf4d8fbbdb2c50e93ad541582933fd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:37 +0100 Subject: FS-Cache: Add cache tag handling Implement two features of FS-Cache: (1) The ability to request and release cache tags - names by which a cache may be known to a netfs, and thus selected for use. (2) An internal function by which a cache is selected by consulting the netfs, if the netfs wishes to be consulted. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/fscache/Makefile | 1 + fs/fscache/cache.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/fscache/internal.h | 20 ++++++ include/linux/fscache.h | 9 ++- 4 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 fs/fscache/cache.c (limited to 'include/linux') diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index bc1f3b9d811a..556708bb9796 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -3,6 +3,7 @@ # fscache-y := \ + cache.o \ fsdef.o \ main.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c new file mode 100644 index 000000000000..1a28df36dd93 --- /dev/null +++ b/fs/fscache/cache.c @@ -0,0 +1,166 @@ +/* FS-Cache cache handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include +#include "internal.h" + +LIST_HEAD(fscache_cache_list); +DECLARE_RWSEM(fscache_addremove_sem); + +static LIST_HEAD(fscache_cache_tag_list); + +/* + * look up a cache tag + */ +struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name) +{ + struct fscache_cache_tag *tag, *xtag; + + /* firstly check for the existence of the tag under read lock */ + down_read(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_read(&fscache_addremove_sem); + return tag; + } + } + + up_read(&fscache_addremove_sem); + + /* the tag does not exist - create a candidate */ + xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL); + if (!xtag) + /* return a dummy tag if out of memory */ + return ERR_PTR(-ENOMEM); + + atomic_set(&xtag->usage, 1); + strcpy(xtag->name, name); + + /* write lock, search again and add if still not present */ + down_write(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_write(&fscache_addremove_sem); + kfree(xtag); + return tag; + } + } + + list_add_tail(&xtag->link, &fscache_cache_tag_list); + up_write(&fscache_addremove_sem); + return xtag; +} + +/* + * release a reference to a cache tag + */ +void __fscache_release_cache_tag(struct fscache_cache_tag *tag) +{ + if (tag != ERR_PTR(-ENOMEM)) { + down_write(&fscache_addremove_sem); + + if (atomic_dec_and_test(&tag->usage)) + list_del_init(&tag->link); + else + tag = NULL; + + up_write(&fscache_addremove_sem); + + kfree(tag); + } +} + +/* + * select a cache in which to store an object + * - the cache addremove semaphore must be at least read-locked by the caller + * - the object will never be an index + */ +struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *cookie) +{ + struct fscache_cache_tag *tag; + struct fscache_object *object; + struct fscache_cache *cache; + + _enter(""); + + if (list_empty(&fscache_cache_list)) { + _leave(" = NULL [no cache]"); + return NULL; + } + + /* we check the parent to determine the cache to use */ + spin_lock(&cookie->lock); + + /* the first in the parent's backing list should be the preferred + * cache */ + if (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + cache = object->cache; + if (object->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &cache->flags)) + cache = NULL; + + spin_unlock(&cookie->lock); + _leave(" = %p [parent]", cache); + return cache; + } + + /* the parent is unbacked */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* cookie not an index and is unbacked */ + spin_unlock(&cookie->lock); + _leave(" = NULL [cookie ub,ni]"); + return NULL; + } + + spin_unlock(&cookie->lock); + + if (!cookie->def->select_cache) + goto no_preference; + + /* ask the netfs for its preference */ + tag = cookie->def->select_cache(cookie->parent->netfs_data, + cookie->netfs_data); + if (!tag) + goto no_preference; + + if (tag == ERR_PTR(-ENOMEM)) { + _leave(" = NULL [nomem tag]"); + return NULL; + } + + if (!tag->cache) { + _leave(" = NULL [unbacked tag]"); + return NULL; + } + + if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) + return NULL; + + _leave(" = %p [specific]", tag->cache); + return tag->cache; + +no_preference: + /* netfs has no preference - just select first cache */ + 
cache = list_entry(fscache_cache_list.next, + struct fscache_cache, link); + _leave(" = %p [first]", cache); + return cache; +} diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 4113af8d1660..0a2069afa417 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -27,6 +27,15 @@ #define FSCACHE_MIN_THREADS 4 #define FSCACHE_MAX_THREADS 32 +/* + * fsc-cache.c + */ +extern struct list_head fscache_cache_list; +extern struct rw_semaphore fscache_addremove_sem; + +extern struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *); + /* * fsc-fsdef.c */ @@ -168,6 +177,17 @@ extern const struct file_operations fscache_stats_fops; #define fscache_stat(stat) do {} while (0) #endif +/* + * raise an event on an object + * - if the event is not masked for that object, then the object is + * queued for attention by the thread pool. + */ +static inline void fscache_raise_event(struct fscache_object *object, + unsigned event) +{ + BUG(); // TODO +} + /*****************************************************************************/ /* * debug tracing diff --git a/include/linux/fscache.h b/include/linux/fscache.h index feb3b0e0af4d..9584c094d69f 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -173,6 +173,8 @@ struct fscache_netfs { * - these are undefined symbols when FS-Cache is not configured and the * optimiser takes care of not using them */ +extern struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *); +extern void __fscache_release_cache_tag(struct fscache_cache_tag *); /** * fscache_register_netfs - Register a filesystem as desiring caching services @@ -218,7 +220,10 @@ void fscache_unregister_netfs(struct fscache_netfs *netfs) static inline struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) { - return NULL; + if (fscache_available()) + return __fscache_lookup_cache_tag(name); + else + return NULL; } /** @@ -233,6 +238,8 @@ struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) static inline void fscache_release_cache_tag(struct fscache_cache_tag *tag) { + if (fscache_available()) + __fscache_release_cache_tag(tag); } /** -- cgit v1.2.3-71-gd317 From 726dd7ff10c217dd74329c94643dc8ebea27334b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:38 +0100 Subject: FS-Cache: Add netfs registration Add functions to register and unregister a network filesystem or other client of the FS-Cache service. This allocates and releases the cookie representing the top-level index for a netfs, and makes it available to the netfs. If the FS-Cache facility is disabled, then the calls are optimised away at compile time. Note that whilst this patch may appear to work with FS-Cache enabled and a netfs attempting to use it, it will leak the cookie it allocates for the netfs as fscache_relinquish_cookie() is implemented in a later patch. This will cause the slab code to emit a warning when the module is removed. 
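
For illustration, a netfs might wire the registration up roughly as
follows.  This is only a sketch: the "examplefs" identifiers and the
module boilerplate are hypothetical, and only fscache_register_netfs(),
fscache_unregister_netfs() and the name/version/primary_index fields of
struct fscache_netfs come from this series:

	#include <linux/module.h>
	#include <linux/fscache.h>

	/* one registration record per netfs type; the name must be unique
	 * amongst registered netfs clients */
	static struct fscache_netfs examplefs_cache_netfs = {
		.name		= "examplefs",
		.version	= 0,
	};

	static int __init examplefs_init(void)
	{
		/* on success this also makes the netfs's primary index
		 * cookie available in examplefs_cache_netfs.primary_index;
		 * -ENOMEM and -EEXIST are the possible failures */
		return fscache_register_netfs(&examplefs_cache_netfs);
	}

	static void __exit examplefs_exit(void)
	{
		/* all other cookies must have been relinquished first */
		fscache_unregister_netfs(&examplefs_cache_netfs);
	}

	module_init(examplefs_init);
	module_exit(examplefs_exit);
	MODULE_LICENSE("GPL");
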
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/fscache/Makefile | 3 +- fs/fscache/netfs.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fscache.h | 9 ++++- 3 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 fs/fscache/netfs.c (limited to 'include/linux') diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index f88ac1764ce3..ecf6946eaeb3 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -6,7 +6,8 @@ fscache-y := \ cache.o \ cookie.o \ fsdef.o \ - main.o + main.o \ + netfs.o fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c new file mode 100644 index 000000000000..e028b8eb1c40 --- /dev/null +++ b/fs/fscache/netfs.c @@ -0,0 +1,103 @@ +/* FS-Cache netfs (client) registration + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include "internal.h" + +static LIST_HEAD(fscache_netfs_list); + +/* + * register a network filesystem for caching + */ +int __fscache_register_netfs(struct fscache_netfs *netfs) +{ + struct fscache_netfs *ptr; + int ret; + + _enter("{%s}", netfs->name); + + INIT_LIST_HEAD(&netfs->link); + + /* allocate a cookie for the primary index */ + netfs->primary_index = + kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); + + if (!netfs->primary_index) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* initialise the primary index cookie */ + atomic_set(&netfs->primary_index->usage, 1); + atomic_set(&netfs->primary_index->n_children, 0); + + netfs->primary_index->def = &fscache_fsdef_netfs_def; + netfs->primary_index->parent = &fscache_fsdef_index; + netfs->primary_index->netfs_data = netfs; + + atomic_inc(&netfs->primary_index->parent->usage); + atomic_inc(&netfs->primary_index->parent->n_children); + + spin_lock_init(&netfs->primary_index->lock); + INIT_HLIST_HEAD(&netfs->primary_index->backing_objects); + + /* check the netfs type is not already present */ + down_write(&fscache_addremove_sem); + + ret = -EEXIST; + list_for_each_entry(ptr, &fscache_netfs_list, link) { + if (strcmp(ptr->name, netfs->name) == 0) + goto already_registered; + } + + list_add(&netfs->link, &fscache_netfs_list); + ret = 0; + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", + netfs->name); + +already_registered: + up_write(&fscache_addremove_sem); + + if (ret < 0) { + netfs->primary_index->parent = NULL; + __fscache_cookie_put(netfs->primary_index); + netfs->primary_index = NULL; + } + + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(__fscache_register_netfs); + +/* + * unregister a network filesystem from the cache + * - all cookies must have been released first + */ +void __fscache_unregister_netfs(struct fscache_netfs *netfs) +{ + _enter("{%s.%u}", netfs->name, netfs->version); + + down_write(&fscache_addremove_sem); + + list_del(&netfs->link); + fscache_relinquish_cookie(netfs->primary_index, 0); + + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", + netfs->name); + + _leave(""); +} 
+EXPORT_SYMBOL(__fscache_unregister_netfs); diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 9584c094d69f..b195c2e1ef6a 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -173,6 +173,8 @@ struct fscache_netfs { * - these are undefined symbols when FS-Cache is not configured and the * optimiser takes care of not using them */ +extern int __fscache_register_netfs(struct fscache_netfs *); +extern void __fscache_unregister_netfs(struct fscache_netfs *); extern struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *); extern void __fscache_release_cache_tag(struct fscache_cache_tag *); @@ -188,7 +190,10 @@ extern void __fscache_release_cache_tag(struct fscache_cache_tag *); static inline int fscache_register_netfs(struct fscache_netfs *netfs) { - return 0; + if (fscache_available()) + return __fscache_register_netfs(netfs); + else + return 0; } /** @@ -205,6 +210,8 @@ int fscache_register_netfs(struct fscache_netfs *netfs) static inline void fscache_unregister_netfs(struct fscache_netfs *netfs) { + if (fscache_available()) + __fscache_unregister_netfs(netfs); } /** -- cgit v1.2.3-71-gd317 From ccc4fc3d11e91477036d1f82bfa2d442f6ce77f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:38 +0100 Subject: FS-Cache: Implement the cookie management part of the netfs API Implement the cookie management part of the FS-Cache netfs client API. The documentation and API header file were added in a previous patch. This patch implements the following three functions: (1) fscache_acquire_cookie(). Acquire a cookie to represent an object to the netfs. If the object in question is a non-index object, then that object and its parent indices will be created on disk at this point if they don't already exist. Index creation is deferred because an index may reside in multiple caches. (2) fscache_relinquish_cookie(). Retire or release a cookie previously acquired. At this point, the object on disk may be destroyed. (3) fscache_update_cookie(). Update the in-cache representation of a cookie. This is used to update the auxiliary data for coherency management purposes. With this patch it is possible to have a netfs instruct a cache backend to look up, validate and create metadata on disk and to destroy it again. The ability to actually store and retrieve data in the objects so created is added in later patches. Note that these functions will never return an error. _All_ errors are handled internally to FS-Cache. The worst that can happen is that fscache_acquire_cookie() may return a NULL pointer - which is considered a negative cookie pointer and can be passed back to any function that takes a cookie without harm. A negative cookie pointer merely suppresses caching at that level. The stub in linux/fscache.h will detect inline the negative cookie pointer and abort the operation as fast as possible. This means that the compiler doesn't have to set up for a call in that case. See the documentation in Documentation/filesystems/caching/netfs-api.txt for more information. 
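
For illustration, the three calls might be driven from a netfs roughly
as sketched below.  The "examplefs" names and the examplefs_inode
structure are hypothetical, and the cookie definition (with its
get_key/get_attr and other callbacks) is assumed to be supplied
elsewhere as described in netfs-api.txt; only fscache_acquire_cookie(),
fscache_update_cookie() and fscache_relinquish_cookie() are the
interfaces implemented by this patch:

	#include <linux/fscache.h>

	/* cookie definition assumed to be defined elsewhere, as per
	 * Documentation/filesystems/caching/netfs-api.txt */
	extern const struct fscache_cookie_def examplefs_inode_cookie_def;

	/* hypothetical per-inode netfs state */
	struct examplefs_inode {
		/* ... other netfs-private inode state ... */
		struct fscache_cookie *cookie;
	};

	static void examplefs_enable_caching(struct examplefs_inode *ei,
					     struct fscache_cookie *parent_index)
	{
		/* may return a NULL (negative) cookie, which simply turns
		 * caching off for this object and is safe to pass back into
		 * any of the other cookie functions */
		ei->cookie = fscache_acquire_cookie(parent_index,
						    &examplefs_inode_cookie_def,
						    ei);
	}

	static void examplefs_update_caching(struct examplefs_inode *ei)
	{
		/* pushes updated auxiliary data to the backing objects for
		 * coherency management */
		fscache_update_cookie(ei->cookie);
	}

	static void examplefs_disable_caching(struct examplefs_inode *ei,
					      int retire)
	{
		/* retire != 0 asks the cache to destroy the on-disk object
		 * rather than just releasing the in-memory representation */
		fscache_relinquish_cookie(ei->cookie, retire);
		ei->cookie = NULL;
	}
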
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/fscache/cookie.c | 444 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fscache.h | 16 +- 2 files changed, 459 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 47fd75b832e1..72fd18f6c71f 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -7,6 +7,9 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/netfs-api.txt for more information on + * the netfs API. */ #define FSCACHE_DEBUG_LEVEL COOKIE @@ -16,6 +19,14 @@ struct kmem_cache *fscache_cookie_jar; +static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); + +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie); +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie); +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object); + /* * initialise an cookie jar slab element prior to any use */ @@ -28,6 +39,439 @@ void fscache_cookie_init_once(void *_cookie) INIT_HLIST_HEAD(&cookie->backing_objects); } +/* + * request a cookie to represent an object (index, datafile, xattr, etc) + * - parent specifies the parent object + * - the top level index cookie for each netfs is stored in the fscache_netfs + * struct upon registration + * - def points to the definition + * - the netfs_data will be passed to the functions pointed to in *def + * - all attached caches will be searched to see if they contain this object + * - index objects aren't stored on disk until there's a dependent file that + * needs storing + * - other objects are stored in a selected cache immediately, and all the + * indices forming the path to it are instantiated if necessary + * - we never let on to the netfs about errors + * - we may set a negative cookie pointer, but that's okay + */ +struct fscache_cookie *__fscache_acquire_cookie( + struct fscache_cookie *parent, + const struct fscache_cookie_def *def, + void *netfs_data) +{ + struct fscache_cookie *cookie; + + BUG_ON(!def); + + _enter("{%s},{%s},%p", + parent ? 
(char *) parent->def->name : "", + def->name, netfs_data); + + fscache_stat(&fscache_n_acquires); + + /* if there's no parent cookie, then we don't create one here either */ + if (!parent) { + fscache_stat(&fscache_n_acquires_null); + _leave(" [no parent]"); + return NULL; + } + + /* validate the definition */ + BUG_ON(!def->get_key); + BUG_ON(!def->name[0]); + + BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && + parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + if (!cookie) { + fscache_stat(&fscache_n_acquires_oom); + _leave(" [ENOMEM]"); + return NULL; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + + atomic_inc(&parent->usage); + atomic_inc(&parent->n_children); + + cookie->def = def; + cookie->parent = parent; + cookie->netfs_data = netfs_data; + cookie->flags = 0; + + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS); + + switch (cookie->def->type) { + case FSCACHE_COOKIE_TYPE_INDEX: + fscache_stat(&fscache_n_cookie_index); + break; + case FSCACHE_COOKIE_TYPE_DATAFILE: + fscache_stat(&fscache_n_cookie_data); + break; + default: + fscache_stat(&fscache_n_cookie_special); + break; + } + + /* if the object is an index then we need do nothing more here - we + * create indices on disk when we need them as an index may exist in + * multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) < 0) { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } + + fscache_stat(&fscache_n_acquires_ok); + _leave(" = %p", cookie); + return cookie; +} +EXPORT_SYMBOL(__fscache_acquire_cookie); + +/* + * acquire a non-index cookie + * - this must make sure the index chain is instantiated and instantiate the + * object representation too + */ +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct fscache_cache *cache; + uint64_t i_size; + int ret; + + _enter(""); + + cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE; + + /* now we need to see whether the backing objects for this cookie yet + * exist, if not there'll be nothing to search */ + down_read(&fscache_addremove_sem); + + if (list_empty(&fscache_cache_list)) { + up_read(&fscache_addremove_sem); + _leave(" = 0 [no caches]"); + return 0; + } + + /* select a cache in which to store the object */ + cache = fscache_select_cache_for_object(cookie->parent); + if (!cache) { + up_read(&fscache_addremove_sem); + fscache_stat(&fscache_n_acquires_no_cache); + _leave(" = -ENOMEDIUM [no cache]"); + return -ENOMEDIUM; + } + + _debug("cache %s", cache->tag->name); + + cookie->flags = + (1 << FSCACHE_COOKIE_LOOKING_UP) | + (1 << FSCACHE_COOKIE_CREATING) | + (1 << FSCACHE_COOKIE_NO_DATA_YET); + + /* ask the cache to allocate objects for this cookie and its parent + * chain */ + ret = fscache_alloc_object(cache, cookie); + if (ret < 0) { + up_read(&fscache_addremove_sem); + _leave(" = %d", ret); + return ret; + } + + /* pass on how big the object we're caching is supposed to be */ + cookie->def->get_attr(cookie->netfs_data, &i_size); + + spin_lock(&cookie->lock); + if (hlist_empty(&cookie->backing_objects)) { + spin_unlock(&cookie->lock); + goto unavailable; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + fscache_set_store_limit(object, i_size); + + /* initiate the 
process of looking up all the objects in the chain + * (done by fscache_initialise_object()) */ + fscache_enqueue_object(object); + + spin_unlock(&cookie->lock); + + /* we may be required to wait for lookup to complete at this point */ + if (!fscache_defer_lookup) { + _debug("non-deferred lookup %p", &cookie->flags); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("complete"); + if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) + goto unavailable; + } + + up_read(&fscache_addremove_sem); + _leave(" = 0 [deferred]"); + return 0; + +unavailable: + up_read(&fscache_addremove_sem); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} + +/* + * recursively allocate cache object records for a cookie/cache combination + * - caller must be holding the addremove sem + */ +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_n; + int ret; + + _enter("%p,%p{%s}", cache, cookie, cookie->def->name); + + spin_lock(&cookie->lock); + hlist_for_each_entry(object, _n, &cookie->backing_objects, + cookie_link) { + if (object->cache == cache) + goto object_already_extant; + } + spin_unlock(&cookie->lock); + + /* ask the cache to allocate an object (we may end up with duplicate + * objects at this stage, but we sort that out later) */ + object = cache->ops->alloc_object(cache, cookie); + if (IS_ERR(object)) { + fscache_stat(&fscache_n_object_no_alloc); + ret = PTR_ERR(object); + goto error; + } + + fscache_stat(&fscache_n_object_alloc); + + object->debug_id = atomic_inc_return(&fscache_object_debug_id); + + _debug("ALLOC OBJ%x: %s {%lx}", + object->debug_id, cookie->def->name, object->events); + + ret = fscache_alloc_object(cache, cookie->parent); + if (ret < 0) + goto error_put; + + /* only attach if we managed to allocate all we needed, otherwise + * discard the object we just allocated and instead use the one + * attached to the cookie */ + if (fscache_attach_object(cookie, object) < 0) + cache->ops->put_object(object); + + _leave(" = 0"); + return 0; + +object_already_extant: + ret = -ENOBUFS; + if (object->state >= FSCACHE_OBJECT_DYING) { + spin_unlock(&cookie->lock); + goto error; + } + spin_unlock(&cookie->lock); + _leave(" = 0 [found]"); + return 0; + +error_put: + cache->ops->put_object(object); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * attach a cache object to a cookie + */ +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object) +{ + struct fscache_object *p; + struct fscache_cache *cache = object->cache; + struct hlist_node *_n; + int ret; + + _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + + spin_lock(&cookie->lock); + + /* there may be multiple initial creations of this object, but we only + * want one */ + ret = -EEXIST; + hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) + ret = -ENOBUFS; + goto cant_attach_object; + } + } + + /* pin the parent object */ + spin_lock_nested(&cookie->parent->lock, 1); + hlist_for_each_entry(p, _n, &cookie->parent->backing_objects, + cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) { + ret = -ENOBUFS; + spin_unlock(&cookie->parent->lock); + goto cant_attach_object; + } + object->parent = p; + spin_lock(&p->lock); + p->n_children++; + spin_unlock(&p->lock); + break; + } + } + 
spin_unlock(&cookie->parent->lock); + + /* attach to the cache's object list */ + if (list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + } + + /* attach to the cookie */ + object->cookie = cookie; + atomic_inc(&cookie->usage); + hlist_add_head(&object->cookie_link, &cookie->backing_objects); + ret = 0; + +cant_attach_object: + spin_unlock(&cookie->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * update the index entries backing a cookie + */ +void __fscache_update_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_p; + + fscache_stat(&fscache_n_updates); + + if (!cookie) { + fscache_stat(&fscache_n_updates_null); + _leave(" [no cookie]"); + return; + } + + _enter("{%s}", cookie->def->name); + + BUG_ON(!cookie->def->get_aux); + + spin_lock(&cookie->lock); + + /* update the index entry on disk in each cache backing this cookie */ + hlist_for_each_entry(object, _p, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } + + spin_unlock(&cookie->lock); + _leave(""); +} +EXPORT_SYMBOL(__fscache_update_cookie); + +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disk if retire is true + * - all dependents of this cookie must have already been unregistered + * (indices/files/pages) + */ +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +{ + struct fscache_cache *cache; + struct fscache_object *object; + unsigned long event; + + fscache_stat(&fscache_n_relinquishes); + + if (!cookie) { + fscache_stat(&fscache_n_relinquishes_null); + _leave(" [no cookie]"); + return; + } + + _enter("%p{%s,%p},%d", + cookie, cookie->def->name, cookie->netfs_data, retire); + + if (atomic_read(&cookie->n_children) != 0) { + printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", + cookie->def->name); + BUG(); + } + + /* wait for the cookie to finish being instantiated (or to fail) */ + if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { + fscache_stat(&fscache_n_relinquishes_waitcrt); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + + event = retire ? 
FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; + + /* detach pointers back to the netfs */ + spin_lock(&cookie->lock); + + cookie->netfs_data = NULL; + cookie->def = NULL; + + /* break links with all the active objects */ + while (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, + cookie_link); + + _debug("RELEASE OBJ%x", object->debug_id); + + /* detach each cache object from the object cookie */ + spin_lock(&object->lock); + hlist_del_init(&object->cookie_link); + + cache = object->cache; + object->cookie = NULL; + fscache_raise_event(object, event); + spin_unlock(&object->lock); + + if (atomic_dec_and_test(&cookie->usage)) + /* the cookie refcount shouldn't be reduced to 0 yet */ + BUG(); + } + + spin_unlock(&cookie->lock); + + if (cookie->parent) { + ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); + ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); + atomic_dec(&cookie->parent->n_children); + } + + /* finally dispose of the cookie */ + ASSERTCMP(atomic_read(&cookie->usage), >, 0); + fscache_cookie_put(cookie); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_relinquish_cookie); + /* * destroy a cookie */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index b195c2e1ef6a..245b48646efa 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -178,6 +178,13 @@ extern void __fscache_unregister_netfs(struct fscache_netfs *); extern struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *); extern void __fscache_release_cache_tag(struct fscache_cache_tag *); +extern struct fscache_cookie *__fscache_acquire_cookie( + struct fscache_cookie *, + const struct fscache_cookie_def *, + void *); +extern void __fscache_relinquish_cookie(struct fscache_cookie *, int); +extern void __fscache_update_cookie(struct fscache_cookie *); + /** * fscache_register_netfs - Register a filesystem as desiring caching services * @netfs: The description of the filesystem @@ -269,7 +276,10 @@ struct fscache_cookie *fscache_acquire_cookie( const struct fscache_cookie_def *def, void *netfs_data) { - return NULL; + if (fscache_cookie_valid(parent)) + return __fscache_acquire_cookie(parent, def, netfs_data); + else + return NULL; } /** @@ -287,6 +297,8 @@ struct fscache_cookie *fscache_acquire_cookie( static inline void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) { + if (fscache_cookie_valid(cookie)) + __fscache_relinquish_cookie(cookie, retire); } /** @@ -302,6 +314,8 @@ void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) static inline void fscache_update_cookie(struct fscache_cookie *cookie) { + if (fscache_cookie_valid(cookie)) + __fscache_update_cookie(cookie); } /** -- cgit v1.2.3-71-gd317 From b510882281d56873e1194021643b7c325336f84f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:39 +0100 Subject: FS-Cache: Implement data I/O part of netfs API Implement the data I/O part of the FS-Cache netfs API. The documentation and API header file were added in a previous patch. This patch implements the following functions for the netfs to call: (*) fscache_attr_changed(). Indicate that the object has changed its attributes. The only attribute currently recorded is the file size. Only pages within the set file size will be stored in the cache. This operation is submitted for asynchronous processing, and will return immediately. 
It will return -ENOMEM if an out of memory error is encountered, -ENOBUFS if the object is not actually cached, or 0 if the operation is successfully queued. (*) fscache_read_or_alloc_page(). (*) fscache_read_or_alloc_pages(). Request data be fetched from the disk, and allocate internal metadata to track the netfs pages and reserve disk space for unknown pages. These operations perform semi-asynchronous data reads. Upon returning they will indicate which pages they think can be retrieved from disk, and will have set in progress attempts to retrieve those pages. These will return, in order of preference, -ENOMEM on memory allocation error, -ERESTARTSYS if a signal interrupted proceedings, -ENODATA if one or more requested pages are not yet cached, -ENOBUFS if the object is not actually cached or if there isn't space for future pages to be cached on this object, or 0 if successful. In the case of the multipage function, the pages for which reads are set in progress will be removed from the list and the page count decreased appropriately. If any read operations should fail, the completion function will be given an error, and will also be passed contextual information to allow the netfs to fall back to querying the server for the absent pages. For each successful read, the page completion function will also be called. Any pages subsequently tracked by the cache will have PG_fscache set upon them on return. fscache_uncache_page() must be called for such pages. If supplied by the netfs, the mark_pages_cached() cookie op will be invoked for any pages now tracked. (*) fscache_alloc_page(). Allocate internal metadata to track a netfs page and reserve disk space. This will return -ENOMEM on memory allocation error, -ERESTARTSYS on signal, -ENOBUFS if the object isn't cached, or there isn't enough space in the cache, or 0 if successful. Any pages subsequently tracked by the cache will have PG_fscache set upon them on return. fscache_uncache_page() must be called for such pages. If supplied by the netfs, the mark_pages_cached() cookie op will be invoked for any pages now tracked. (*) fscache_write_page(). Request data be stored to disk. This may only be called on pages that have been read or alloc'd by the above three functions and have not yet been uncached. This will return -ENOMEM on memory allocation error, -ERESTARTSYS on signal, -ENOBUFS if the object isn't cached, or there isn't immediately enough space in the cache, or 0 if successful. On a successful return, this operation will have queued the page for asynchronous writing to the cache. The page will be returned with PG_fscache_write set until the write completes one way or another. The caller will not be notified if the write fails due to an I/O error. If that happens, the object will become unavailable and all pending writes will be aborted. Note that the cache may batch up page writes, and so it may take a while to get around to writing them out. The caller must assume that until PG_fscache_write is cleared the page is in use by the cache. Any changes made to the page may be reflected on disk. The page may even be under DMA. (*) fscache_uncache_page(). Indicate that the cache should stop tracking a page previously read or alloc'd from the cache. If the page was alloc'd only, but unwritten, it will not appear on disk.
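To illustrate the intended calling convention, here is a minimal sketch of a netfs read path built on this API. It is not part of the patch: my_readpage(), my_read_complete(), my_fetch_from_server() and struct my_inode_info are invented names, the server fallback is stubbed out, and error handling is reduced to the essentials.

    #include <linux/errno.h>
    #include <linux/fscache.h>
    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /* Hypothetical per-inode state for the example netfs */
    struct my_inode_info {
            struct fscache_cookie *fscache;
    };

    /* Stand-in for the netfs's normal server read; stubbed for the sketch */
    static int my_fetch_from_server(struct my_inode_info *info, struct page *page)
    {
            /* issue the read RPC to the server here, then unlock the page */
            unlock_page(page);
            return -EIO;
    }

    /* Completion callback run for each page the cache finishes reading;
     * on error the netfs falls back to the server */
    static void my_read_complete(struct page *page, void *context, int error)
    {
            struct my_inode_info *info = context;

            if (error) {
                    my_fetch_from_server(info, page);
            } else {
                    SetPageUptodate(page);
                    unlock_page(page);
            }
    }

    /* Hypothetical read path showing how the return codes are handled */
    static int my_readpage(struct my_inode_info *info, struct page *page)
    {
            int ret;

            ret = fscache_read_or_alloc_page(info->fscache, page,
                                             my_read_complete, info,
                                             GFP_KERNEL);
            switch (ret) {
            case 0:         /* read dispatched; my_read_complete() finishes it */
                    return 0;
            case -ENODATA:  /* block reserved, but no data in the cache yet */
            case -ENOBUFS:  /* no backing object or no space available */
            default:        /* -ENOMEM, -ERESTARTSYS, ... */
                    return my_fetch_from_server(info, page);
            }
    }

In the -ENODATA and -ENOBUFS cases the netfs reads from the server as usual; where a block was reserved (-ENODATA) it would then normally push the fresh data back into the cache with fscache_write_page() and, once finished with the page, call fscache_uncache_page().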
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/fscache/Makefile | 3 +- fs/fscache/internal.h | 21 ++ fs/fscache/page.c | 816 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fscache.h | 52 ++- 4 files changed, 886 insertions(+), 6 deletions(-) create mode 100644 fs/fscache/page.c (limited to 'include/linux') diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 6f82da2aa9d1..91571b95aacc 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -9,7 +9,8 @@ fscache-y := \ main.o \ netfs.o \ object.o \ - operation.o + operation.o \ + page.o fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 014a830c8b37..e0cbd16f6dc9 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -229,6 +229,27 @@ static inline void fscache_cookie_put(struct fscache_cookie *cookie) __fscache_cookie_put(cookie); } +/* + * get an extra reference to a netfs retrieval context + */ +static inline +void *fscache_get_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->get_context) + cookie->def->get_context(cookie->netfs_data, context); + return context; +} + +/* + * release a reference to a netfs retrieval context + */ +static inline +void fscache_put_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->put_context) + cookie->def->put_context(cookie->netfs_data, context); +} + /*****************************************************************************/ /* * debug tracing diff --git a/fs/fscache/page.c b/fs/fscache/page.c new file mode 100644 index 000000000000..2568e0eb644f --- /dev/null +++ b/fs/fscache/page.c @@ -0,0 +1,816 @@ +/* Cache page management and data I/O routines + * + * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL PAGE +#include +#include +#include +#include +#include "internal.h" + +/* + * check to see if a page is being written to the cache + */ +bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) +{ + void *val; + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + rcu_read_unlock(); + + return val != NULL; +} +EXPORT_SYMBOL(__fscache_check_page_write); + +/* + * wait for a page to finish being written to the cache + */ +void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + wait_event(*wq, !__fscache_check_page_write(cookie, page)); +} +EXPORT_SYMBOL(__fscache_wait_on_page_write); + +/* + * note that a page has finished being written to the cache + */ +static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page) +{ + struct page *xpage; + + spin_lock(&cookie->lock); + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->lock); + ASSERT(xpage != NULL); + + wake_up_bit(&cookie->flags, 0); +} + +/* + * actually apply the changed attributes to a cache object + */ +static void fscache_attr_changed_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + + _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); + + fscache_stat(&fscache_n_attr_changed_calls); + + if (fscache_object_is_active(object) && + object->cache->ops->attr_changed(object) < 0) + fscache_abort_object(object); + + _leave(""); +} + +/* + * notification that the attributes on an object have changed + */ +int __fscache_attr_changed(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + + _enter("%p", cookie); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + fscache_stat(&fscache_n_attr_changed); + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { + fscache_stat(&fscache_n_attr_changed_nomem); + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + fscache_operation_init(op, NULL); + fscache_operation_init_slow(op, fscache_attr_changed_op); + op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_exclusive_op(object, op) < 0) + goto nobufs; + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_attr_changed_ok); + fscache_put_operation(op); + _leave(" = 0"); + return 0; + +nobufs: + spin_unlock(&cookie->lock); + kfree(op); + fscache_stat(&fscache_n_attr_changed_nobufs); + _leave(" = %d", -ENOBUFS); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_attr_changed); + +/* + * handle secondary execution given to a retrieval op on behalf of the + * cache + */ +static void fscache_retrieval_work(struct work_struct *work) +{ + struct fscache_retrieval *op = + container_of(work, struct fscache_retrieval, op.fast_work); + unsigned long start; + + _enter("{OP%x}", op->op.debug_id); + + start = jiffies; + op->op.processor(&op->op); + fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(&op->op); +} + +/* + * release a retrieval op reference + */ +static void fscache_release_retrieval_op(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + _enter("{OP%x}", op->op.debug_id); + + fscache_hist(fscache_retrieval_histogram, op->start_time); 
+ if (op->context) + fscache_put_context(op->op.object->cookie, op->context); + + _leave(""); +} + +/* + * allocate a retrieval op + */ +static struct fscache_retrieval *fscache_alloc_retrieval( + struct address_space *mapping, + fscache_rw_complete_t end_io_func, + void *context) +{ + struct fscache_retrieval *op; + + /* allocate a retrieval operation and attempt to submit it */ + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) { + fscache_stat(&fscache_n_retrievals_nomem); + return NULL; + } + + fscache_operation_init(&op->op, fscache_release_retrieval_op); + op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); + op->mapping = mapping; + op->end_io_func = end_io_func; + op->context = context; + op->start_time = jiffies; + INIT_WORK(&op->op.fast_work, fscache_retrieval_work); + INIT_LIST_HEAD(&op->to_do); + return op; +} + +/* + * wait for a deferred lookup to complete + */ +static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) +{ + unsigned long jif; + + _enter(""); + + if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { + _leave(" = 0 [imm]"); + return 0; + } + + fscache_stat(&fscache_n_retrievals_wait); + + jif = jiffies; + if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) != 0) { + fscache_stat(&fscache_n_retrievals_intr); + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; + } + + ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); + + smp_rmb(); + fscache_hist(fscache_retrieval_delay_histogram, jif); + _leave(" = 0 [dly]"); + return 0; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * -ENODATA - no data available in the backing object for this block + * 0 - dispatched a read - it'll call end_io_func() when finished + */ +int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, end_io_func, context); + if (!op) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_retrieval_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, 
TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + ret = object->cache->ops->allocate_page(op, page, gfp); + if (ret == 0) + ret = -ENODATA; + } else { + ret = object->cache->ops->read_or_alloc_page(op, page, gfp); + } + + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_page); + +/* + * read a list of page from the cache or allocate a block in which to store + * them + * - we return: + * -ENOMEM - out of memory, some pages may be being read + * -ERESTARTSYS - interrupted, some pages may be being read + * -ENOBUFS - no backing object or space available in which to cache any + * pages not being read + * -ENODATA - no data available in the backing object for some or all of + * the pages + * 0 - dispatched a read on all pages + * + * end_io_func() will be called for each page read from the cache as it is + * finishes being read + * + * any pages for which a read is dispatched will be removed from pages and + * nr_pages + */ +int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + fscache_pages_retrieval_func_t func; + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,,%d,,,", cookie, *nr_pages); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(*nr_pages, >, 0); + ASSERT(!list_empty(pages)); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(mapping, end_io_func, context); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_retrieval_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) + func = object->cache->ops->allocate_pages; + else + func = object->cache->ops->read_or_alloc_pages; + ret = func(op, pages, nr_pages, gfp); + + if (ret == -ENOMEM) + 
fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_pages); + +/* + * allocate a block in the cache on which to store a page + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * 0 - block allocated + */ +int __fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_allocs); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, NULL, NULL); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_alloc_ops); + + if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { + _debug(">>> WT"); + fscache_stat(&fscache_n_alloc_op_waits); + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("<<< GO"); + } + + /* ask the cache to honour the operation */ + ret = object->cache->ops->allocate_page(op, page, gfp); + + if (ret < 0) + fscache_stat(&fscache_n_allocs_nobufs); + else + fscache_stat(&fscache_n_allocs_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_allocs_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_alloc_page); + +/* + * release a write op reference + */ +static void fscache_release_write_op(struct fscache_operation *_op) +{ + _enter("{OP%x}", _op->debug_id); +} + +/* + * perform the background storage of a page into the cache + */ +static void fscache_write_op(struct fscache_operation *_op) +{ + struct fscache_storage *op = + container_of(_op, struct fscache_storage, op); + struct fscache_object *object = op->op.object; + struct fscache_cookie *cookie = object->cookie; + struct page *page; + unsigned n; + void *results[1]; + int ret; + + _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); + + spin_lock(&cookie->lock); + spin_lock(&object->lock); + + if (!fscache_object_is_active(object)) { + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + _leave(""); + return; + } + + fscache_stat(&fscache_n_store_calls); + + /* find a page to store */ + page = NULL; + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, + FSCACHE_COOKIE_PENDING_TAG); + if (n != 1) + goto superseded; + page = results[0]; + _debug("gang %d [%lx]", n, page->index); + if (page->index > 
op->store_limit) + goto superseded; + + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + + if (page) { + ret = object->cache->ops->write_page(op, page); + fscache_end_page_write(cookie, page); + page_cache_release(page); + if (ret < 0) + fscache_abort_object(object); + else + fscache_enqueue_operation(&op->op); + } + + _leave(""); + return; + +superseded: + /* this writer is going away and there aren't any more things to + * write */ + _debug("cease"); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + _leave(""); +} + +/* + * request a page be stored in the cache + * - returns: + * -ENOMEM - out of memory, nothing done + * -ENOBUFS - no backing object available in which to cache the page + * 0 - dispatched a write - it'll call end_io_func() when finished + * + * if the cookie still has a backing object at this point, that object can be + * in one of a few states with respect to storage processing: + * + * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is + * set) + * + * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred + * fill op) + * + * (b) writes deferred till post-creation (mark page for writing and + * return immediately) + * + * (2) negative lookup, object created, initial fill being made from netfs + * (FSCACHE_COOKIE_INITIAL_FILL is set) + * + * (a) fill point not yet reached this page (mark page for writing and + * return) + * + * (b) fill point passed this page (queue op to store this page) + * + * (3) object extant (queue op to store this page) + * + * any other state is invalid + */ +int __fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_storage *op; + struct fscache_object *object; + int ret; + + _enter("%p,%x,", cookie, (u32) page->flags); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERT(PageFsCache(page)); + + fscache_stat(&fscache_n_stores); + + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) + goto nomem; + + fscache_operation_init(&op->op, fscache_release_write_op); + fscache_operation_init_slow(&op->op, fscache_write_op); + op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + + ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + if (ret < 0) + goto nomem_free; + + ret = -ENOBUFS; + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto nobufs; + + /* add the page to the pending-storage radix tree on the backing + * object */ + spin_lock(&object->lock); + + _debug("store limit %llx", (unsigned long long) object->store_limit); + + ret = radix_tree_insert(&cookie->stores, page->index, page); + if (ret < 0) { + if (ret == -EEXIST) + goto already_queued; + _debug("insert failed %d", ret); + goto nobufs_unlock_obj; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + page_cache_get(page); + + /* we only want one writer at a time, but we do need to queue new + * writers after exclusive ops */ + if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) + goto already_pending; + + spin_unlock(&object->lock); + + op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); + op->store_limit = object->store_limit; + + if (fscache_submit_op(object, &op->op) < 
0) + goto submit_failed; + + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + fscache_stat(&fscache_n_store_ops); + fscache_stat(&fscache_n_stores_ok); + + /* the slow work queue now carries its own ref on the object */ + fscache_put_operation(&op->op); + _leave(" = 0"); + return 0; + +already_queued: + fscache_stat(&fscache_n_stores_again); +already_pending: + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_ok); + _leave(" = 0"); + return 0; + +submit_failed: + radix_tree_delete(&cookie->stores, page->index); + page_cache_release(page); + ret = -ENOBUFS; + goto nobufs; + +nobufs_unlock_obj: + spin_unlock(&object->lock); +nobufs: + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; + +nomem_free: + kfree(op); +nomem: + fscache_stat(&fscache_n_stores_oom); + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(__fscache_write_page); + +/* + * remove a page from the cache + */ +void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) +{ + struct fscache_object *object; + + _enter(",%p", page); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + fscache_stat(&fscache_n_uncaches); + + /* cache withdrawal may beat us to it */ + if (!PageFsCache(page)) + goto done; + + /* get the object */ + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) { + ClearPageFsCache(page); + goto done_unlock; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + /* there might now be stuff on disk we could read */ + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + /* only invoke the cache backend if we managed to mark the page + * uncached here; this deals with synchronisation vs withdrawal */ + if (TestClearPageFsCache(page) && + object->cache->ops->uncache_page) { + /* the cache backend releases the cookie lock */ + object->cache->ops->uncache_page(object, page); + goto done; + } + +done_unlock: + spin_unlock(&cookie->lock); +done: + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_page); + +/** + * fscache_mark_pages_cached - Mark pages as being cached + * @op: The retrieval op pages are being marked for + * @pagevec: The pages to be marked + * + * Mark a bunch of netfs pages as being cached. After this is called, + * the netfs must call fscache_uncache_page() to remove the mark. 
+ */ +void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec) +{ + struct fscache_cookie *cookie = op->op.object->cookie; + unsigned long loop; + +#ifdef CONFIG_FSCACHE_STATS + atomic_add(pagevec->nr, &fscache_n_marks); +#endif + + for (loop = 0; loop < pagevec->nr; loop++) { + struct page *page = pagevec->pages[loop]; + + _debug("- mark %p{%lx}", page, page->index); + if (TestSetPageFsCache(page)) { + static bool once_only; + if (!once_only) { + once_only = true; + printk(KERN_WARNING "FS-Cache:" + " Cookie type %s marked page %lx" + " multiple times\n", + cookie->def->name, page->index); + } + } + } + + if (cookie->def->mark_pages_cached) + cookie->def->mark_pages_cached(cookie->netfs_data, + op->mapping, pagevec); + pagevec_reinit(pagevec); +} +EXPORT_SYMBOL(fscache_mark_pages_cached); diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 245b48646efa..6d8ee466e0a0 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -184,6 +184,24 @@ extern struct fscache_cookie *__fscache_acquire_cookie( void *); extern void __fscache_relinquish_cookie(struct fscache_cookie *, int); extern void __fscache_update_cookie(struct fscache_cookie *); +extern int __fscache_attr_changed(struct fscache_cookie *); +extern int __fscache_read_or_alloc_page(struct fscache_cookie *, + struct page *, + fscache_rw_complete_t, + void *, + gfp_t); +extern int __fscache_read_or_alloc_pages(struct fscache_cookie *, + struct address_space *, + struct list_head *, + unsigned *, + fscache_rw_complete_t, + void *, + gfp_t); +extern int __fscache_alloc_page(struct fscache_cookie *, struct page *, gfp_t); +extern int __fscache_write_page(struct fscache_cookie *, struct page *, gfp_t); +extern void __fscache_uncache_page(struct fscache_cookie *, struct page *); +extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *); +extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *); /** * fscache_register_netfs - Register a filesystem as desiring caching services @@ -361,7 +379,10 @@ void fscache_unpin_cookie(struct fscache_cookie *cookie) static inline int fscache_attr_changed(struct fscache_cookie *cookie) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_attr_changed(cookie); + else + return -ENOBUFS; } /** @@ -418,7 +439,11 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie, void *context, gfp_t gfp) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_read_or_alloc_page(cookie, page, end_io_func, + context, gfp); + else + return -ENOBUFS; } /** @@ -464,7 +489,12 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, void *context, gfp_t gfp) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_read_or_alloc_pages(cookie, mapping, pages, + nr_pages, end_io_func, + context, gfp); + else + return -ENOBUFS; } /** @@ -490,7 +520,10 @@ int fscache_alloc_page(struct fscache_cookie *cookie, struct page *page, gfp_t gfp) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_alloc_page(cookie, page, gfp); + else + return -ENOBUFS; } /** @@ -516,7 +549,10 @@ int fscache_write_page(struct fscache_cookie *cookie, struct page *page, gfp_t gfp) { - return -ENOBUFS; + if (fscache_cookie_valid(cookie)) + return __fscache_write_page(cookie, page, gfp); + else + return -ENOBUFS; } /** @@ -537,6 +573,8 @@ static inline void fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) { + if 
(fscache_cookie_valid(cookie)) + __fscache_uncache_page(cookie, page); } /** @@ -553,6 +591,8 @@ static inline bool fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) { + if (fscache_cookie_valid(cookie)) + return __fscache_check_page_write(cookie, page); return false; } @@ -571,6 +611,8 @@ static inline void fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) { + if (fscache_cookie_valid(cookie)) + __fscache_wait_on_page_write(cookie, page); } #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3-71-gd317 From 385e1ca5f21c4680ad6a46a3aa2ea8af99e99c92 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:39 +0100 Subject: CacheFiles: Permit the page lock state to be monitored Add a function to install a monitor on the page lock waitqueue for a particular page, thus allowing the page being unlocked to be detected. This is used by CacheFiles to detect read completion on a page in the backing filesystem so that it can then copy the data to the waiting netfs page. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Rik van Riel Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/pagemap.h | 5 +++++ mm/filemap.c | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 076a7dc67c2b..34da5230faab 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -383,6 +383,11 @@ static inline void wait_on_page_writeback(struct page *page) extern void end_page_writeback(struct page *page); +/* + * Add an arbitrary waiter to a page's wait queue + */ +extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter); + /* * Fault a userspace page into pagetables. Return non-zero on a fault. * diff --git a/mm/filemap.c b/mm/filemap.c index cbc5772e7171..fc11974f2bee 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -564,6 +564,24 @@ void wait_on_page_bit(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit); +/** + * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue + * @page - Page defining the wait queue of interest + * @waiter - Waiter to add to the queue + * + * Add an arbitrary @waiter to the wait queue for the nominated @page. + */ +void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +{ + wait_queue_head_t *q = page_waitqueue(page); + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, waiter); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(add_page_wait_queue); + /** * unlock_page - unlock a locked page * @page: the page -- cgit v1.2.3-71-gd317 From c6a6f19e22da0a3d74214ee010224c9a30a794c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:42 +0100 Subject: NFS: Add FS-Cache option bit and debug bit Add FS-Cache option bit to nfs_server struct. This is set to indicate local on-disk caching is enabled for a particular superblock. Also add debug bit for local caching operations. 
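As a rough sketch of how these two bits are meant to be consumed together (the helper below is illustrative only and not part of this patch; the real consumers are added by the following patches in this series):

    #include <linux/fs.h>
    #include <linux/nfs_fs.h>
    #include <linux/nfs_fs_sb.h>

    /* Illustrative only: report whether a mounted NFS superblock asked for
     * local on-disk caching, using the new option bit and debug facility */
    static void my_report_fscache_option(struct super_block *sb)
    {
            struct nfs_server *nfss = NFS_SB(sb);

            if (nfss->options & NFS_OPTION_FSCACHE)
                    dfprintk(FSCACHE, "NFS: local caching enabled (sb 0x%p)\n", sb);
            else
                    dfprintk(FSCACHE, "NFS: local caching disabled (sb 0x%p)\n", sb);
    }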
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/nfs_fs.h | 1 + include/linux/nfs_fs_sb.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index bde2557c2a9c..fd3e7f9c6fd3 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -583,6 +583,7 @@ extern void * nfs_root_data(void); #define NFSDBG_CALLBACK 0x0100 #define NFSDBG_CLIENT 0x0200 #define NFSDBG_MOUNT 0x0400 +#define NFSDBG_FSCACHE 0x0800 #define NFSDBG_ALL 0xFFFF #ifdef __KERNEL__ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 29b1e40dce99..a749f8564aa6 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -96,6 +96,8 @@ struct nfs_server { unsigned int acdirmin; unsigned int acdirmax; unsigned int namelen; + unsigned int options; /* extra options enabled by mount */ +#define NFS_OPTION_FSCACHE 0x00000001 /* - local caching enabled */ struct nfs_fsid fsid; __u64 maxfilesize; /* maximum file size */ -- cgit v1.2.3-71-gd317 From 147272813e043fb44bd112527951da70c1e663de Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:42 +0100 Subject: NFS: Define and create server-level objects Define and create server-level cache index objects (as managed by nfs_client structs). Each server object is created in the NFS top-level index object and is itself an index into which superblock-level objects are inserted. Ideally there would be one superblock-level object per server, and the former would be folded into the latter; however, since the "nosharecache" option exists this isn't possible. The server object key is a sequence consisting of: (1) NFS version (2) Server address family (eg: AF_INET or AF_INET6) (3) Server port. (4) Server IP address. The key blob is of variable length, depending on the length of (4). The server object is given no coherency data to carry in the auxiliary data permitted by the cache. 
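To make the variable-length key concrete, the length calculation implied by the description above can be sketched as follows. This is illustrative only: my_server_key_len() is a made-up helper, and struct nfs_server_key is the layout introduced by the patch below (a fixed header of version, family and port, followed by the address).

    #include <linux/in.h>
    #include <linux/in6.h>
    #include <linux/socket.h>
    #include <linux/types.h>

    /* Illustrative only: the key blob is a fixed header plus an address
     * whose size depends on the address family */
    static uint16_t my_server_key_len(sa_family_t family)
    {
            uint16_t len = sizeof(struct nfs_server_key);

            switch (family) {
            case AF_INET:
                    return len + sizeof(struct in_addr);    /* 4-byte IPv4 address */
            case AF_INET6:
                    return len + sizeof(struct in6_addr);   /* 16-byte IPv6 address */
            default:
                    return 0;       /* unknown family: refuse to generate a key */
            }
    }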
Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/Makefile | 2 +- fs/nfs/client.c | 5 ++++ fs/nfs/fscache-index.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/fscache.c | 52 +++++++++++++++++++++++++++++++++++++ fs/nfs/fscache.h | 10 ++++++++ include/linux/nfs_fs_sb.h | 4 +++ 6 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 fs/nfs/fscache.c (limited to 'include/linux') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 0e0bb6c17a86..845159814de2 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -15,4 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs-$(CONFIG_SYSCTL) += sysctl.o -nfs-$(CONFIG_NFS_FSCACHE) += fscache-index.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o diff --git a/fs/nfs/client.c b/fs/nfs/client.c index aba38017bdef..aa04da8748a6 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -45,6 +45,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ if (!IS_ERR(cred)) clp->cl_machine_cred = cred; + nfs_fscache_get_client_cookie(clp); + return clp; error_3: @@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp) nfs4_shutdown_client(clp); + nfs_fscache_release_client_cookie(clp); + /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 6d5bb5c69048..ff14b032459b 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -47,3 +47,68 @@ void nfs_fscache_unregister(void) { fscache_unregister_netfs(&nfs_fscache_netfs); } + +/* + * Layout of the key for an NFS server cache object. + */ +struct nfs_server_key { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + } addr[0]; +}; + +/* + * Generate a key to describe a server in the main NFS index + * - We return the length of the key, or 0 if we can't generate one + */ +static uint16_t nfs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_client *clp = cookie_netfs_data; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key *key = buffer; + uint16_t len = sizeof(struct nfs_server_key); + + key->nfsversion = clp->rpc_ops->version; + key->family = clp->cl_addr.ss_family; + + memset(key, 0, len); + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key->port = sin->sin_port; + key->addr[0].ipv4_addr = sin->sin_addr; + len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->port = sin6->sin6_port; + key->addr[0].ipv6_addr = sin6->sin6_addr; + len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + len = 0; + break; + } + + return len; +} + +/* + * Define the server object for FS-Cache. This is used to describe a server + * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and + * server address parameters. 
+ */ +const struct fscache_cookie_def nfs_fscache_server_index_def = { + .name = "NFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_server_get_key, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c new file mode 100644 index 000000000000..c3f056f89477 --- /dev/null +++ b/fs/nfs/fscache.c @@ -0,0 +1,52 @@ +/* NFS filesystem cache interface + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +/* + * Get the per-client index cookie for an NFS client if the appropriate mount + * flag was set + * - We always try and get an index cookie for the client, but get filehandle + * cookies on a per-superblock basis, depending on the mount flags + */ +void nfs_fscache_get_client_cookie(struct nfs_client *clp) +{ + /* create a cache index for looking up filehandles */ + clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, + &nfs_fscache_server_index_def, + clp); + dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", + clp, clp->fscache); +} + +/* + * Dispose of a per-client cookie + */ +void nfs_fscache_release_client_cookie(struct nfs_client *clp) +{ + dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", + clp, clp->fscache); + + fscache_relinquish_cookie(clp->fscache, 0); + clp->fscache = NULL; +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index ccfcdc58066e..1d864bedf154 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -23,13 +23,23 @@ * fscache-index.c */ extern struct fscache_netfs nfs_fscache_netfs; +extern const struct fscache_cookie_def nfs_fscache_server_index_def; extern int nfs_fscache_register(void); extern void nfs_fscache_unregister(void); +/* + * fscache.c + */ +extern void nfs_fscache_get_client_cookie(struct nfs_client *); +extern void nfs_fscache_release_client_cookie(struct nfs_client *); + #else /* CONFIG_NFS_FSCACHE */ static inline int nfs_fscache_register(void) { return 0; } static inline void nfs_fscache_unregister(void) {} +static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} + #endif /* CONFIG_NFS_FSCACHE */ #endif /* _NFS_FSCACHE_H */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index a749f8564aa6..0a374b9c5093 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -64,6 +64,10 @@ struct nfs_client { char cl_ipaddr[48]; unsigned char cl_id_uniquifier; #endif + +#ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; /* client index cache cookie */ +#endif }; /* -- cgit v1.2.3-71-gd317 From 08734048b380103f0412f58b84c2f76a2c8b599f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:42 +0100 Subject: NFS: Define and create superblock-level objects Define and create superblock-level cache index objects (as managed by nfs_server structs). Each superblock object is created in a server level index object and is itself an index into which inode-level objects are inserted. 
Ideally there would be one superblock-level object per server, and the former would be folded into the latter; however, since the "nosharecache" option exists this isn't possible. The superblock object key is a sequence consisting of: (1) Certain superblock s_flags. (2) Various connection parameters that serve to distinguish superblocks for sget(). (3) The volume FSID. (4) The security flavour. (5) The uniquifier length. (6) The uniquifier text. This is normally an empty string, unless the fsc=xyz mount option was used to explicitly specify a uniquifier. The key blob is of variable length, depending on the length of (6). The superblock object is given no coherency data to carry in the auxiliary data permitted by the cache. It is assumed that the superblock is always coherent. This patch also adds uniquification handling such that two otherwise identical superblocks, at least one of which is marked "nosharecache", won't end up trying to share the on-disk cache. It will be possible to manually provide a uniquifier through a mount option with a later patch to avoid the error otherwise produced. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/fscache-index.c | 34 ++++++++++++++ fs/nfs/fscache.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/fscache.h | 49 ++++++++++++++++++++ fs/nfs/internal.h | 3 ++ fs/nfs/super.c | 9 +++- include/linux/nfs_fs_sb.h | 5 ++ 6 files changed, 214 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index ff14b032459b..a824050be807 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -112,3 +112,37 @@ const struct fscache_cookie_def nfs_fscache_server_index_def = { .type = FSCACHE_COOKIE_TYPE_INDEX, .get_key = nfs_server_get_key, }; + +/* + * Generate a key to describe a superblock key in the main NFS index + */ +static uint16_t nfs_super_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_fscache_key *key; + const struct nfs_server *nfss = cookie_netfs_data; + uint16_t len; + + key = nfss->fscache_key; + len = sizeof(key->key) + key->key.uniq_len; + if (len > bufmax) { + len = 0; + } else { + memcpy(buffer, &key->key, sizeof(key->key)); + memcpy(buffer + sizeof(key->key), + key->key.uniquifier, key->key.uniq_len); + } + + return len; +} + +/* + * Define the superblock object for FS-Cache. This is used to describe a + * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS + * parameters that might cause a separate superblock. + */ +const struct fscache_cookie_def nfs_fscache_super_index_def = { + .name = "NFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_super_get_key, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index c3f056f89477..ab2de2c92b21 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -23,6 +23,9 @@ #define NFSDBG_FACILITY NFSDBG_FSCACHE +static struct rb_root nfs_fscache_keys = RB_ROOT; +static DEFINE_SPINLOCK(nfs_fscache_keys_lock); + /* * Get the per-client index cookie for an NFS client if the appropriate mount * flag was set @@ -50,3 +53,116 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp) fscache_relinquish_cookie(clp->fscache, 0); clp->fscache = NULL; } + +/* + * Get the cache cookie for an NFS superblock. We have to handle + * uniquification here because the cache doesn't do it for us. 
+ */ +void nfs_fscache_get_super_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ + struct nfs_fscache_key *key, *xkey; + struct nfs_server *nfss = NFS_SB(sb); + struct rb_node **p, *parent; + const char *uniq = data->fscache_uniq ?: ""; + int diff, ulen; + + ulen = strlen(uniq); + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + if (!key) + return; + + key->nfs_client = nfss->nfs_client; + key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; + key->key.nfs_server.flags = nfss->flags; + key->key.nfs_server.rsize = nfss->rsize; + key->key.nfs_server.wsize = nfss->wsize; + key->key.nfs_server.acregmin = nfss->acregmin; + key->key.nfs_server.acregmax = nfss->acregmax; + key->key.nfs_server.acdirmin = nfss->acdirmin; + key->key.nfs_server.acdirmax = nfss->acdirmax; + key->key.nfs_server.fsid = nfss->fsid; + key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; + + key->key.uniq_len = ulen; + memcpy(key->key.uniquifier, uniq, ulen); + + spin_lock(&nfs_fscache_keys_lock); + p = &nfs_fscache_keys.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xkey = rb_entry(parent, struct nfs_fscache_key, node); + + if (key->nfs_client < xkey->nfs_client) + goto go_left; + if (key->nfs_client > xkey->nfs_client) + goto go_right; + + diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + + if (key->key.uniq_len == 0) + goto non_unique; + diff = memcmp(key->key.uniquifier, + xkey->key.uniquifier, + key->key.uniq_len); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + goto non_unique; + + go_left: + p = &(*p)->rb_left; + continue; + go_right: + p = &(*p)->rb_right; + } + + rb_link_node(&key->node, parent, p); + rb_insert_color(&key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + nfss->fscache_key = key; + + /* create a cache index for looking up filehandles */ + nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, + &nfs_fscache_super_index_def, + nfss); + dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + return; + +non_unique: + spin_unlock(&nfs_fscache_keys_lock); + kfree(key); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + printk(KERN_WARNING "NFS:" + " Cache request denied due to non-unique superblock keys\n"); +} + +/* + * release a per-superblock cookie + */ +void nfs_fscache_release_super_cookie(struct super_block *sb) +{ + struct nfs_server *nfss = NFS_SB(sb); + + dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + + fscache_relinquish_cookie(nfss->fscache, 0); + nfss->fscache = NULL; + + if (nfss->fscache_key) { + spin_lock(&nfs_fscache_keys_lock); + rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + kfree(nfss->fscache_key); + nfss->fscache_key = NULL; + } +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 1d864bedf154..22b971e8b380 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -19,11 +19,49 @@ #ifdef CONFIG_NFS_FSCACHE +/* + * set of NFS FS-Cache objects that form a superblock key + */ +struct nfs_fscache_key { + struct rb_node node; + struct nfs_client *nfs_client; /* the server */ + + /* the elements of the unique key - as used by nfs_compare_super() and + * nfs_compare_mount_options() to distinguish superblocks */ + struct { + struct { + unsigned long s_flags; /* various flags + * (& NFS_MS_MASK) */ + } super; + + struct { + struct nfs_fsid fsid; + int flags; + unsigned int rsize; /* read size */ 
+ unsigned int wsize; /* write size */ + unsigned int acregmin; /* attr cache timeouts */ + unsigned int acregmax; + unsigned int acdirmin; + unsigned int acdirmax; + } nfs_server; + + struct { + rpc_authflavor_t au_flavor; + } rpc_auth; + + /* uniquifier - can be used if nfs_server.flags includes + * NFS_MOUNT_UNSHARED */ + u8 uniq_len; + char uniquifier[0]; + } key; +}; + /* * fscache-index.c */ extern struct fscache_netfs nfs_fscache_netfs; extern const struct fscache_cookie_def nfs_fscache_server_index_def; +extern const struct fscache_cookie_def nfs_fscache_super_index_def; extern int nfs_fscache_register(void); extern void nfs_fscache_unregister(void); @@ -34,6 +72,10 @@ extern void nfs_fscache_unregister(void); extern void nfs_fscache_get_client_cookie(struct nfs_client *); extern void nfs_fscache_release_client_cookie(struct nfs_client *); +extern void nfs_fscache_get_super_cookie(struct super_block *, + struct nfs_parsed_mount_data *); +extern void nfs_fscache_release_super_cookie(struct super_block *); + #else /* CONFIG_NFS_FSCACHE */ static inline int nfs_fscache_register(void) { return 0; } static inline void nfs_fscache_unregister(void) {} @@ -41,5 +83,12 @@ static inline void nfs_fscache_unregister(void) {} static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_get_super_cookie( + struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ +} +static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} + #endif /* CONFIG_NFS_FSCACHE */ #endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2041f68ff1cc..013070000c38 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -5,6 +5,8 @@ #include #include +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) + struct nfs_string; /* Maximum number of readahead requests @@ -41,6 +43,7 @@ struct nfs_parsed_mount_data { unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; char *client_address; + char *fscache_uniq; struct { struct sockaddr_storage address; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0942fcbbad3c..87f65ae07f32 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -60,6 +60,7 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -1870,8 +1871,6 @@ static void nfs_clone_super(struct super_block *sb, nfs_initialise_sb(sb); } -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) - static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) { const struct nfs_server *a = s->s_fs_info; @@ -2036,6 +2035,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs_fill_super(s, data); + nfs_fscache_get_super_cookie(s, data); } mntroot = nfs_get_root(s, mntfh); @@ -2056,6 +2056,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, out: kfree(data->nfs_server.hostname); kfree(data->mount_server.hostname); + kfree(data->fscache_uniq); security_free_mnt_opts(&data->lsm_opts); out_free_fh: kfree(mntfh); @@ -2083,6 +2084,7 @@ static void nfs_kill_super(struct super_block *s) bdi_unregister(&server->backing_dev_info); kill_anon_super(s); + nfs_fscache_release_super_cookie(s); nfs_free_server(server); } @@ -2390,6 +2392,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* 
initial superblock/root creation */ nfs4_fill_super(s); + nfs_fscache_get_super_cookie(s, data); } mntroot = nfs4_get_root(s, mntfh); @@ -2411,6 +2414,7 @@ out: kfree(data->client_address); kfree(data->nfs_server.export_path); kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); security_free_mnt_opts(&data->lsm_opts); out_free_fh: kfree(mntfh); @@ -2437,6 +2441,7 @@ static void nfs4_kill_super(struct super_block *sb) kill_anon_super(sb); nfs4_renewd_prepare_shutdown(server); + nfs_fscache_release_super_cookie(sb); nfs_free_server(server); } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 0a374b9c5093..6ad75948cbf7 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -108,6 +108,11 @@ struct nfs_server { unsigned long mount_time; /* when this fs was mounted */ dev_t s_dev; /* superblock dev numbers */ +#ifdef CONFIG_NFS_FSCACHE + struct nfs_fscache_key *fscache_key; /* unique key for superblock */ + struct fscache_cookie *fscache; /* superblock cookie */ +#endif + #ifdef CONFIG_NFS_V4 u32 attr_bitmask[2];/* V4 bitmask representing the set of attributes supported on this -- cgit v1.2.3-71-gd317 From ef79c097bbe9724e13937271b3457df560e00370 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:43 +0100 Subject: NFS: Use local disk inode cache Bind data storage objects in the local cache to NFS inodes. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/fscache.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/fscache.h | 13 ++++ fs/nfs/inode.c | 6 ++ include/linux/nfs_fs.h | 10 +++ 4 files changed, 191 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index ab2de2c92b21..e3816eb53fb8 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -166,3 +166,165 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) nfss->fscache_key = NULL; } } + +/* + * Initialise the per-inode cache cookie pointer for an NFS inode. + */ +void nfs_fscache_init_inode_cookie(struct inode *inode) +{ + NFS_I(inode)->fscache = NULL; + if (S_ISREG(inode->i_mode)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +} + +/* + * Get the per-inode cache cookie for an NFS inode. + */ +static void nfs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->fscache || !NFS_FSCACHE(inode)) + return; + + if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { + nfsi->fscache = fscache_acquire_cookie( + NFS_SB(sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", + sb, nfsi, nfsi->fscache); + } +} + +/* + * Release a per-inode cookie. + */ +void nfs_fscache_release_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 0); + nfsi->fscache = NULL; +} + +/* + * Retire a per-inode cookie, destroying the data attached to it. 
+ */ +void nfs_fscache_zap_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 1); + nfsi->fscache = NULL; +} + +/* + * Turn off the cache with regard to a per-inode cookie if opened for writing, + * invalidating all the pages in the page cache relating to the associated + * inode to clear the per-page caching. + */ +static void nfs_fscache_disable_inode_cookie(struct inode *inode) +{ + clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + + if (NFS_I(inode)->fscache) { + dfprintk(FSCACHE, + "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); + + /* Need to invalidate any mapped pages that were read in before + * turning off the cache. + */ + if (inode->i_mapping && inode->i_mapping->nrpages) + invalidate_inode_pages2(inode->i_mapping); + + nfs_fscache_zap_inode_cookie(inode); + } +} + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +static int nfs_fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} + +/* + * Lock against someone else trying to also acquire or relinquish a cookie + */ +static inline void nfs_fscache_inode_lock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) + wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, + nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); +} + +/* + * Unlock cookie management lock + */ +static inline void nfs_fscache_inode_unlock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); +} + +/* + * Decide if we should enable or disable local caching for this inode. + * - For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * - May be invoked multiple times in parallel by parallel nfs_open() functions. + */ +void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if (NFS_FSCACHE(inode)) { + nfs_fscache_inode_lock(inode); + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + nfs_fscache_disable_inode_cookie(inode); + else + nfs_fscache_enable_inode_cookie(inode); + nfs_fscache_inode_unlock(inode); + } +} + +/* + * Replace a per-inode cookie due to revalidation detecting a file having + * changed on the server. 
+ */ +void nfs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *nfss = NFS_SERVER(inode); + struct fscache_cookie *old = nfsi->fscache; + + nfs_fscache_inode_lock(inode); + if (nfsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(nfsi->fscache, 1); + + nfsi->fscache = fscache_acquire_cookie( + nfss->nfs_client->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, + "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", + nfss, nfsi, old, nfsi->fscache); + } + nfs_fscache_inode_unlock(inode); +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index d21b5906ccf4..8b4299a0ad61 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -77,6 +77,12 @@ extern void nfs_fscache_get_super_cookie(struct super_block *, struct nfs_parsed_mount_data *); extern void nfs_fscache_release_super_cookie(struct super_block *); +extern void nfs_fscache_init_inode_cookie(struct inode *); +extern void nfs_fscache_release_inode_cookie(struct inode *); +extern void nfs_fscache_zap_inode_cookie(struct inode *); +extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void nfs_fscache_reset_inode_cookie(struct inode *); + #else /* CONFIG_NFS_FSCACHE */ static inline int nfs_fscache_register(void) { return 0; } static inline void nfs_fscache_unregister(void) {} @@ -91,5 +97,12 @@ static inline void nfs_fscache_get_super_cookie( } static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} +static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} + #endif /* CONFIG_NFS_FSCACHE */ #endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index cd29f410e941..64f87194d390 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -122,6 +122,7 @@ void nfs_clear_inode(struct inode *inode) BUG_ON(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); + nfs_fscache_release_inode_cookie(inode); } /** @@ -356,6 +357,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; + nfs_fscache_init_inode_cookie(inode); + unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); @@ -687,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp) ctx->mode = filp->f_mode; nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); + nfs_fscache_set_inode_cookie(inode, filp); return 0; } @@ -787,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); spin_unlock(&inode->i_lock); nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); + nfs_fscache_reset_inode_cookie(inode); dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); return 0; @@ -1031,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); + return status; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fd3e7f9c6fd3..8a99e79d5ea5 
100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -185,6 +185,9 @@ struct nfs_inode { fmode_t delegation_state; struct rw_semaphore rwsem; #endif /* CONFIG_NFS_V4*/ +#ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; +#endif struct inode vfs_inode; }; @@ -207,6 +210,8 @@ struct nfs_inode { #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ +#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ +#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { @@ -260,6 +265,11 @@ static inline int NFS_STALE(const struct inode *inode) return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } +static inline int NFS_FSCACHE(const struct inode *inode) +{ + return test_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +} + static inline __u64 NFS_FILEID(const struct inode *inode) { return NFS_I(inode)->fileid; -- cgit v1.2.3-71-gd317 From 6a51091d0775cdc4a923f2172c61925ad416aa32 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:43 +0100 Subject: NFS: Add some new I/O counters for FS-Cache doing things for NFS Add some new NFS I/O counters for FS-Cache doing things for NFS. A new line is emitted into /proc/pid/mountstats if caching is enabled that looks like: fsc: Where is the number of pages read successfully from the cache, is the number of failed page reads against the cache, is the number of successful page writes to the cache, is the number of failed page writes to the cache, and is the number of NFS pages that have been disconnected from the cache. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/iostat.h | 18 ++++++++++++++++++ fs/nfs/super.c | 11 +++++++++++ include/linux/nfs_iostat.h | 12 ++++++++++++ 3 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index a36952810032..a2ab2529b5ca 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -16,6 +16,9 @@ struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; +#ifdef CONFIG_NFS_FSCACHE + unsigned long long fscache[__NFSIOS_FSCACHEMAX]; +#endif unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; @@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } +#ifdef CONFIG_NFS_FSCACHE +static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, + unsigned long addend) +{ + struct nfs_iostats *iostats; + int cpu; + + cpu = get_cpu(); + iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); + iostats->fscache[stat] += addend; + put_cpu_no_resched(); +} +#endif + static inline struct nfs_iostats *nfs_alloc_iostats(void) { return alloc_percpu(struct nfs_iostats); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 87f65ae07f32..b5fea776a0dc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -642,6 +642,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) totals.events[i] += stats->events[i]; for (i = 0; i < __NFSIOS_BYTESMAX; i++) totals.bytes[i] += stats->bytes[i]; +#ifdef CONFIG_NFS_FSCACHE + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + totals.fscache[i] += stats->fscache[i]; +#endif preempt_enable(); } @@ -652,6 +656,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount 
*mnt) seq_printf(m, "\n\tbytes:\t"); for (i = 0; i < __NFSIOS_BYTESMAX; i++) seq_printf(m, "%Lu ", totals.bytes[i]); +#ifdef CONFIG_NFS_FSCACHE + if (nfss->options & NFS_OPTION_FSCACHE) { + seq_printf(m, "\n\tfsc:\t"); + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + } +#endif seq_printf(m, "\n"); rpc_print_iostats(m, nfss->client); diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h index 1cb9a3fed2b3..68b10f5f8907 100644 --- a/include/linux/nfs_iostat.h +++ b/include/linux/nfs_iostat.h @@ -116,4 +116,16 @@ enum nfs_stat_eventcounters { __NFSIOS_COUNTSMAX, }; +/* + * NFS local caching servicing counters + */ +enum nfs_stat_fscachecounters { + NFSIOS_FSCACHE_PAGES_READ_OK, + NFSIOS_FSCACHE_PAGES_READ_FAIL, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, + NFSIOS_FSCACHE_PAGES_UNCACHED, + __NFSIOS_FSCACHEMAX, +}; + #endif /* _LINUX_NFS_IOSTAT */ -- cgit v1.2.3-71-gd317 From f42b293d6d5259043a8944b556eeab427c695d57 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:44 +0100 Subject: NFS: nfs_readpage_async() needs to be accessible as a fallback for local caching nfs_readpage_async() needs to be non-static so that it can be used as a fallback for the local on-disk caching should an EIO crop up when reading the cache. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- fs/nfs/read.c | 4 ++-- include/linux/nfs_fs.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f856004bb7fa..98b74009c9d7 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -111,8 +111,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) +int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, + struct page *page) { LIST_HEAD(one_request); struct nfs_page *new; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8a99e79d5ea5..fdffb413b192 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -516,6 +516,8 @@ extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); extern void nfs_readdata_release(void *data); +extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, + struct page *); /* * Allocate nfs_read_data structures -- cgit v1.2.3-71-gd317 From 3688e07f83d81941c4a8b20e29602c6d0c883539 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Wed, 1 Apr 2009 23:38:49 -0500 Subject: Fix highmem PPC build failure Commit f4112de6b679d84bd9b9681c7504be7bdfb7c7d5 ("mm: introduce debug_kmap_atomic") broke PPC builds with CONFIG_HIGHMEM=y: CC init/main.o In file included from include/linux/highmem.h:25, from include/linux/pagemap.h:11, from include/linux/mempolicy.h:63, from init/main.c:53: arch/powerpc/include/asm/highmem.h: In function 'kmap_atomic_prot': arch/powerpc/include/asm/highmem.h:98: error: implicit declaration of function 'debug_kmap_atomic' In file included from include/linux/pagemap.h:11, from include/linux/mempolicy.h:63, from init/main.c:53: include/linux/highmem.h: At top level: include/linux/highmem.h:196: warning: conflicting types for 'debug_kmap_atomic' include/linux/highmem.h:196: error: static declaration of 'debug_kmap_atomic' follows non-static 
declaration include/asm/highmem.h:98: error: previous implicit declaration of 'debug_kmap_atomic' was here make[1]: *** [init/main.o] Error 1 make: *** [init] Error 2 Signed-off-by: Kumar Gala Acked-by: Akinobu Mita Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 7ff5c55f9b55..1fcb7126a01f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -19,8 +19,21 @@ static inline void flush_kernel_dcache_page(struct page *page) } #endif -#ifdef CONFIG_HIGHMEM +#include + +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type); + +#else +static inline void debug_kmap_atomic(enum km_type type) +{ +} + +#endif + +#ifdef CONFIG_HIGHMEM #include /* declarations for linux/mm/highmem.c */ @@ -44,8 +57,6 @@ static inline void *kmap(struct page *page) #define kunmap(page) do { (void) (page); } while (0) -#include - static inline void *kmap_atomic(struct page *page, enum km_type idx) { pagefault_disable(); @@ -187,16 +198,4 @@ static inline void copy_highpage(struct page *to, struct page *from) kunmap_atomic(vto, KM_USER1); } -#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) - -void debug_kmap_atomic(enum km_type type); - -#else - -static inline void debug_kmap_atomic(enum km_type type) -{ -} - -#endif - #endif /* _LINUX_HIGHMEM_H */ -- cgit v1.2.3-71-gd317 From ce0d9d7255a55628fd3732bf583c83e90150b699 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Wed, 14 Jan 2009 02:05:27 +0300 Subject: Staging: dst: core files. This patch contains DST core files, which introduce block layer, connector and sysfs registration glue and main headers. Connector is used for the configuration of the node (its type, address, device name and so on). Sysfs provides bits of information about running devices in the following format: +/* + * DST sysfs tree for device called 'storage': + * + * /sys/bus/dst/devices/storage/ + * /sys/bus/dst/devices/storage/type : 192.168.4.80:1025 + * /sys/bus/dst/devices/storage/size : 800 + * /sys/bus/dst/devices/storage/name : storage + */ DST header contains structure definitions and protocol command description. Signed-off-by: Evgeniy Polyakov Signed-off-by: Greg Kroah-Hartman --- drivers/staging/dst/dcore.c | 972 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/connector.h | 4 +- include/linux/dst.h | 587 ++++++++++++++++++++++++++ 3 files changed, 1562 insertions(+), 1 deletion(-) create mode 100644 drivers/staging/dst/dcore.c create mode 100644 include/linux/dst.h (limited to 'include/linux') diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c new file mode 100644 index 000000000000..c6e3cd1a5051 --- /dev/null +++ b/drivers/staging/dst/dcore.c @@ -0,0 +1,972 @@ +/* + * 2007+ Copyright (c) Evgeniy Polyakov + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +static int dst_major; + +static DEFINE_MUTEX(dst_hash_lock); +static struct list_head *dst_hashtable; +static unsigned int dst_hashtable_size = 128; +module_param(dst_hashtable_size, uint, 0644); + +static char dst_name[] = "Dementianting goldfish"; + +static DEFINE_IDR(dst_index_idr); +static struct cb_id cn_dst_id = { CN_DST_IDX, CN_DST_VAL }; + +/* + * DST sysfs tree for device called 'storage': + * + * /sys/bus/dst/devices/storage/ + * /sys/bus/dst/devices/storage/type : 192.168.4.80:1025 + * /sys/bus/dst/devices/storage/size : 800 + * /sys/bus/dst/devices/storage/name : storage + */ + +static int dst_dev_match(struct device *dev, struct device_driver *drv) +{ + return 1; +} + +static struct bus_type dst_dev_bus_type = { + .name = "dst", + .match = &dst_dev_match, +}; + +static void dst_node_release(struct device *dev) +{ + struct dst_info *info = container_of(dev, struct dst_info, device); + + kfree(info); +} + +static struct device dst_node_dev = { + .bus = &dst_dev_bus_type, + .release = &dst_node_release +}; + +/* + * Setting size of the node after it was changed. + */ +static void dst_node_set_size(struct dst_node *n) +{ + struct block_device *bdev; + + set_capacity(n->disk, n->size >> 9); + + bdev = bdget_disk(n->disk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, n->size); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } +} + +/* + * Distributed storage request processing function. + */ +static int dst_request(struct request_queue *q, struct bio *bio) +{ + struct dst_node *n = q->queuedata; + + bio_get(bio); + + return dst_process_bio(n, bio); +} + +/* + * Open/close callbacks for appropriate block device. + */ +static int dst_bdev_open(struct block_device *bdev, fmode_t mode) +{ + struct dst_node *n = bdev->bd_disk->private_data; + + dst_node_get(n); + return 0; +} + +static int dst_bdev_release(struct gendisk *disk, fmode_t mode) +{ + struct dst_node *n = disk->private_data; + + dst_node_put(n); + return 0; +} + +static struct block_device_operations dst_blk_ops = { + .open = dst_bdev_open, + .release = dst_bdev_release, + .owner = THIS_MODULE, +}; + +/* + * Block layer binding - disk is created when array is fully configured + * by userspace request. 
+ */ +static int dst_node_create_disk(struct dst_node *n) +{ + int err = -ENOMEM; + u32 index = 0; + + n->queue = blk_init_queue(NULL, NULL); + if (!n->queue) + goto err_out_exit; + + n->queue->queuedata = n; + blk_queue_make_request(n->queue, dst_request); + blk_queue_max_phys_segments(n->queue, n->max_pages); + blk_queue_max_hw_segments(n->queue, n->max_pages); + + err = -ENOMEM; + n->disk = alloc_disk(1); + if (!n->disk) + goto err_out_free_queue; + + if (!(n->state->permissions & DST_PERM_WRITE)) { + printk(KERN_INFO "DST node %s attached read-only.\n", n->name); + set_disk_ro(n->disk, 1); + } + + if (!idr_pre_get(&dst_index_idr, GFP_KERNEL)) + goto err_out_put; + + mutex_lock(&dst_hash_lock); + err = idr_get_new(&dst_index_idr, NULL, &index); + mutex_unlock(&dst_hash_lock); + if (err) + goto err_out_put; + + n->disk->major = dst_major; + n->disk->first_minor = index; + n->disk->fops = &dst_blk_ops; + n->disk->queue = n->queue; + n->disk->private_data = n; + snprintf(n->disk->disk_name, sizeof(n->disk->disk_name), "dst-%s", n->name); + + return 0; + +err_out_put: + put_disk(n->disk); +err_out_free_queue: + blk_cleanup_queue(n->queue); +err_out_exit: + return err; +} + +/* + * Sysfs machinery: show device's size. + */ +static ssize_t dst_show_size(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dst_info *info = container_of(dev, struct dst_info, device); + + return sprintf(buf, "%llu\n", info->size); +} + +/* + * Show local exported device. + */ +static ssize_t dst_show_local(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dst_info *info = container_of(dev, struct dst_info, device); + + return sprintf(buf, "%s\n", info->local); +} + +/* + * Shows type of the remote node - device major/minor number + * for local nodes and address (af_inet ipv4/ipv6 only) for remote nodes. + */ +static ssize_t dst_show_type(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dst_info *info = container_of(dev, struct dst_info, device); + int family = info->net.addr.sa_family; + + if (family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)&info->net.addr; + return sprintf(buf, "%u.%u.%u.%u:%d\n", + NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port)); + } else if (family == AF_INET6) { + struct sockaddr_in6 *sin = (struct sockaddr_in6 *)&info->net.addr; + return sprintf(buf, + "%pi6:%d\n", + &sin->sin6_addr, ntohs(sin->sin6_port)); + } else { + int i, sz = PAGE_SIZE - 2; /* 0 symbol and '\n' below */ + int size, addrlen = info->net.addr.sa_data_len; + unsigned char *a = (unsigned char *)&info->net.addr.sa_data; + char *buf_orig = buf; + + size = snprintf(buf, sz, "family: %d, addrlen: %u, addr: ", + family, addrlen); + sz -= size; + buf += size; + + for (i=0; iinfo->device, + &dst_node_attrs[i]); + if (err) + goto err_out_remove_all; + } + return 0; + +err_out_remove_all: + while (--i >= 0) + device_remove_file(&n->info->device, + &dst_node_attrs[i]); + + return err; +} + +static void dst_remove_node_attributes(struct dst_node *n) +{ + int i; + + for (i=0; iinfo->device, + &dst_node_attrs[i]); +} + +/* + * Sysfs cleanup and initialization. + * Shows number of useful parameters. 
+ */ +static void dst_node_sysfs_exit(struct dst_node *n) +{ + if (n->info) { + dst_remove_node_attributes(n); + device_unregister(&n->info->device); + n->info = NULL; + } +} + +static int dst_node_sysfs_init(struct dst_node *n) +{ + int err; + + n->info = kzalloc(sizeof(struct dst_info), GFP_KERNEL); + if (!n->info) + return -ENOMEM; + + memcpy(&n->info->device, &dst_node_dev, sizeof(struct device)); + n->info->size = n->size; + + snprintf(n->info->device.bus_id, sizeof(n->info->device.bus_id), "dst-%s", n->name); + err = device_register(&n->info->device); + if (err) { + dprintk(KERN_ERR "Failed to register node '%s', err: %d.\n", + n->name, err); + goto err_out_exit; + } + + dst_create_node_attributes(n); + + return 0; + +err_out_exit: + kfree(n->info); + n->info = NULL; + return err; +} + +/* + * DST node hash tables machinery. + */ +static inline unsigned int dst_hash(char *str, unsigned int size) +{ + return (jhash(str, size, 0) % dst_hashtable_size); +} + +static void dst_node_remove(struct dst_node *n) +{ + mutex_lock(&dst_hash_lock); + list_del_init(&n->node_entry); + mutex_unlock(&dst_hash_lock); +} + +static void dst_node_add(struct dst_node *n) +{ + unsigned hash = dst_hash(n->name, sizeof(n->name)); + + mutex_lock(&dst_hash_lock); + list_add_tail(&n->node_entry, &dst_hashtable[hash]); + mutex_unlock(&dst_hash_lock); +} + +/* + * Cleaning node when it is about to be freed. + * There are still users of the socket though, + * so connection cleanup should be protected. + */ +static void dst_node_cleanup(struct dst_node *n) +{ + struct dst_state *st = n->state; + + if (!st) + return; + + if (n->queue) { + blk_cleanup_queue(n->queue); + + mutex_lock(&dst_hash_lock); + idr_remove(&dst_index_idr, n->disk->first_minor); + mutex_unlock(&dst_hash_lock); + + put_disk(n->disk); + } + + if (n->bdev) { + sync_blockdev(n->bdev); + blkdev_put(n->bdev, FMODE_READ|FMODE_WRITE); + } + + dst_state_lock(st); + st->need_exit = 1; + dst_state_exit_connected(st); + dst_state_unlock(st); + + wake_up(&st->thread_wait); + + dst_state_put(st); + n->state = NULL; +} + +/* + * Free security attributes attached to given node. + */ +static void dst_security_exit(struct dst_node *n) +{ + struct dst_secure *s, *tmp; + + list_for_each_entry_safe(s, tmp, &n->security_list, sec_entry) { + list_del(&s->sec_entry); + kfree(s); + } +} + +/* + * Free node when there are no more users. + * Actually node has to be freed on behalf od userspace process, + * since there are number of threads, which are embedded in the + * node, so they can not exit and free node from there, that is + * why there is a wakeup if reference counter is not equal to zero. + */ +void dst_node_put(struct dst_node *n) +{ + if (unlikely(!n)) + return; + + dprintk("%s: n: %p, refcnt: %d.\n", + __func__, n, atomic_read(&n->refcnt)); + + if (atomic_dec_and_test(&n->refcnt)) { + dst_node_remove(n); + n->trans_scan_timeout = 0; + dst_node_cleanup(n); + thread_pool_destroy(n->pool); + dst_node_sysfs_exit(n); + dst_node_crypto_exit(n); + dst_security_exit(n); + dst_node_trans_exit(n); + + kfree(n); + + dprintk("%s: freed n: %p.\n", __func__, n); + } else { + wake_up(&n->wait); + } +} + +/* + * This function finds devices major/minor numbers for given pathname. 
+ */ +static int dst_lookup_device(const char *path, dev_t *dev) +{ + int err; + struct nameidata nd; + struct inode *inode; + + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + return err; + + inode = nd.path.dentry->d_inode; + if (!inode) { + err = -ENOENT; + goto out; + } + + if (!S_ISBLK(inode->i_mode)) { + err = -ENOTBLK; + goto out; + } + + *dev = inode->i_rdev; + +out: + path_put(&nd.path); + return err; +} + +/* + * Setting up export device: lookup by the name, get its size + * and setup listening socket, which will accept clients, which + * will submit IO for given storage. + */ +static int dst_setup_export(struct dst_node *n, struct dst_ctl *ctl, + struct dst_export_ctl *le) +{ + int err; + dev_t dev = 0; /* gcc likes to scream here */ + + snprintf(n->info->local, sizeof(n->info->local), "%s", le->device); + + err = dst_lookup_device(le->device, &dev); + if (err) + return err; + + n->bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + if (!n->bdev) + return -ENODEV; + + if (n->size != 0) + n->size = min_t(loff_t, n->bdev->bd_inode->i_size, n->size); + else + n->size = n->bdev->bd_inode->i_size; + + n->info->size = n->size; + err = dst_node_init_listened(n, le); + if (err) + goto err_out_cleanup; + + return 0; + +err_out_cleanup: + blkdev_put(n->bdev, FMODE_READ|FMODE_WRITE); + n->bdev = NULL; + + return err; +} + +/* Empty thread pool callbacks for the network processing threads. */ +static inline void *dst_thread_network_init(void *data) +{ + dprintk("%s: data: %p.\n", __func__, data); + return data; +} + +static inline void dst_thread_network_cleanup(void *data) +{ + dprintk("%s: data: %p.\n", __func__, data); +} + +/* + * Allocate DST node and initialize some of its parameters. + */ +static struct dst_node *dst_alloc_node(struct dst_ctl *ctl, + int (*start)(struct dst_node *), + int num) +{ + struct dst_node *n; + int err; + + n = kzalloc(sizeof(struct dst_node), GFP_KERNEL); + if (!n) + return NULL; + + INIT_LIST_HEAD(&n->node_entry); + + INIT_LIST_HEAD(&n->security_list); + mutex_init(&n->security_lock); + + init_waitqueue_head(&n->wait); + + n->trans_scan_timeout = msecs_to_jiffies(ctl->trans_scan_timeout); + if (!n->trans_scan_timeout) + n->trans_scan_timeout = HZ; + + n->trans_max_retries = ctl->trans_max_retries; + if (!n->trans_max_retries) + n->trans_max_retries = 10; + + /* + * Pretty much arbitrary default numbers. + * 32 matches maximum number of pages in bio originated from ext3 (31). + */ + n->max_pages = ctl->max_pages; + if (!n->max_pages) + n->max_pages = 32; + + if (n->max_pages > 1024) + n->max_pages = 1024; + + n->start = start; + n->size = ctl->size; + + atomic_set(&n->refcnt, 1); + atomic_long_set(&n->gen, 0); + snprintf(n->name, sizeof(n->name), "%s", ctl->name); + + err = dst_node_sysfs_init(n); + if (err) + goto err_out_free; + + n->pool = thread_pool_create(num, n->name, dst_thread_network_init, + dst_thread_network_cleanup, n); + if (IS_ERR(n->pool)) { + err = PTR_ERR(n->pool); + goto err_out_sysfs_exit; + } + + dprintk("%s: n: %p, name: %s.\n", __func__, n, n->name); + + return n; + +err_out_sysfs_exit: + dst_node_sysfs_exit(n); +err_out_free: + kfree(n); + return NULL; +} + +/* + * Starting a node, connected to the remote server: + * register block device and initialize transaction mechanism. + * In revers order though. + * + * It will autonegotiate some parameters with the remote node + * and update local if needed. 
+ * + * Transaction initialization should be the last thing before + * starting the node, since transaction should include not only + * block IO, but also crypto related data (if any), which are + * initialized separately. + */ +static int dst_start_remote(struct dst_node *n) +{ + int err; + + err = dst_node_trans_init(n, sizeof(struct dst_trans)); + if (err) + return err; + + err = dst_node_create_disk(n); + if (err) + return err; + + dst_node_set_size(n); + add_disk(n->disk); + + dprintk("DST: started remote node '%s', minor: %d.\n", n->name, n->disk->first_minor); + + return 0; +} + +/* + * Adding remote node and initialize connection. + */ +static int dst_add_remote(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + int err; + struct dst_network_ctl *rctl = data; + + if (n) + return -EEXIST; + + if (size != sizeof(struct dst_network_ctl)) + return -EINVAL; + + n = dst_alloc_node(ctl, dst_start_remote, 1); + if (!n) + return -ENOMEM; + + memcpy(&n->info->net, rctl, sizeof(struct dst_network_ctl)); + err = dst_node_init_connected(n, rctl); + if (err) + goto err_out_free; + + dst_node_add(n); + + return 0; + +err_out_free: + dst_node_put(n); + return err; +} + +/* + * Adding export node: initializing block device and listening socket. + */ +static int dst_add_export(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + int err; + struct dst_export_ctl *le = data; + + if (n) + return -EEXIST; + + if (size != sizeof(struct dst_export_ctl)) + return -EINVAL; + + n = dst_alloc_node(ctl, dst_start_export, 2); + if (!n) + return -EINVAL; + + err = dst_setup_export(n, ctl, le); + if (err) + goto err_out_free; + + dst_node_add(n); + + return 0; + +err_out_free: + dst_node_put(n); + return err; +} + +static int dst_node_remove_unload(struct dst_node *n) +{ + printk(KERN_INFO "STOPPED name: '%s', size: %llu.\n", + n->name, n->size); + + if (n->disk) + del_gendisk(n->disk); + + dst_node_remove(n); + dst_node_sysfs_exit(n); + + /* + * This is not a hack. Really. + * Node's reference counter allows to implement fine grained + * node freeing, but since all transactions (which hold node's + * reference counter) are processed in the dedicated thread, + * it is possible that reference will hit zero in that thread, + * so we will not be able to exit thread and cleanup the node. + * + * So, we remove disk, so no new activity is possible, and + * wait until all pending transaction are completed (either + * in receiving thread or by timeout in workqueue), in this + * case reference counter will be less or equal to 2 (once set in + * dst_alloc_node() and then in connector message parser; + * or when we force module unloading, and connector message + * parser does not hold a reference, in this case reference + * counter will be equal to 1), + * and subsequent dst_node_put() calls will free the node. + */ + dprintk("%s: going to sleep with %d refcnt.\n", __func__, atomic_read(&n->refcnt)); + wait_event(n->wait, atomic_read(&n->refcnt) <= 2); + + dst_node_put(n); + return 0; +} + +/* + * Remove node from the hash table. + */ +static int dst_del_node(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + if (!n) + return -ENODEV; + + return dst_node_remove_unload(n); +} + +/* + * Initialize crypto processing for given node. 
+ */ +static int dst_crypto_init(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + struct dst_crypto_ctl *crypto = data; + + if (!n) + return -ENODEV; + + if (size != sizeof(struct dst_crypto_ctl) + crypto->hash_keysize + + crypto->cipher_keysize) + return -EINVAL; + + if (n->trans_cache) + return -EEXIST; + + return dst_node_crypto_init(n, crypto); +} + +/* + * Security attributes for given node. + */ +static int dst_security_init(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + struct dst_secure *s; + + if (!n) + return -ENODEV; + + if (size != sizeof(struct dst_secure_user)) + return -EINVAL; + + s = kmalloc(sizeof(struct dst_secure), GFP_KERNEL); + if (!s) + return -ENOMEM; + + memcpy(&s->sec, data, size); + + mutex_lock(&n->security_lock); + list_add_tail(&s->sec_entry, &n->security_list); + mutex_unlock(&n->security_lock); + + return 0; +} + +/* + * Kill'em all! + */ +static int dst_start_node(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size) +{ + int err; + + if (!n) + return -ENODEV; + + if (n->trans_cache) + return 0; + + err = n->start(n); + if (err) + return err; + + printk(KERN_INFO "STARTED name: '%s', size: %llu.\n", n->name, n->size); + return 0; +} + +typedef int (*dst_command_func)(struct dst_node *n, struct dst_ctl *ctl, + void *data, unsigned int size); + +/* + * List of userspace commands. + */ +static dst_command_func dst_commands[] = { + [DST_ADD_REMOTE] = &dst_add_remote, + [DST_ADD_EXPORT] = &dst_add_export, + [DST_DEL_NODE] = &dst_del_node, + [DST_CRYPTO] = &dst_crypto_init, + [DST_SECURITY] = &dst_security_init, + [DST_START] = &dst_start_node, +}; + +/* + * Configuration parser. + */ +static void cn_dst_callback(void *data) +{ + struct dst_ctl *ctl; + struct cn_msg *msg = data; + int err; + struct dst_ctl_ack ack; + struct dst_node *n = NULL, *tmp; + unsigned int hash; + + if (msg->len < sizeof(struct dst_ctl)) { + err = -EBADMSG; + goto out; + } + + ctl = (struct dst_ctl *)msg->data; + + if (ctl->cmd >= DST_CMD_MAX) { + err = -EINVAL; + goto out; + } + hash = dst_hash(ctl->name, sizeof(ctl->name)); + + mutex_lock(&dst_hash_lock); + list_for_each_entry(tmp, &dst_hashtable[hash], node_entry) { + if (!memcmp(tmp->name, ctl->name, sizeof(tmp->name))) { + n = tmp; + dst_node_get(n); + break; + } + } + mutex_unlock(&dst_hash_lock); + + err = dst_commands[ctl->cmd](n, ctl, msg->data + sizeof(struct dst_ctl), + msg->len - sizeof(struct dst_ctl)); + + dst_node_put(n); +out: + memcpy(&ack.msg, msg, sizeof(struct cn_msg)); + + ack.msg.ack = msg->ack + 1; + ack.msg.len = sizeof(struct dst_ctl_ack) - sizeof(struct cn_msg); + + ack.error = err; + + cn_netlink_send(&ack.msg, 0, GFP_KERNEL); +} + +/* + * Global initialization: sysfs, hash table, block device registration, + * connector and various caches. 
+ */ +static int __init dst_sysfs_init(void) +{ + return bus_register(&dst_dev_bus_type); +} + +static void dst_sysfs_exit(void) +{ + bus_unregister(&dst_dev_bus_type); +} + +static int __init dst_hashtable_init(void) +{ + unsigned int i; + + dst_hashtable = kcalloc(dst_hashtable_size, sizeof(struct list_head), + GFP_KERNEL); + if (!dst_hashtable) + return -ENOMEM; + + for (i=0; i"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/connector.h b/include/linux/connector.h index fc65d219d88c..b9966e64604e 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -39,8 +39,10 @@ #define CN_IDX_V86D 0x4 #define CN_VAL_V86D_UVESAFB 0x1 #define CN_IDX_BB 0x5 /* BlackBoard, from the TSP GPL sampling framework */ +#define CN_DST_IDX 0x6 +#define CN_DST_VAL 0x1 -#define CN_NETLINK_USERS 6 +#define CN_NETLINK_USERS 7 /* * Maximum connector's message size. diff --git a/include/linux/dst.h b/include/linux/dst.h new file mode 100644 index 000000000000..e26fed84b1aa --- /dev/null +++ b/include/linux/dst.h @@ -0,0 +1,587 @@ +/* + * 2007+ Copyright (c) Evgeniy Polyakov + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __DST_H +#define __DST_H + +#include +#include + +#define DST_NAMELEN 32 +#define DST_NAME "dst" + +enum { + /* Remove node with given id from storage */ + DST_DEL_NODE = 0, + /* Add remote node with given id to the storage */ + DST_ADD_REMOTE, + /* Add local node with given id to the storage to be exported and used by remote peers */ + DST_ADD_EXPORT, + /* Crypto initialization command (hash/cipher used to protect the connection) */ + DST_CRYPTO, + /* Security attributes for given connection (permissions for example) */ + DST_SECURITY, + /* Register given node in the block layer subsystem */ + DST_START, + DST_CMD_MAX +}; + +struct dst_ctl +{ + /* Storage name */ + char name[DST_NAMELEN]; + /* Command flags */ + __u32 flags; + /* Command itself (see above) */ + __u32 cmd; + /* Maximum number of pages per single request in this device */ + __u32 max_pages; + /* Stale/error transaction scanning timeout in milliseconds */ + __u32 trans_scan_timeout; + /* Maximum number of retry sends before completing transaction as broken */ + __u32 trans_max_retries; + /* Storage size */ + __u64 size; +}; + +/* Reply command carries completion status */ +struct dst_ctl_ack +{ + struct cn_msg msg; + int error; + int unused[3]; +}; + +/* + * Unfortunaltely socket address structure is not exported to userspace + * and is redefined there. + */ +#define SADDR_MAX_DATA 128 + +struct saddr { + /* address family, AF_xxx */ + unsigned short sa_family; + /* 14 bytes of protocol address */ + char sa_data[SADDR_MAX_DATA]; + /* Number of bytes used in sa_data */ + unsigned short sa_data_len; +}; + +/* Address structure */ +struct dst_network_ctl +{ + /* Socket type: datagram, stream...*/ + unsigned int type; + /* Let me guess, is it a Jupiter diameter? 
*/ + unsigned int proto; + /* Peer's address */ + struct saddr addr; +}; + +struct dst_crypto_ctl +{ + /* Cipher and hash names */ + char cipher_algo[DST_NAMELEN]; + char hash_algo[DST_NAMELEN]; + + /* Key sizes. Can be zero for digest for example */ + unsigned int cipher_keysize, hash_keysize; + /* Alignment. Calculated by the DST itself. */ + unsigned int crypto_attached_size; + /* Number of threads to perform crypto operations */ + int thread_num; +}; + +/* Export security attributes have this bits checked in when client connects */ +#define DST_PERM_READ (1<<0) +#define DST_PERM_WRITE (1<<1) + +/* + * Right now it is simple model, where each remote address + * is assigned to set of permissions it is allowed to perform. + * In real world block device does not know anything but + * reading and writing, so it should be more than enough. + */ +struct dst_secure_user +{ + unsigned int permissions; + struct saddr addr; +}; + +/* + * Export control command: device to export and network address to accept + * clients to work with given device + */ +struct dst_export_ctl +{ + char device[DST_NAMELEN]; + struct dst_network_ctl ctl; +}; + +enum { + DST_CFG = 1, /* Request remote configuration */ + DST_IO, /* IO command */ + DST_IO_RESPONSE, /* IO response */ + DST_PING, /* Keepalive message */ + DST_NCMD_MAX, +}; + +struct dst_cmd +{ + /* Network command itself, see above */ + __u32 cmd; + /* + * Size of the attached data + * (in most cases, for READ command it means how many bytes were requested) + */ + __u32 size; + /* Crypto size: number of attached bytes with digest/hmac */ + __u32 csize; + /* Here we can carry secret data */ + __u32 reserved; + /* Read/write bits, see how they are encoded in bio structure */ + __u64 rw; + /* BIO flags */ + __u64 flags; + /* Unique command id (like transaction ID) */ + __u64 id; + /* Sector to start IO from */ + __u64 sector; + /* Hash data is placed after this header */ + __u8 hash[0]; +}; + +/* + * Convert command to/from network byte order. + * We do not use hton*() functions, since there is + * no 64-bit implementation. + */ +static inline void dst_convert_cmd(struct dst_cmd *c) +{ + c->cmd = __cpu_to_be32(c->cmd); + c->csize = __cpu_to_be32(c->csize); + c->size = __cpu_to_be32(c->size); + c->sector = __cpu_to_be64(c->sector); + c->id = __cpu_to_be64(c->id); + c->flags = __cpu_to_be64(c->flags); + c->rw = __cpu_to_be64(c->rw); +} + +/* Transaction id */ +typedef __u64 dst_gen_t; + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_DST_DEBUG +#define dprintk(f, a...) printk(KERN_NOTICE f, ##a) +#else +static inline void __attribute__ ((format (printf, 1, 2))) + dprintk(const char *fmt, ...) {} +#endif + +struct dst_node; + +struct dst_trans +{ + /* DST node we are working with */ + struct dst_node *n; + + /* Entry inside transaction tree */ + struct rb_node trans_entry; + + /* Merlin kills this transaction when this memory cell equals zero */ + atomic_t refcnt; + + /* How this transaction should be processed by crypto engine */ + short enc; + /* How many times this transaction was resent */ + short retries; + /* Completion status */ + int error; + + /* When did we send it to the remote peer */ + long send_time; + + /* My name is... 
+ * Well, computers does not speak, they have unique id instead */ + dst_gen_t gen; + + /* Block IO we are working with */ + struct bio *bio; + + /* Network command for above block IO request */ + struct dst_cmd cmd; +}; + +struct dst_crypto_engine +{ + /* What should we do with all block requests */ + struct crypto_hash *hash; + struct crypto_ablkcipher *cipher; + + /* Pool of pages used to encrypt data into before sending */ + int page_num; + struct page **pages; + + /* What to do with current request */ + int enc; + /* Who we are and where do we go */ + struct scatterlist *src, *dst; + + /* Maximum timeout waiting for encryption to be completed */ + long timeout; + /* IV is a 64-bit sequential counter */ + u64 iv; + + /* Secret data */ + void *private; + + /* Cached temporary data lives here */ + int size; + void *data; +}; + +struct dst_state +{ + /* The main state protection */ + struct mutex state_lock; + + /* Polling machinery for sockets */ + wait_queue_t wait; + wait_queue_head_t *whead; + /* Most of events are being waited here */ + wait_queue_head_t thread_wait; + + /* Who owns this? */ + struct dst_node *node; + + /* Network address for this state */ + struct dst_network_ctl ctl; + + /* Permissions to work with: read-only or rw connection */ + u32 permissions; + + /* Called when we need to clean private data */ + void (* cleanup)(struct dst_state *st); + + /* Used by the server: BIO completion queues BIOs here */ + struct list_head request_list; + spinlock_t request_lock; + + /* Guess what? No, it is not number of planets */ + atomic_t refcnt; + + /* This flags is set when connection should be dropped */ + int need_exit; + + /* + * Socket to work with. Second pointer is used for + * lockless check if socket was changed before performing + * next action (like working with cached polling result) + */ + struct socket *socket, *read_socket; + + /* Cached preallocated data */ + void *data; + unsigned int size; + + /* Currently processed command */ + struct dst_cmd cmd; +}; + +struct dst_info +{ + /* Device size */ + u64 size; + + /* Local device name for export devices */ + char local[DST_NAMELEN]; + + /* Network setup */ + struct dst_network_ctl net; + + /* Sysfs bits use this */ + struct device device; +}; + +struct dst_node +{ + struct list_head node_entry; + + /* Hi, my name is stored here */ + char name[DST_NAMELEN]; + /* My cache name is stored here */ + char cache_name[DST_NAMELEN]; + + /* Block device attached to given node. + * Only valid for exporting nodes */ + struct block_device *bdev; + /* Network state machine for given peer */ + struct dst_state *state; + + /* Block IO machinery */ + struct request_queue *queue; + struct gendisk *disk; + + /* Number of threads in processing pool */ + int thread_num; + /* Maximum number of pages in single IO */ + int max_pages; + + /* I'm that big in bytes */ + loff_t size; + + /* Exported to userspace node information */ + struct dst_info *info; + + /* + * Security attribute list. + * Used only by exporting node currently. + */ + struct list_head security_list; + struct mutex security_lock; + + /* + * When this unerflows below zero, university collapses. + * But this will not happen, since node will be freed, + * when reference counter reaches zero. + */ + atomic_t refcnt; + + /* How precisely should I be started? 
*/ + int (*start)(struct dst_node *); + + /* Crypto capabilities */ + struct dst_crypto_ctl crypto; + u8 *hash_key; + u8 *cipher_key; + + /* Pool of processing thread */ + struct thread_pool *pool; + + /* Transaction IDs live here */ + atomic_long_t gen; + + /* + * How frequently and how many times transaction + * tree should be scanned to drop stale objects. + */ + long trans_scan_timeout; + int trans_max_retries; + + /* Small gnomes live here */ + struct rb_root trans_root; + struct mutex trans_lock; + + /* + * Transaction cache/memory pool. + * It is big enough to contain not only transaction + * itself, but additional crypto data (digest/hmac). + */ + struct kmem_cache *trans_cache; + mempool_t *trans_pool; + + /* This entity scans transaction tree */ + struct delayed_work trans_work; + + wait_queue_head_t wait; +}; + +/* Kernel representation of the security attribute */ +struct dst_secure +{ + struct list_head sec_entry; + struct dst_secure_user sec; +}; + +int dst_process_bio(struct dst_node *n, struct bio *bio); + +int dst_node_init_connected(struct dst_node *n, struct dst_network_ctl *r); +int dst_node_init_listened(struct dst_node *n, struct dst_export_ctl *le); + +static inline struct dst_state *dst_state_get(struct dst_state *st) +{ + BUG_ON(atomic_read(&st->refcnt) == 0); + atomic_inc(&st->refcnt); + return st; +} + +void dst_state_put(struct dst_state *st); + +struct dst_state *dst_state_alloc(struct dst_node *n); +int dst_state_socket_create(struct dst_state *st); +void dst_state_socket_release(struct dst_state *st); + +void dst_state_exit_connected(struct dst_state *st); + +int dst_state_schedule_receiver(struct dst_state *st); + +void dst_dump_addr(struct socket *sk, struct sockaddr *sa, char *str); + +static inline void dst_state_lock(struct dst_state *st) +{ + mutex_lock(&st->state_lock); +} + +static inline void dst_state_unlock(struct dst_state *st) +{ + mutex_unlock(&st->state_lock); +} + +void dst_poll_exit(struct dst_state *st); +int dst_poll_init(struct dst_state *st); + +static inline unsigned int dst_state_poll(struct dst_state *st) +{ + unsigned int revents = POLLHUP | POLLERR; + + dst_state_lock(st); + if (st->socket) + revents = st->socket->ops->poll(NULL, st->socket, NULL); + dst_state_unlock(st); + + return revents; +} + +static inline int dst_thread_setup(void *private, void *data) +{ + return 0; +} + +void dst_node_put(struct dst_node *n); + +static inline struct dst_node *dst_node_get(struct dst_node *n) +{ + atomic_inc(&n->refcnt); + return n; +} + +int dst_data_recv(struct dst_state *st, void *data, unsigned int size); +int dst_recv_cdata(struct dst_state *st, void *cdata); +int dst_data_send_header(struct socket *sock, + void *data, unsigned int size, int more); + +int dst_send_bio(struct dst_state *st, struct dst_cmd *cmd, struct bio *bio); + +int dst_process_io(struct dst_state *st); +int dst_export_crypto(struct dst_node *n, struct bio *bio); +int dst_export_send_bio(struct bio *bio); +int dst_start_export(struct dst_node *n); + +int __init dst_export_init(void); +void dst_export_exit(void); + +/* Private structure for export block IO requests */ +struct dst_export_priv +{ + struct list_head request_entry; + struct dst_state *state; + struct bio *bio; + struct dst_cmd cmd; +}; + +static inline void dst_trans_get(struct dst_trans *t) +{ + atomic_inc(&t->refcnt); +} + +struct dst_trans *dst_trans_search(struct dst_node *node, dst_gen_t gen); +int dst_trans_remove(struct dst_trans *t); +int dst_trans_remove_nolock(struct dst_trans *t); +void 
dst_trans_put(struct dst_trans *t); + +/* + * Convert bio into network command. + */ +static inline void dst_bio_to_cmd(struct bio *bio, struct dst_cmd *cmd, + u32 command, u64 id) +{ + cmd->cmd = command; + cmd->flags = (bio->bi_flags << BIO_POOL_BITS) >> BIO_POOL_BITS; + cmd->rw = bio->bi_rw; + cmd->size = bio->bi_size; + cmd->csize = 0; + cmd->id = id; + cmd->sector = bio->bi_sector; +}; + +int dst_trans_send(struct dst_trans *t); +int dst_trans_crypto(struct dst_trans *t); + +int dst_node_crypto_init(struct dst_node *n, struct dst_crypto_ctl *ctl); +void dst_node_crypto_exit(struct dst_node *n); + +static inline int dst_need_crypto(struct dst_node *n) +{ + struct dst_crypto_ctl *c = &n->crypto; + /* + * Logical OR is appropriate here, but boolean one produces + * more optimal code, so it is used instead. + */ + return (c->hash_algo[0] | c->cipher_algo[0]); +} + +int dst_node_trans_init(struct dst_node *n, unsigned int size); +void dst_node_trans_exit(struct dst_node *n); + +/* + * Pool of threads. + * Ready list contains threads currently free to be used, + * active one contains threads with some work scheduled for them. + * Caller can wait in given queue when thread is ready. + */ +struct thread_pool +{ + int thread_num; + struct mutex thread_lock; + struct list_head ready_list, active_list; + + wait_queue_head_t wait; +}; + +void thread_pool_del_worker(struct thread_pool *p); +void thread_pool_del_worker_id(struct thread_pool *p, unsigned int id); +int thread_pool_add_worker(struct thread_pool *p, + char *name, + unsigned int id, + void *(* init)(void *data), + void (* cleanup)(void *data), + void *data); + +void thread_pool_destroy(struct thread_pool *p); +struct thread_pool *thread_pool_create(int num, char *name, + void *(* init)(void *data), + void (* cleanup)(void *data), + void *data); + +int thread_pool_schedule(struct thread_pool *p, + int (* setup)(void *stored_private, void *setup_data), + int (* action)(void *stored_private, void *setup_data), + void *setup_data, long timeout); +int thread_pool_schedule_private(struct thread_pool *p, + int (* setup)(void *private, void *data), + int (* action)(void *private, void *data), + void *data, long timeout, void *id); + +#endif /* __KERNEL__ */ +#endif /* __DST_H */ -- cgit v1.2.3-71-gd317 From 7237d3de78ff89ec2e18eae5fe962d063024fef5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 30 Mar 2009 13:55:30 -0800 Subject: x86, ACPI: add support for x2apic ACPI extensions All logical processors with APIC ID values of 255 and greater will have their APIC reported through Processor X2APIC structure (type-9 entry type) and all logical processors with APIC ID less than 255 will have their APIC reported through legacy Processor Local APIC (type-0 entry type) only. This is the same case even for NMI structure reporting. The Processor X2APIC Affinity structure provides the association between the X2APIC ID of a logical processor and the proximity domain to which the logical processor belongs. For OSPM, Procssor IDs outside the 0-254 range are to be declared as Device() objects in the ACPI namespace. 
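
As an illustration of the reporting rule described above, the following is a minimal sketch (not part of the patch; the helper name and enum wrapper are hypothetical) of how a MADT producer or parser would select the entry type purely from the APIC ID, using the type numbers quoted in the text (type 0 = Processor Local APIC, type 9 = Processor Local x2APIC):

#include <stdint.h>

/* Illustrative only: entry-type numbers taken from the description above. */
enum madt_entry_type {
	MADT_LOCAL_APIC   = 0,	/* legacy Processor Local APIC structure */
	MADT_LOCAL_X2APIC = 9,	/* Processor Local x2APIC structure      */
};

/*
 * Per the rule above, logical processors with APIC ID values of 255 and
 * greater are reported through the x2APIC structure; IDs below 255 are
 * reported through the legacy Local APIC structure only.  The same split
 * applies to the corresponding NMI structures.
 */
static enum madt_entry_type madt_entry_for(uint32_t apic_id)
{
	return (apic_id >= 255) ? MADT_LOCAL_X2APIC : MADT_LOCAL_APIC;
}
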
Signed-off-by: Suresh Siddha Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 63 ++++++++++++++++++++++++++++++++++++++++--- arch/x86/mm/srat_64.c | 30 +++++++++++++++++++++ drivers/acpi/numa.c | 46 ++++++++++++++++++++++++++++++- drivers/acpi/processor_core.c | 26 ++++++++++++++++++ drivers/acpi/tables.c | 30 +++++++++++++++++++++ include/linux/acpi.h | 1 + 6 files changed, 191 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7678f10c4568..565e70c7ca93 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -259,6 +259,35 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) generic_processor_info(id, ver); } +static int __init +acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) +{ + struct acpi_madt_local_x2apic *processor = NULL; + + processor = (struct acpi_madt_local_x2apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + +#ifdef CONFIG_X86_X2APIC + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. This allows us to size + * cpus_possible_map more accurately, to permit + * to not preallocating memory for all NR_CPUS + * when we use CPU hotplug. + */ + acpi_register_lapic(processor->local_apic_id, /* APIC ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); +#else + printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); +#endif + + return 0; +} + static int __init acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) { @@ -318,6 +347,25 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, return 0; } +static int __init +acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL; + + x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header; + + if (BAD_MADT_ENTRY(x2apic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + if (x2apic_nmi->lint != 1) + printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + + return 0; +} + static int __init acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) { @@ -823,6 +871,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) static int __init acpi_parse_madt_lapic_entries(void) { int count; + int x2count = 0; if (!cpu_has_apic) return -ENODEV; @@ -846,22 +895,28 @@ static int __init acpi_parse_madt_lapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, acpi_parse_sapic, MAX_APICS); - if (!count) + if (!count) { + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, + acpi_parse_x2apic, MAX_APICS); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_parse_lapic, MAX_APICS); - if (!count) { + } + if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); /* TBD: Cleanup to allow fallback to MPS */ return -ENODEV; - } else if (count < 0) { + } else if (count < 0 || x2count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } + x2count = + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, + acpi_parse_x2apic_nmi, 0); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); - if (count < 0) { + if (count < 0 || x2count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; diff 
--git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 09737c8af074..13d56f5b1349 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -115,6 +115,36 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) reserve_early(phys, phys + length, "ACPI SLIT"); } +/* Callback for Proximity Domain -> x2APIC mapping */ +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + apic_id = pa->apic_id; + apicid_to_node[apic_id] = node; + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + pxm, apic_id, node); +} + /* Callback for Proximity Domain -> LAPIC mapping */ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 3a0d8ef25c75..d440ccd27d91 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -131,6 +131,21 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) #endif /* ACPI_DEBUG_OUTPUT */ break; + case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: +#ifdef ACPI_DEBUG_OUTPUT + { + struct acpi_srat_x2apic_cpu_affinity *p = + (struct acpi_srat_x2apic_cpu_affinity *)header; + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "SRAT Processor (x2apicid[0x%08x]) in" + " proximity domain %d %s\n", + p->apic_id, + p->proximity_domain, + (p->flags & ACPI_SRAT_CPU_ENABLED) ? + "enabled" : "disabled")); + } +#endif /* ACPI_DEBUG_OUTPUT */ + break; default: printk(KERN_WARNING PREFIX "Found unsupported SRAT entry (type = 0x%x)\n", @@ -180,8 +195,35 @@ static int __init acpi_parse_slit(struct acpi_table_header *table) return 0; } +void __init __attribute__ ((weak)) +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + printk(KERN_WARNING PREFIX + "Found unsupported x2apic [0x%08x] SRAT entry\n", pa->apic_id); + return; +} + + +static int __init +acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_srat_x2apic_cpu_affinity *processor_affinity; + + processor_affinity = (struct acpi_srat_x2apic_cpu_affinity *)header; + if (!processor_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(header); + + /* let architecture-dependent part to do it */ + acpi_numa_x2apic_affinity_init(processor_affinity); + + return 0; +} + static int __init -acpi_parse_processor_affinity(struct acpi_subtable_header * header, +acpi_parse_processor_affinity(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_srat_cpu_affinity *processor_affinity; @@ -241,6 +283,8 @@ int __init acpi_numa_init(void) { /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { + acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, + acpi_parse_x2apic_affinity, NR_CPUS); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, acpi_parse_processor_affinity, NR_CPUS); acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 0cc2fd31e376..775324e34ffa 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -427,6 +427,29 @@ static int map_lapic_id(struct 
acpi_subtable_header *entry, return 0; } +static int map_x2apic_id(struct acpi_subtable_header *entry, + int device_declaration, u32 acpi_id, int *apic_id) +{ + struct acpi_madt_local_x2apic *apic = + (struct acpi_madt_local_x2apic *)entry; + u32 tmp = apic->local_apic_id; + + /* Only check enabled APICs*/ + if (!(apic->lapic_flags & ACPI_MADT_ENABLED)) + return 0; + + /* Device statement declaration type */ + if (device_declaration) { + if (apic->uid == acpi_id) + goto found; + } + + return 0; +found: + *apic_id = tmp; + return 1; +} + static int map_lsapic_id(struct acpi_subtable_header *entry, int device_declaration, u32 acpi_id, int *apic_id) { @@ -476,6 +499,9 @@ static int map_madt_entry(int type, u32 acpi_id) if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) { if (map_lapic_id(header, acpi_id, &apic_id)) break; + } else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) { + if (map_x2apic_id(header, type, acpi_id, &apic_id)) + break; } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) { if (map_lsapic_id(header, type, acpi_id, &apic_id)) break; diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index a8852952fac4..991c006a301b 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -62,6 +62,18 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) } break; + case ACPI_MADT_TYPE_LOCAL_X2APIC: + { + struct acpi_madt_local_x2apic *p = + (struct acpi_madt_local_x2apic *)header; + printk(KERN_INFO PREFIX + "X2APIC (apic_id[0x%02x] uid[0x%02x] %s)\n", + p->local_apic_id, p->uid, + (p->lapic_flags & ACPI_MADT_ENABLED) ? + "enabled" : "disabled"); + } + break; + case ACPI_MADT_TYPE_IO_APIC: { struct acpi_madt_io_apic *p = @@ -116,6 +128,24 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) } break; + case ACPI_MADT_TYPE_LOCAL_X2APIC_NMI: + { + u16 polarity, trigger; + struct acpi_madt_local_x2apic_nmi *p = + (struct acpi_madt_local_x2apic_nmi *)header; + + polarity = p->inti_flags & ACPI_MADT_POLARITY_MASK; + trigger = (p->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2; + + printk(KERN_INFO PREFIX + "X2APIC_NMI (uid[0x%02x] %s %s lint[0x%x])\n", + p->uid, + mps_inti_flags_polarity[polarity], + mps_inti_flags_trigger[trigger], + p->lint); + } + break; + case ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE: { struct acpi_madt_local_apic_override *p = diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6fce2fc2d124..a6989e517549 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -96,6 +96,7 @@ void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); /* the following four functions are architecture-dependent */ void acpi_numa_slit_init (struct acpi_table_slit *slit); void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); +void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); void acpi_numa_arch_fixup(void); -- cgit v1.2.3-71-gd317 From 601cc11d054ae4b5e9b5babec3d8e4667a2cb9b5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 3 Apr 2009 08:03:22 -0700 Subject: Make non-compat preadv/pwritev use native register size Instead of always splitting the file offset into 32-bit 'high' and 'low' parts, just split them into the largest natural word-size - which in C terms is 'unsigned long'. This allows 64-bit architectures to avoid the unnecessary 32-bit shifting and masking for native format (while the compat interfaces will obviously always have to do it). 
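As a user-space illustration of that splitting, the following mirrors the pos_from_hilo() helper this patch adds to fs/read_write.c; the demo typedef and main() are the editor's, only the shift logic comes from the patch. Doing two shifts by half a word, instead of one shift by BITS_PER_LONG, keeps the expression well defined in C even when the word size is 64 bits, and on a 64-bit build the high half simply contributes nothing.

	#include <stdio.h>

	typedef long long demo_loff_t;		/* stand-in for the kernel's loff_t */
	#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
	#define HALF_LONG_BITS	(BITS_PER_LONG / 2)

	static demo_loff_t pos_from_hilo(unsigned long high, unsigned long low)
	{
		return (((demo_loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
	}

	int main(void)
	{
		/* 64-bit build: the whole offset arrives in 'low', 'high' is 0.  */
		/* 32-bit build: the offset really is split across the two words. */
		printf("%lld\n", pos_from_hilo(0, 4096));
		return 0;
	}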
This also changes the order of 'high' and 'low' to be "low first". Why? Because when we have it like this, the 64-bit system calls now don't use the "pos_high" argument at all, and it makes more sense for the native system call to simply match the user-mode prototype.

This results in a much more natural calling convention, and allows the compiler to generate much more straightforward code. On x86-64, we now generate

	testq %rcx, %rcx # pos_l
	js .L122 #,
	movq %rcx, -48(%rbp) # pos_l, pos

from the C source

	loff_t pos = pos_from_hilo(pos_h, pos_l);
	...
	if (pos < 0)
		return -EINVAL;

and the 'pos_h' register isn't even touched. It used to generate code like

	mov %r8d, %r8d # pos_low, pos_low
	salq $32, %rcx #, tmp71
	movq %r8, %rax # pos_low, pos.386
	orq %rcx, %rax # tmp71, pos.386
	js .L122 #,
	movq %rax, -48(%rbp) # pos.386, pos

which isn't _that_ horrible, but it does show how the natural word size is just a more sensible interface (same arguments will hold in the user level glibc wrapper function, of course, so the kernel side is just half of the equation!)

Note: in all cases the user code wrapper can again be the same. You can just do

	#define HALF_BITS (sizeof(unsigned long)*4)
	__syscall(PWRITEV, fd, iov, count, offset, (offset >> HALF_BITS) >> HALF_BITS);

or something like that. That way the user mode wrapper will also be nicely passing in a zero (it won't actually have to do the shifts, the compiler will understand what is going on) for the last argument.

And that is a good idea, even if nobody will necessarily ever care: if we ever do move to a 128-bit lloff_t, this particular system call might be left alone. Of course, that will be the least of our worries if we really ever need to care, so this may not be worth really caring about.

[ Fixed for lost 'loff_t' cast noticed by Andrew Morton ]

Acked-by: Gerd Hoffmann Cc: H.
Peter Anvin Cc: Andrew Morton Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: Ingo Molnar Cc: Ralf Baechle > Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/compat.c | 4 ++-- fs/read_write.c | 14 ++++++++++---- include/linux/compat.h | 4 ++-- include/linux/syscalls.h | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 1c859dae758f..3f84d5f15889 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1236,7 +1236,7 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, asmlinkage ssize_t compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low) + unsigned long vlen, u32 pos_low, u32 pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; @@ -1293,7 +1293,7 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low) + unsigned long vlen, u32 pos_low, u32 pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; diff --git a/fs/read_write.c b/fs/read_write.c index 6d5d8ff238aa..9d1e76bb9ee1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -731,10 +731,16 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) +{ +#define HALF_LONG_BITS (BITS_PER_LONG / 2) + return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; +} + SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, u32, pos_high, u32, pos_low) + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; + loff_t pos = pos_from_hilo(pos_h, pos_l); struct file *file; ssize_t ret = -EBADF; int fput_needed; @@ -757,9 +763,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, u32, pos_high, u32, pos_low) + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; + loff_t pos = pos_from_hilo(pos_h, pos_l); struct file *file; ssize_t ret = -EBADF; int fput_needed; diff --git a/include/linux/compat.h b/include/linux/compat.h index 9723edd6455c..f2ded21f9a3c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -193,10 +193,10 @@ asmlinkage ssize_t compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen); asmlinkage ssize_t compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low); + unsigned long vlen, u32 pos_low, u32 pos_high); asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low); + unsigned long vlen, u32 pos_low, u32 pos_high); int compat_do_execve(char * filename, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs * regs); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b299a82a05e7..18771cac2f85 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -462,9 +462,9 @@ asmlinkage long sys_pread64(unsigned int fd, char __user *buf, asmlinkage long sys_pwrite64(unsigned int fd, const 
char __user *buf, size_t count, loff_t pos); asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low); + unsigned long vlen, unsigned long pos_l, unsigned long pos_h); asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low); + unsigned long vlen, unsigned long pos_l, unsigned long pos_h); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, int mode); asmlinkage long sys_chdir(const char __user *filename); -- cgit v1.2.3-71-gd317 From 3206450355100eae8e033645318b95bb60f1faff Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 6 Feb 2009 15:27:13 +0100 Subject: mfd: Support active high IRQs on WM835x Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/wm8350-core.c | 16 +++++++++++++++- include/linux/mfd/wm8350/core.h | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c index f22b18b70796..a285cc0cc704 100644 --- a/drivers/mfd/wm8350-core.c +++ b/drivers/mfd/wm8350-core.c @@ -1438,7 +1438,21 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq, mutex_init(&wm8350->irq_mutex); INIT_WORK(&wm8350->irq_work, wm8350_irq_worker); if (irq) { - ret = request_irq(irq, wm8350_irq, 0, + int flags = 0; + + if (pdata && pdata->irq_high) { + flags |= IRQF_TRIGGER_HIGH; + + wm8350_set_bits(wm8350, WM8350_SYSTEM_CONTROL_1, + WM8350_IRQ_POL); + } else { + flags |= IRQF_TRIGGER_LOW; + + wm8350_clear_bits(wm8350, WM8350_SYSTEM_CONTROL_1, + WM8350_IRQ_POL); + } + + ret = request_irq(irq, wm8350_irq, flags, "wm8350", wm8350); if (ret != 0) { dev_err(wm8350->dev, "Failed to request IRQ: %d\n", diff --git a/include/linux/mfd/wm8350/core.h b/include/linux/mfd/wm8350/core.h index 980669d50dca..42cca672f340 100644 --- a/include/linux/mfd/wm8350/core.h +++ b/include/linux/mfd/wm8350/core.h @@ -640,9 +640,11 @@ struct wm8350 { * * @init: Function called during driver initialisation. Should be * used by the platform to configure GPIO functions and similar. + * @irq_high: Set if WM8350 IRQ is active high. */ struct wm8350_platform_data { int (*init)(struct wm8350 *wm8350); + int irq_high; }; -- cgit v1.2.3-71-gd317 From a23a175795cdb202619ac176129b2f0c2a5c9456 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 17 Feb 2009 10:06:41 +0100 Subject: mfd: convert DS1WM to use MFD core This patch converts the DS1WM driver into an MFD cell. It also calculates the bus_shift parameter from the memory resource size. 
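For context, here is a hypothetical host-side counterpart (an editor's sketch, not taken from any in-tree driver): an MFD driver containing a DS1WM block would now describe it roughly like this, since the reworked ds1wm_probe() fetches the cell from platform_data and uses its enable/disable/driver_data fields. All names and the resource layout are invented; only struct ds1wm_driver_data and the mfd_cell fields read by the probe come from the patch.

	#include <linux/kernel.h>
	#include <linux/ioport.h>
	#include <linux/platform_device.h>
	#include <linux/mfd/core.h>
	#include <linux/mfd/ds1wm.h>

	static struct ds1wm_driver_data example_ds1wm_pdata = {
		.active_high = 0,
	};

	static int example_ds1wm_enable(struct platform_device *pdev)
	{
		/* host specific: power up the block, ungate its clock, ... */
		return 0;
	}

	static int example_ds1wm_disable(struct platform_device *pdev)
	{
		/* host specific: gate the clock, power the block down */
		return 0;
	}

	static struct resource example_ds1wm_resources[] = {
		{
			/* register window; its size is what the driver now
			 * uses to derive bus_shift */
			.start = 0x00,
			.end   = 0x1f,
			.flags = IORESOURCE_MEM,
		}, {
			.start = 1,	/* IRQ number, host specific */
			.end   = 1,
			.flags = IORESOURCE_IRQ,
		},
	};

	static struct mfd_cell example_ds1wm_cell = {
		.name		= "ds1wm",
		.enable		= example_ds1wm_enable,
		.disable	= example_ds1wm_disable,
		.driver_data	= &example_ds1wm_pdata,
		.num_resources	= ARRAY_SIZE(example_ds1wm_resources),
		.resources	= example_ds1wm_resources,
	};

A host would typically register such a cell via mfd_add_devices(), pointing the memory resource at wherever the DS1WM registers sit in its own address space.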
Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- drivers/w1/masters/ds1wm.c | 31 ++++++++++++++++++++----------- include/linux/ds1wm.h | 12 ------------ include/linux/mfd/ds1wm.h | 5 +++++ 3 files changed, 25 insertions(+), 23 deletions(-) delete mode 100644 include/linux/ds1wm.h create mode 100644 include/linux/mfd/ds1wm.h (limited to 'include/linux') diff --git a/drivers/w1/masters/ds1wm.c b/drivers/w1/masters/ds1wm.c index 29e144f81cbe..f1e6b3dd1e43 100644 --- a/drivers/w1/masters/ds1wm.c +++ b/drivers/w1/masters/ds1wm.c @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include #include @@ -89,7 +90,7 @@ struct ds1wm_data { void __iomem *map; int bus_shift; /* # of shifts to calc register offsets */ struct platform_device *pdev; - struct ds1wm_platform_data *pdata; + struct mfd_cell *cell; int irq; int active_high; struct clk *clk; @@ -217,8 +218,8 @@ static void ds1wm_up(struct ds1wm_data *ds1wm_data) { int gclk, divisor; - if (ds1wm_data->pdata->enable) - ds1wm_data->pdata->enable(ds1wm_data->pdev); + if (ds1wm_data->cell->enable) + ds1wm_data->cell->enable(ds1wm_data->pdev); gclk = clk_get_rate(ds1wm_data->clk); clk_enable(ds1wm_data->clk); @@ -244,8 +245,8 @@ static void ds1wm_down(struct ds1wm_data *ds1wm_data) ds1wm_write_register(ds1wm_data, DS1WM_INT_EN, ds1wm_data->active_high ? DS1WM_INTEN_IAS : 0); - if (ds1wm_data->pdata->disable) - ds1wm_data->pdata->disable(ds1wm_data->pdev); + if (ds1wm_data->cell->disable) + ds1wm_data->cell->disable(ds1wm_data->pdev); clk_disable(ds1wm_data->clk); } @@ -330,13 +331,18 @@ static struct w1_bus_master ds1wm_master = { static int ds1wm_probe(struct platform_device *pdev) { struct ds1wm_data *ds1wm_data; - struct ds1wm_platform_data *plat; + struct ds1wm_driver_data *plat; struct resource *res; + struct mfd_cell *cell; int ret; if (!pdev) return -ENODEV; + cell = pdev->dev.platform_data; + if (!cell) + return -ENODEV; + ds1wm_data = kzalloc(sizeof(*ds1wm_data), GFP_KERNEL); if (!ds1wm_data) return -ENOMEM; @@ -348,15 +354,18 @@ static int ds1wm_probe(struct platform_device *pdev) ret = -ENXIO; goto err0; } - ds1wm_data->map = ioremap(res->start, res->end - res->start + 1); + ds1wm_data->map = ioremap(res->start, resource_size(res)); if (!ds1wm_data->map) { ret = -ENOMEM; goto err0; } - plat = pdev->dev.platform_data; - ds1wm_data->bus_shift = plat->bus_shift; + plat = cell->driver_data; + + /* calculate bus shift from mem resource */ + ds1wm_data->bus_shift = resource_size(res) >> 3; + ds1wm_data->pdev = pdev; - ds1wm_data->pdata = plat; + ds1wm_data->cell = cell; res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); if (!res) { diff --git a/include/linux/ds1wm.h b/include/linux/ds1wm.h deleted file mode 100644 index d3c65e48a2e7..000000000000 --- a/include/linux/ds1wm.h +++ /dev/null @@ -1,12 +0,0 @@ -/* platform data for the DS1WM driver */ - -struct ds1wm_platform_data { - int bus_shift; /* number of shifts needed to calculate the - * offset between DS1WM registers; - * e.g. 
on h5xxx and h2200 this is 2 - * (registers aligned to 4-byte boundaries), - * while on hx4700 this is 1 */ - int active_high; - void (*enable)(struct platform_device *pdev); - void (*disable)(struct platform_device *pdev); -}; diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h new file mode 100644 index 000000000000..d4898ba18207 --- /dev/null +++ b/include/linux/mfd/ds1wm.h @@ -0,0 +1,5 @@ +/* MFD cell driver data for the DS1WM driver */ + +struct ds1wm_driver_data { + int active_high; +}; -- cgit v1.2.3-71-gd317 From b72019dbd126e60bb5f9f350f76127b1527facba Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 17 Feb 2009 10:06:52 +0100 Subject: mfd: remove unused PASIC3 bus_shift field Removes the now-unused bus_shift field from pasic3_platform_data. Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- include/linux/mfd/htc-pasic3.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/htc-pasic3.h b/include/linux/mfd/htc-pasic3.h index b4294f12c4f8..3d3ed67bd969 100644 --- a/include/linux/mfd/htc-pasic3.h +++ b/include/linux/mfd/htc-pasic3.h @@ -48,7 +48,6 @@ struct pasic3_leds_machinfo { struct pasic3_platform_data { struct pasic3_leds_machinfo *led_pdata; - unsigned int bus_shift; unsigned int clock_rate; }; -- cgit v1.2.3-71-gd317 From 7d33ccbeecd8393cc690cf9a71008236cdd7cc2c Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 17 Feb 2009 10:09:19 +0100 Subject: mfd: remove DS1WM clock handling This driver requests a clock that usually is supplied by the MFD in which the DS1WM is contained. Currently, it is impossible for a MFD to register their clocks with the generic clock API due to different implementations across architectures. For now, this patch removes the clock handling from DS1WM altogether, trusting that the MFD enable/disable functions will switch the clock if needed. The clock rate is obtained from a new parameter in driver_data. 
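Extending the hypothetical host-side sketch shown after the MFD-cell conversion above (again an editor's illustration with invented names and an arbitrary rate): after this change the host also states the input clock rate in driver_data, and its enable/disable hooks are where that clock actually gets switched, as the ds1wm_up() hunk below shows.

	#include <linux/mfd/ds1wm.h>

	static struct ds1wm_driver_data example_ds1wm_pdata = {
		.active_high = 0,
		.clock_rate  = 4000000,	/* Hz fed to the DS1WM block; host specific */
	};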
Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- drivers/w1/masters/ds1wm.c | 27 +++++++-------------------- include/linux/mfd/ds1wm.h | 1 + 2 files changed, 8 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/w1/masters/ds1wm.c b/drivers/w1/masters/ds1wm.c index f1e6b3dd1e43..37f08c850608 100644 --- a/drivers/w1/masters/ds1wm.c +++ b/drivers/w1/masters/ds1wm.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -93,7 +92,6 @@ struct ds1wm_data { struct mfd_cell *cell; int irq; int active_high; - struct clk *clk; int slave_present; void *reset_complete; void *read_complete; @@ -216,17 +214,17 @@ static int ds1wm_find_divisor(int gclk) static void ds1wm_up(struct ds1wm_data *ds1wm_data) { - int gclk, divisor; + int divisor; + struct ds1wm_driver_data *plat = ds1wm_data->cell->driver_data; if (ds1wm_data->cell->enable) ds1wm_data->cell->enable(ds1wm_data->pdev); - gclk = clk_get_rate(ds1wm_data->clk); - clk_enable(ds1wm_data->clk); - divisor = ds1wm_find_divisor(gclk); + divisor = ds1wm_find_divisor(plat->clock_rate); if (divisor == 0) { dev_err(&ds1wm_data->pdev->dev, - "no suitable divisor for %dHz clock\n", gclk); + "no suitable divisor for %dHz clock\n", + plat->clock_rate); return; } ds1wm_write_register(ds1wm_data, DS1WM_CLKDIV, divisor); @@ -247,8 +245,6 @@ static void ds1wm_down(struct ds1wm_data *ds1wm_data) if (ds1wm_data->cell->disable) ds1wm_data->cell->disable(ds1wm_data->pdev); - - clk_disable(ds1wm_data->clk); } /* --------------------------------------------------------------------- */ @@ -385,26 +381,18 @@ static int ds1wm_probe(struct platform_device *pdev) if (ret) goto err1; - ds1wm_data->clk = clk_get(&pdev->dev, "ds1wm"); - if (IS_ERR(ds1wm_data->clk)) { - ret = PTR_ERR(ds1wm_data->clk); - goto err2; - } - ds1wm_up(ds1wm_data); ds1wm_master.data = (void *)ds1wm_data; ret = w1_add_master_device(&ds1wm_master); if (ret) - goto err3; + goto err2; return 0; -err3: - ds1wm_down(ds1wm_data); - clk_put(ds1wm_data->clk); err2: + ds1wm_down(ds1wm_data); free_irq(ds1wm_data->irq, ds1wm_data); err1: iounmap(ds1wm_data->map); @@ -443,7 +431,6 @@ static int ds1wm_remove(struct platform_device *pdev) w1_remove_master_device(&ds1wm_master); ds1wm_down(ds1wm_data); - clk_put(ds1wm_data->clk); free_irq(ds1wm_data->irq, ds1wm_data); iounmap(ds1wm_data->map); kfree(ds1wm_data); diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h index d4898ba18207..be469a357cbb 100644 --- a/include/linux/mfd/ds1wm.h +++ b/include/linux/mfd/ds1wm.h @@ -2,4 +2,5 @@ struct ds1wm_driver_data { int active_high; + int clock_rate; }; -- cgit v1.2.3-71-gd317